o
    ei-                    @   sH  d Z ddlZddlZddlZddlmZ ddlmZmZmZ ddl	m
Z ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* e'+e,Z-G dd dej.Z/G dd dej.Z0G dd dej.Z1G dd dej.Z2G dd dej.Z3G dd dej.Z4G dd dej.Z5G dd  d eZ6G d!d" d"ej.Z7e&G d#d$ d$e"Z8G d%d& d&e8Z9e&G d'd( d(e8Z:e&d)d*G d+d, d,e8eZ;e&G d-d. d.e8Z<e&d/d*G d0d1 d1e8Z=e&G d2d3 d3e8Z>e&G d4d5 d5e8Z?g d6Z@dS )7zPyTorch mT5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringloggingtorch_compilable_check   )	MT5Configc                       s&   e Zd Zd fdd	Zdd Z  ZS )MT5LayerNormư>c                    s&   t    tt|| _|| _dS )zd
        Construct a layernorm module in the MT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mt5/modeling_mt5.pyr"   0   s   

zMT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)keepdim)tor$   float32powmeanrsqrtr'   r&   dtypefloat16bfloat16)r(   hidden_statesvariancer-   r-   r.   forward8   s
   
zMT5LayerNorm.forward)r    )__name__
__module____qualname__r"   r<   __classcell__r-   r-   r+   r.   r   /   s    r   c                       *   e Zd Zdef fddZdd Z  ZS )MT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r!   r"   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr(   rC   r+   r-   r.   r"   J   s
   
zMT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)rJ   rP   rN   
isinstancerK   r&   r$   Tensorr7   int8r2   r(   r:   r-   r-   r.   r<   Q   s   



zMT5DenseActDense.forwardr=   r>   r?   r   r"   r<   r@   r-   r-   r+   r.   rB   I   s    rB   c                       rA   )MT5DenseGatedActDenserC   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rD   )r!   r"   r   rG   rH   rI   wi_0wi_1rK   rL   rM   rN   r   rO   rP   rQ   r+   r-   r.   r"   a   s   
zMT5DenseGatedActDense.__init__c                 C   sz   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rR   )rP   rY   rZ   rN   rS   rK   r&   r$   rT   r7   rU   r2   )r(   r:   hidden_geluhidden_linearr-   r-   r.   r<   i   s   


zMT5DenseGatedActDense.forwardrW   r-   r-   r+   r.   rX   `   s    rX   c                       rA   )
MT5LayerFFrC   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr*   )r!   r"   is_gated_actrX   DenseReluDenserB   r   rH   layer_norm_epsilon
layer_normr   rL   rM   rN   rQ   r+   r-   r.   r"      s   

zMT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rR   )rb   r`   rN   )r(   r:   forwarded_statesr-   r-   r.   r<      s   

zMT5LayerFF.forwardrW   r-   r-   r+   r.   r]   ~   s    
r]   c                       sb   e Zd Z		ddededB f fddZedd
dZdddZ								dddZ	  Z
S )MT5AttentionFNrC   	layer_idxc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrE   )r!   r"   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerH   d_kvkey_value_proj_dim	num_headsn_headsrM   rN   	inner_dimre   loggerwarning_oncer,   r=   r   rG   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr(   rC   rg   re   r+   r-   r.   r"      s,   

zMT5Attention.__init__T       c                 C   s   d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r/   r   )r2   r$   longabsmin
zeros_likelogfloatmath	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larger-   r-   r.   _relative_position_bucket   s*   z&MT5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|| j | j	| j
d}|  |}	|	g dd}	|	S )z%Compute binned relative position biasN)r7   device)r   r   r   )r/   r   r   r   )rv   r&   r   r$   aranger{   r2   r   rf   rh   ri   permute	unsqueeze)
r(   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluesr-   r-   r.   compute_bias   s    
 
zMT5Attention.compute_biasc
                 C   s  |j dd \}
}|du}| |}||
d| j| jdd}d}t|tr8|j	| j
}|r4|j}n|j}n|}|r>|n|}|rW|durW|rW|j| j
 j}|j| j
 j}nJ| |}| |}||
d| j| jdd}||
d| j| jdd}|dur|s|	nd}	|||| j
d|	i\}}|rt|trd|j| j
< t||dd}|du r|j d	 }|dur|n|	d d }| jstjd| j||f|j|jd
}| jr| jrd|_n| j|||j|	d}|dddd| dddf }|dur|ddddddd|j d	 f }|| }|}||7 }tjj|  dd!|}tjj"|| j"| jd}t||}|dd# }||
d| j$}| %|}||f}|rY||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr/   r0   r   Fr   Tr   )r   r7   )r   r   dim)ptraining)&shaperq   viewrm   rk   	transposerS   r   
is_updatedgetre   cross_attention_cacheself_attention_cachelayerskeysr   rr   rs   updater$   matmulrg   zerosr   r7   rw   r   requires_gradr   r   
functionalsoftmaxr   type_asrN   
contiguousrn   rt   )r(   r:   maskkey_value_statesposition_biaspast_key_valuesr   	use_cacheoutput_attentionsr   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresr   real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputsr-   r-   r.   r<      sp   






"
&

zMT5Attention.forwardFN)Try   rz   )NN)NNNNNFFN)r=   r>   r?   r   intr"   staticmethodr   r   r<   r@   r-   r-   r+   r.   rd      s(    "
/rd   c                       s>   e Zd ZddedB f fddZ						d	ddZ  ZS )
MT5LayerSelfAttentionFNre   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nrg   re   r^   )r!   r"   rd   SelfAttentionr   rH   ra   rb   r   rL   rM   rN   rx   r+   r-   r.   r"   _  s   
zMT5LayerSelfAttention.__init__c              	   C   sJ   |  |}| j|||||||d}	|| |	d  }|f|	dd   }
|
S )N)r   r   r   r   r   r   r   r   )rb   r   rN   )r(   r:   attention_maskr   r   r   r   r   normed_hidden_statesattention_outputr   r-   r-   r.   r<   g  s   

	zMT5LayerSelfAttention.forwardr   )NNNFFNr=   r>   r?   r   r"   r<   r@   r-   r-   r+   r.   r   ^  s    r   c                       s@   e Zd ZddedB f fddZ							d	ddZ  ZS )
MT5LayerCrossAttentionNre   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFr   r^   )r!   r"   rd   EncDecAttentionr   rH   ra   rb   r   rL   rM   rN   )r(   rC   re   r+   r-   r.   r"     s   
zMT5LayerCrossAttention.__init__Fc
                 C   sN   |  |}
| j|
||||||||	d	}|| |d  }|f|dd   }|S )N)r   r   r   r   r   r   r   r   r   r   )rb   r   rN   )r(   r:   r   r   r   r   r   r   r   r   r   r   layer_outputr   r-   r-   r.   r<     s   
zMT5LayerCrossAttention.forwardrR   )NNNFNFNr   r-   r-   r+   r.   r     s    
r   c                       sF   e Zd Zd	dedB f fddZ										d
ddZ  ZS )MT5BlockFNre   c                    s`   t    |j| _t | _| jt|||d | jr&| jt||d | jt	| d S )Nr   )re   )
r!   r"   rf   r   
ModuleListlayerappendr   r   r]   rx   r+   r-   r.   r"     s   

zMT5Block.__init__Tc              
   C   s  | j d ||||||	|d}|d }|dd  }|jtjkr?tt| t|jjd t|jj}tj	|| |d}| j
oE|d u}|r| j d ||||||d d ||	d}|d }|jtjkrtt| t|jjd t|jj}tj	|| |d}||dd   }| j d |}|jtjkrtt| t|jjd t|jj}tj	|| |d}|f}|| S )Nr   )r   r   r   r   r   r   r   i  )r}   maxr0   )r   r   r   r   r   r   r   )r   r7   r$   r8   r   isinfanyfinfor   clamprf   )r(   r:   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   r   r   return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   r-   r-   r.   r<     sd   	

zMT5Block.forwardr   )
NNNNNNFFTNr   r-   r-   r+   r.   r     s    r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	MT5ClassificationHeadz-Head for sentence-level classification tasks.rC   c                    sB   t    t|j|j| _tj|jd| _t|j|j	| _
d S )N)r   )r!   r"   r   rG   rH   denserL   classifier_dropoutrN   
num_labelsout_projrQ   r+   r-   r.   r"     s   
zMT5ClassificationHead.__init__r:   returnc                 C   s6   |  |}| |}t|}|  |}| |}|S rR   )rN   r   r$   tanhr   rV   r-   r-   r.   r<     s   




zMT5ClassificationHead.forward)
r=   r>   r?   __doc__r   r"   r$   rT   r<   r@   r-   r-   r+   r.   r     s    r   c                   @   sR   e Zd ZU eed< dZdZdZdgZdgZ	e
dd Ze dd	 Zd
d ZdS )MT5PreTrainedModelrC   transformerTr   rK   c                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r$   tensorr   r   )r(   r   
input_maskdummy_inputsr-   r-   r.   r   !  s   

zMT5PreTrainedModel.dummy_inputsc                 C   s
  | j j}t|trt|j|d  d	S t|ttt	t
fr^tj|jjd|d d t|dr>| j js>tj|jjd|d d t|dr\tj|jjd|| j jd  d t|jj d	S d	S t|trt|dr}tj|jjd|d d t|jj d	S d	S t|trtj|jjd|| j jd  d t|jdr|jjd	urt|jj tj|jjd|| j jd  d t|jdr|jjd	urt|jj d	S d	S d	S t|tr$tj|jjd|| j jd  d t|jdr|jjd	urt|jj tj|jjd|| j jd  d t|jdr |jjd	ur"t|jj d	S d	S d	S t|trtj|jjd|| j jd  d t|jdrO|jjd	urOt|jj tj|jjd|| j jd  d t|jdrt|jjd	urtt|jj tj|jjd|| j jd  d t|jdr|jjd	urt|jj d	S d	S d	S t|t r| j j}| j j!}| j j"}tj|j#jd||| d  d tj|j$jd||d  d tj|j%jd||d  d tj|j&jd||| d  d |j'rtj|j(jd||d  d d	S d	S d	S )
zInitialize the weightsg      ?g        )r5   stdlm_head
qa_outputsg      
classifierrF   N))rC   initializer_factorrS   r   init	constant_r&   MT5ModelMT5ForConditionalGenerationMT5EncoderModelMT5ForQuestionAnsweringnormal_sharedhasattrtie_word_embeddingsr   r   rH   zeros_rF   MT5ForTokenClassificationr   r   r   r   rB   rJ   rK   rI   rX   rY   rZ   rd   rj   rl   rq   rr   rs   rt   rg   rv   )r(   modulefactorrH   rk   rm   r-   r-   r.   _init_weights,  st   


 


          z MT5PreTrainedModel._init_weightsc                 C   sx   | j j}| j j}|d u rtd||j}|dd df  |ddd f< ||d< |d u r2td||dk| |S )Nzself.model.config.decoder_start_token_id has to be defined. In MT5 it is usually set to the pad_token_id. See MT5 docs for more information..r0   r   ).r   z1self.model.config.pad_token_id has to be defined.)rC   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r(   r   r  r	  shifted_input_idsr-   r-   r.   _shift_rightc  s    zMT5PreTrainedModel._shift_rightN)r=   r>   r?   r   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr   r$   no_gradr  r  r-   r-   r-   r.   r     s   
 


6r   c                       sD   e Zd Z fddZdd Z											dddZ  ZS )	MT5Stackc                    sx   t    t j j| _ j| _t fddt	 j
D | _t j jd| _t j| _|   d| _d S )Nc                    s"   g | ]}t  t|d k|dqS )r   r   )r   bool).0irC   r-   r.   
<listcomp>  s   " z%MT5Stack.__init__.<locals>.<listcomp>r^   F)r!   r"   r   ru   
vocab_sizerH   embed_tokensrf   r   range
num_layersblockr   ra   final_layer_normrL   rM   rN   	post_initrw   rQ   r+   r  r.   r"   {  s   
zMT5Stack.__init__c                 C   s
   || _ d S rR   )r  r(   new_embeddingsr-   r-   r.   set_input_embeddings     
zMT5Stack.set_input_embeddingsNc                 K   sh  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|
d ur$|
n| j j}
|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|rtd
 d}|d u r| jd u rtd| |}|\}}|du r| jstd|  d| jr|r|d u r| j jrtt| j dt| j d}nt| j d}n| jsd }|d ur| nd}|d u rtj||| |jd}| j jrt| j |||t|tr|jn|d}nt| j ||d}d }| jr|d urt| j |||d}|	rdnd }|r%dnd }|r0| jr0dnd }d }d }| |}| jD ]I}|	rH||f }|||||||||||
|d}|d }|d }| jrq|d urq||rndnd }|r||d f }| jr||d f }q>| |}| |}|	r||f }|
stdd |||||fD S t|||||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer0   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderr  r   )r   )rC   r+  r   r   r   )rC   r+  r   )rC   r+  r   r   r-   )r   r   r   r   r   r   r   r/      c                 s   s    | ]	}|d ur|V  qd S rR   r-   )r  rs   r-   r-   r.   	<genexpr>!  s    z#MT5Stack.forward.<locals>.<genexpr>)last_hidden_stater   r:   
attentionscross_attentions)rC   r   r   output_hidden_statesuse_return_dictrf   r
  sizer   rw   r   ro   rp   r  is_encoder_decoderr   r
   get_seq_lengthr$   r   r   r   rS   r   r   rN   r"  r#  tupler   )r(   r   r   r   r   r+  r   r   r   r1  r   r   kwargserr_msg_prefixinput_shaper   r   past_key_values_lengthencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr   r   r:   layer_modulelayer_outputsr-   r-   r.   r<     s   








zMT5Stack.forward)NNNNNNNNNNN)r=   r>   r?   r"   r'  r<   r@   r-   r-   r+   r.   r  z  s    r  c                       s  e Zd ZU dZdZeed< dgZdddZdef fddZ	d	d
 Z
dd Ze													ddejdB dejdB dejdB dejdB deeej  dB dedB dejdB dejdB dedB dedB dedB dedB dejdB deej eB fddZ  ZS )r   aw  
    Examples:

    ```python
    >>> from transformers import MT5Model, AutoTokenizer

    >>> model = MT5Model.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, return_tensors="pt")
    >>> labels = tokenizer(text_target=summary, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```mt5rC   Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightshared.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightc                    sl   t  | t|j|j| _t|}d|_	d|_
t|| _t|}d|_	|j|_t|| _|   d S NFT)r!   r"   r   ru   r  rH   r   copydeepcopyrf   r   r  encodernum_decoder_layersr!  decoderr$  r(   rC   encoder_configdecoder_configr+   r-   r.   r"   Q  s   



zMT5Model.__init__c                 C      | j S rR   r   r(   r-   r-   r.   get_input_embeddingsc     zMT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S rR   r   rJ  r'  rL  r%  r-   r-   r.   r'  g     zMT5Model.set_input_embeddingsNr   r   r   r   encoder_outputsr   r+  decoder_inputs_embedsr   r   r1  r   r   r   c                 K   s   |	dur|	n| j j}	|dur|n| j j}|du r$| j||||
||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }| j|||||||	|
|||d}|sb|| S t|j	|j
|j|j|j|j	|j|jdS )	a\
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./mt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            MT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [MT5
            Training](./mt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        >>> model = MT5Model.from_pretrained("google/mt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for MT5Model.
        >>> # This is not needed for torch's MT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr   r   r+  r   r1  r   r   r   r/   r.  r:   r/  r   r   r+  r   r   r   r   r   r1  r   r   )r.  r   decoder_hidden_statesdecoder_attentionsr0  encoder_last_hidden_stater   encoder_attentions)rC   r   r2  rJ  rS   r   lenrL  r   r.  r   r:   r/  r0  )r(   r   r   r   r   rX  r   r+  rY  r   r   r1  r   r   r7  r:   decoder_outputsr-   r-   r.   r<   l  sV   DzMT5Model.forwardNNNNNNNNNNNNN)r=   r>   r?   r   
model_typer   r  "_keys_to_ignore_on_load_unexpected_tied_weights_keysr"   rS  r'  r   r$   
LongTensorFloatTensor
BoolTensorr6  r	   rT   r  r   r<   r@   r-   r-   r+   r.   r   5  sl   
 	
r   z;
    MT5 Model with a `language modeling` head on top.
    )custom_introc                !       s*  e Zd ZU dZdZeed< dgZddddZdef fddZ	d	d
 Z
dd Ze														d!dejdB dejdB dejdB dejdB deeej  dB dedB dejdB dejdB dejdB dedB dedB dedB dedB dejdB deej eB fddZdejfdd Z  ZS )"r   a  
    Examples:

    ```python
    >>> from transformers import MT5ForConditionalGeneration, AutoTokenizer

    >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```rA  rC   rB  rC  )rE  rF  zlm_head.weightc                    s   t  | |j| _t|j|j| _t	|}d|_
d|_t|| _t	|}d|_
|j|_t|| _tj|j|jdd| _|   d S )NFTrE   )r!   r"   rH   	model_dimr   ru   r  r   rH  rI  rf   r   r  rJ  rK  r!  rL  rG   r   r$  rM  r+   r-   r.   r"     s   



z$MT5ForConditionalGeneration.__init__c                 C   rP  rR   rQ  rR  r-   r-   r.   rS    rT  z0MT5ForConditionalGeneration.get_input_embeddingsc                 C   rU  rR   rV  r%  r-   r-   r.   r'    rW  z0MT5ForConditionalGeneration.set_input_embeddingsNr   r   r   r   rX  r   r+  rY  labelsr   r   r1  r   r   r   c                 K   s  |
dur|
n| j j}
|dur|n| j j}|du r$| j||||||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }|	dur]|du r]|du r]| |	}| j|||||||
||||d}|d }| 	|}d}|	durt
dd	}|	|j}	||d
|d
|	d
}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./mt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            MT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [MT5
            Training](./mt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer(
        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        >>> # studies have shown that owning a dog is good for you.
        ```NrZ  r   r   r/   r[  r\  r  ignore_indexr0   	losslogitsr   r]  r^  r0  r_  r   r`  )rC   r   r2  rJ  rS   r   ra  r  rL  r   r   r2   r   r   r3  r   r   r:   r/  r0  r.  )r(   r   r   r   r   rX  r   r+  rY  rl  r   r   r1  r   r   r7  r:   rb  sequence_output	lm_logitsrp  loss_fctoutputr-   r-   r.   r<   !  sl   J


z#MT5ForConditionalGeneration.forwardc                 C   s
   |  |S rR   )r  )r(   rl  r-   r-   r.   %prepare_decoder_input_ids_from_labels  r(  zAMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNN)r=   r>   r?   r   rd  r   r  re  rf  r"   rS  r'  r   r$   rg  rh  ri  r6  rT   r	   r  r   r<   rv  r@   r-   r-   r+   r.   r     sx   
 	
 r   c                       s   e Zd ZU dZdZeed< ddiZdef fddZdd	 Z	d
d Z
e						ddejdB dejdB dejdB dedB dedB dedB deej eB fddZ  ZS )r   a  
    Examples:

    ```python
    >>> from transformers import MT5EncoderModel, AutoTokenizer

    >>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```rA  rC   rE  rC  c                    sD   t  | t|j|j| _|}d|_d|_t	|| _
|   d S )NF)r!   r"   r   ru   r  rH   r   r   r4  r  rJ  r$  )r(   rC   rN  r+   r-   r.   r"     s   
zMT5EncoderModel.__init__c                 C   rP  rR   rQ  rR  r-   r-   r.   rS    rT  z$MT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S rR   )r   rJ  r'  r%  r-   r-   r.   r'    s   z$MT5EncoderModel.set_input_embeddingsNr   r   r+  r   r1  r   r   c           	      K   s.   |dur|n| j j}| j||||||d}|S )aJ  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./mt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        >>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NrZ  )rC   r2  rJ  )	r(   r   r   r+  r   r1  r   r7  rX  r-   r-   r.   r<     s   #	zMT5EncoderModel.forward)NNNNNN)r=   r>   r?   r   rd  r   r  rf  r"   rS  r'  r   r$   rg  rh  r  r6  r   r<   r@   r-   r-   r+   r.   r     s>   
 	r   z
    MT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                       s   e Zd ZdgZdef fddZe												ddejdB dej	dB dejdB d	ejdB d
e
ej dB dejdB dejdB dejdB dedB dedB dedB dedB deeB fddZ  ZS )MT5ForSequenceClassificationrB  rC   c                    s,   t  | t|| _t|| _|   d S rR   )r!   r"   r   r   r   classification_headr$  rQ   r+   r-   r.   r"     s   

z%MT5ForSequenceClassification.__init__Nr   r   r   r   rX  r+  rY  rl  r   r   r1  r   r   c                 K   s`  |dur|n| j j}|durd}	|du r!|dur!td| jj |du r6|du r6|du r1td| |}| j||||||||	|
||d}|d }|| j j	
|j}tt|d dkd |j\}}}||ddf |d	|ddd	ddf }| |}d}|dur|
|j}| j jdu r| j jdkrd
| j _n| j jdkr|jtjks|jtjkrd| j _nd| j _| j jd
krt }| j jdkr|| | }n-|||}n'| j jdkrt }||d	| j j|d	}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|j|j |j!|j"|j#|j$d	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./mt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            MT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [MT5
            Training](./mt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)
r   r   r   rX  r+  rY  r   r   r1  r   r   r   z7All examples must have the same number of <eos> tokens.r0   
regressionsingle_label_classificationmulti_label_classificationro  )%rC   r2  NotImplementedErrorr,   r=   r
  r  r   eqeos_token_idr2   r   r   r$   unique_consecutivesumnumelr   r   rx  problem_typer   r7   r{   r   r   squeezer   r   r   r   r]  r^  r0  r_  r   r`  )r(   r   r   r   r   rX  r+  rY  rl  r   r   r1  r   r7  r   rr  eos_maskr   _r)   sentence_representationrq  rp  rt  ru  r-   r-   r.   r<   $  s   1
,


$

z$MT5ForSequenceClassification.forward)NNNNNNNNNNNN)r=   r>   r?   re  r   r"   r   r$   rg  rT   listrh  r  r6  r   r<   r@   r-   r-   r+   r.   rw    sV    	
rw  c                       s   e Zd Zdef fddZe							ddejdB dejdB dejdB dejdB d	edB d
edB dedB de	ej e
B fddZ  ZS )r  rC   c                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S rR   )r!   r"   r   r   r   r   rL   r   rN   rG   r)   r   r$  rQ   r+   r-   r.   r"     s   
z"MT5ForTokenClassification.__init__Nr   r   r+  rl  r   r1  r   r   c                 K   s   |dur|n| j j}| j||||||d}	|	d }
| |
}
| |
}d}|dur9t }||d| j|d}|sN||	dd f}|durL|f| S |S t|||	j	|	j
dS )a>  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./t5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   r+  r   r1  r   r   r0   r/   )rp  rq  r:   r/  )rC   r2  r   rN   r   r   r   r   r   r:   r/  )r(   r   r   r+  rl  r   r1  r   r7  r   r:   rq  rp  rt  ru  r-   r-   r.   r<     s2   	

z!MT5ForTokenClassification.forward)NNNNNNN)r=   r>   r?   r   r"   r   r$   rT   r  r6  r   r<   r@   r-   r-   r+   r.   r    s6    
r  c                       s   e Zd ZdgZdddZdef fddZdd Zd	d
 Ze														dde
jdB de
jdB de
jdB de
jdB deee
j  dB de
jdB de
jdB de
jdB de
jdB dedB dedB dedB dedB dee
j eB fddZ  ZS )r   rB  rC  rD  rC   c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_t|| _t	|}d|_
|j|_t|| _|j| _t|j|j| _|   d S rG  )r!   r"   rH   rk  r   ru   r  r   rH  rI  rf   r   r  rJ  rK  r!  rL  r   rG   r)   r   r$  rM  r+   r-   r.   r"     s   



z MT5ForQuestionAnswering.__init__c                 C   rP  rR   rQ  rR  r-   r-   r.   rS    rT  z,MT5ForQuestionAnswering.get_input_embeddingsc                 C   rU  rR   rV  r%  r-   r-   r.   r'    rW  z,MT5ForQuestionAnswering.set_input_embeddingsNr   r   r   r   rX  start_positionsend_positionsr+  rY  r   r   r1  r   r   c                 K   sd  |dur|n| j j}|
dur|
n| j j}
|dur|durd}
|du r3|	du r3|du r.td| |}|
dur9|
n| j j}
|durC|n| j j}|du rW| j||||||d}n$|r{t|ts{t|d t|dkrl|d ndt|dkrw|d ndd}|d }| j	|||	d|||
|||d	
}|d }| 
|}|jdd
d\}}|d
 }|d
 }d}|dur|durt| dkr|d
|j}t| dkr|d
|j}|d}|d|}|d|}t|d}|||}|||}|| d }|s||f|dd  | }|dur|f| S |S t||||j|j|j|j|j|j|jd
S )az  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        NFry  rZ  r   r   r/   r[  )
r   r   r+  r   r   r   r   r   r1  r   r0   r   rm  )
rp  start_logits
end_logitsr   r]  r^  r0  r_  r   r`  )rC   r2  r   r
  r  rJ  rS   r   ra  rL  r   splitr  r   r3  r2   r   r   r   r   r   r:   r/  r0  r.  )r(   r   r   r   r   rX  r  r  r+  rY  r   r   r1  r   r7  r:   rb  rr  rq  r  r  
total_lossignored_indexrt  
start_lossend_lossru  r-   r-   r.   r<     s   /





zMT5ForQuestionAnswering.forwardrc  )r=   r>   r?   re  rf  r   r"   rS  r'  r   r$   rg  rh  ri  r6  rT   r  r   r<   r@   r-   r-   r+   r.   r     sf    	
r   )r   r   r   rw  r  r   r   )Ar   rH  r   r$   r   torch.nnr   r   r   r*  r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_mt5r   
get_loggerr=   ro   Moduler   rB   rX   r]   rd   r   r   r   r   r   r  r   r   r   rw  r  r   __all__r-   r-   r-   r.   <module>   sh   $	
 N#%^b < / M\ H 7