o
    eiM%                    @   sH  d Z ddlZddlZddlZddlmZ ddlmZmZmZ ddl	m
Z ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* e'+e,Z-G dd dej.Z/G dd dej.Z0G dd dej.Z1G dd dej.Z2G dd dej.Z3G dd dej.Z4G dd dej.Z5G dd  d eZ6G d!d" d"ej.Z7e%G d#d$ d$e!Z8G d%d& d&e8Z9e%G d'd( d(e8Z:e%d)d*G d+d, d,e8eZ;e%G d-d. d.e8Z<e%d/d*G d0d1 d1e8Z=e%G d2d3 d3e8Z>e%G d4d5 d5e8Z?g d6Z@dS )7zPyTorch UMT5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torchdynamo_compilingloggingtorch_compilable_check   )
UMT5Configc                       s&   e Zd Zd fdd	Zdd Z  ZS )UMT5LayerNormư>c                    s&   t    tt|| _|| _dS )ze
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/umt5/modeling_umt5.pyr"   7   s   

zUMT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)keepdim)tor$   float32powmeanrsqrtr'   r&   dtypefloat16bfloat16)r(   hidden_statesvariancer-   r-   r.   forward?   s
   
zUMT5LayerNorm.forward)r    )__name__
__module____qualname__r"   r<   __classcell__r-   r-   r+   r.   r   6   s    r   c                       *   e Zd Zdef fddZdd Z  ZS )UMT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r!   r"   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr(   rC   r+   r-   r.   r"   Q   s
   
zUMT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)rJ   rP   rN   
isinstancerK   r&   r$   Tensorr7   int8r2   r(   r:   r-   r-   r.   r<   X   s   



zUMT5DenseActDense.forwardr=   r>   r?   r   r"   r<   r@   r-   r-   r+   r.   rB   P   s    rB   c                       rA   )UMT5DenseGatedActDenserC   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rD   )r!   r"   r   rG   rH   rI   wi_0wi_1rK   rL   rM   rN   r   rO   rP   rQ   r+   r-   r.   r"   h   s   
zUMT5DenseGatedActDense.__init__c                 C   sz   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rR   )rP   rY   rZ   rN   rS   rK   r&   r$   rT   r7   rU   r2   )r(   r:   hidden_geluhidden_linearr-   r-   r.   r<   p   s   


zUMT5DenseGatedActDense.forwardrW   r-   r-   r+   r.   rX   g   s    rX   c                       rA   )UMT5LayerFFrC   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr*   )r!   r"   is_gated_actrX   DenseReluDenserB   r   rH   layer_norm_epsilon
layer_normr   rL   rM   rN   rQ   r+   r-   r.   r"      s   

zUMT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rR   )rb   r`   rN   )r(   r:   forwarded_statesr-   r-   r.   r<      s   

zUMT5LayerFF.forwardrW   r-   r-   r+   r.   r]      s    
r]   c                       s   e Zd ZdZddedB f fddZdejdejfd	d
Zdd Z	dddZ
				ddejdejdB dedB dejdB dejdB f
ddZ  ZS )UMT5Attentionz7
    T5's attention using relative_attention_bias.
    FN	layer_idxc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrzt| j| j
| _d S d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrE   )r!   r"   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerH   d_kvkey_value_proj_dim	num_headsn_headsrM   rN   	inner_dimre   loggerwarning_oncer,   r=   r   rG   qkvo	Embeddingrelative_attention_bias)r(   rC   rg   re   r+   r-   r.   r"      s,   
zUMT5Attention.__init__
projectionreturnc                 C   s6   |  d d | j| jf }||dddd}|S )Nr0   r   r/   r   r   )sizerm   rk   viewpermute)r(   rw   new_projection_shapenew_projectionr-   r-   r.   _shape   s   zUMT5Attention._shapec           	      C   s   d}| j }| j}| js!|d }||dktj| 7 }t|}n
t|t| }|d }||k }t	|
 | t	||  }|||  }||tj }t|t||d }|t|||7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r/   r   )rh   ri   rf   r2   r$   longabsmin
zeros_likelogfloatmath	full_likewhere)	r(   relative_positionrelative_bucketsnum_bucketsmax_distance	max_exactis_small	log_ratiorelative_position_if_larger-   r-   r.   _relative_position_bucket   s$    z'UMT5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf }tj|tj|ddddf }|| }| |}|  |}	|	g dd}	|	S )z%Compute binned relative position biasN)r7   device)r/   r   r   r   )	rv   r&   r   r$   aranger   r   r{   	unsqueeze)
r(   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluesr-   r-   r.   compute_bias   s   
 

zUMT5Attention.compute_biasr:   encoder_hidden_statespast_key_valuesattention_maskr   c                 C   sd  |j d d \}}|d u}| |}	|	|d| j| jdd}	d}
|d ur<t|tr<|j	| j
}
|r8|j}n|j}n|}|rB|n|}|r[|d ur[|
r[|j| j
 j}|j| j
 j}nJ| |}| |}||d| j| jdd}||d| j| jdd}|d ur|s|nd }|||| j
d|i\}}|rt|trd|j| j
< t|	|dd}|d ur||  n|}|j d }| jstjd| j||f|j|jd	}n| j|||j|d
}|d d d d | d d d f }|d ur|| }|}||7 }tjj| dd|}tjj || j | j!d}t||}|dd" }|||d}| #|}||fS )Nr/   r0   r   Fr   Tr   )r   r7   )r   r   dim)ptraining)$shaperq   rz   rm   rk   	transposerS   r   
is_updatedgetre   cross_attention_cacheself_attention_cachelayerskeysr   rr   rs   updater$   matmulget_seq_lengthrg   zerosr   r7   r   r   
functionalsoftmaxr   type_asrN   r   
contiguousrt   )r(   r:   r   r   r   r   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresreal_seq_lengthr   position_biasposition_bias_maskedattn_weightsattn_outputr-   r-   r.   r<      s`   




"
zUMT5Attention.forward)FN)NNNNNN)r=   r>   r?   __doc__intr"   r$   rT   r~   r   r   r	   r<   r@   r-   r-   r+   r.   rd      s*    
/rd   c                       s8   e Zd ZddedB f fddZ			dddZ  ZS )	UMT5LayerSelfAttentionNre   c                    >   t    t|d|d| _t|j|jd| _t	|j
| _d S )NTrg   re   r^   )r!   r"   rd   SelfAttentionr   rH   ra   rb   r   rL   rM   rN   r(   rC   re   r+   r-   r.   r"   P     
zUMT5LayerSelfAttention.__init__c                 C   sD   |  |}| j||||d}|| |d  }|f|dd   }|S )Nr   r   r   r   r   )rb   r   rN   )r(   r:   r   r   r   normed_hidden_statesattention_outputoutputsr-   r-   r.   r<   V  s   
zUMT5LayerSelfAttention.forwardrR   )NNNr=   r>   r?   r   r"   r<   r@   r-   r-   r+   r.   r   O  s    	r   c                       s:   e Zd ZddedB f fddZ				dddZ  ZS )	UMT5LayerCrossAttentionNre   c                    r   )NFr   r^   )r!   r"   rd   EncDecAttentionr   rH   ra   rb   r   rL   rM   rN   r   r+   r-   r.   r"   j  r   z UMT5LayerCrossAttention.__init__c           
      C   sF   |  |}| j|||||d}|| |d  }|f|dd   }	|	S )Nr   r   r   r   r   r   )rb   r   rN   )
r(   r:   r   r   r   r   r   r   layer_outputr   r-   r-   r.   r<   p  s   
zUMT5LayerCrossAttention.forwardrR   r   r   r-   r-   r+   r.   r   i  s    	r   c                       s@   e Zd ZddedB f fddZ							d	ddZ  ZS )
	UMT5BlockNre   c                    s^   t    |j| _t | _| jt||d | jr%| jt||d | jt	| d S )Nre   )
r!   r"   rf   r   
ModuleListlayerappendr   r   r]   r   r+   r-   r.   r"     s   

zUMT5Block.__init__Fc	                 C   sL  | j d ||||d\}}	|jtjkr1t|jj}
tt| |
d |
}tj	|| |d}d }| j
o9|d u}|rn| j d |||||d\}}|jtjkrnt|jj}
tt| |
d |
}tj	|| |d}| j d |}|jtjkrt|jj}
tt| |
d |
}tj	|| |d}|f}|r||	|f7 }|S )Nr   r   i  )r   maxr   r   r0   )r   r7   r$   r8   finfor   r   isinfanyclamprf   )r(   r:   r   r   encoder_attention_maskr   	use_cacheoutput_attentionsr   self_attn_weights	max_dtypeclamp_valuecross_attn_weightsdo_cross_attentionr   r-   r-   r.   r<     sB   

zUMT5Block.forwardrR   )NNNNFFNr   r-   r-   r+   r.   r     s    r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	UMT5ClassificationHeadz-Head for sentence-level classification tasks.rC   c                    sB   t    t|j|j| _tj|jd| _t|j|j	| _
d S )N)r   )r!   r"   r   rG   rH   denserL   classifier_dropoutrN   
num_labelsout_projrQ   r+   r-   r.   r"     s   
zUMT5ClassificationHead.__init__r:   rx   c                 C   s6   |  |}| |}t|}|  |}| |}|S rR   )rN   r   r$   tanhr   rV   r-   r-   r.   r<     s   




zUMT5ClassificationHead.forward)
r=   r>   r?   r   r   r"   r$   rT   r<   r@   r-   r-   r+   r.   r     s    r   c                   @   sR   e Zd ZU eed< dZdZdZdgZdgZ	e
dd Ze dd	 Zd
d ZdS )UMT5PreTrainedModelrC   transformerTr   rK   c                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r$   tensorr   r   )r(   r   
input_maskdummy_inputsr-   r-   r.   r     s   

z UMT5PreTrainedModel.dummy_inputsc                 C   s
  | j j}t|trt|j|d  d	S t|ttt	t
fr^tj|jjd|d d t|dr>| j js>tj|jjd|d d t|dr\tj|jjd|| j jd  d t|jj d	S d	S t|trt|dr}tj|jjd|d d t|jj d	S d	S t|trtj|jjd|| j jd  d t|jdr|jjd	urt|jj tj|jjd|| j jd  d t|jdr|jjd	urt|jj d	S d	S d	S t|tr$tj|jjd|| j jd  d t|jdr|jjd	urt|jj tj|jjd|| j jd  d t|jdr |jjd	ur"t|jj d	S d	S d	S t|trtj|jjd|| j jd  d t|jdrO|jjd	urOt|jj tj|jjd|| j jd  d t|jdrt|jjd	urtt|jj tj|jjd|| j jd  d t|jdr|jjd	urt|jj d	S d	S d	S t|t r| j j}| j j!}| j j"}tj|j#jd||| d  d tj|j$jd||d  d tj|j%jd||d  d tj|j&jd||| d  d |j'rtj|j(jd||d  d d	S d	S d	S )
zInitialize the weights      ?g        )r5   stdlm_head
qa_outputs      
classifierrF   N))rC   initializer_factorrS   r   init	constant_r&   	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringnormal_sharedhasattrtie_word_embeddingsr   r   rH   zeros_rF   UMT5ForTokenClassificationr   r   r   r   rB   rJ   rK   rI   rX   rY   rZ   rd   rj   rl   rq   rr   rs   rt   rg   rv   )r(   modulefactorrH   rk   rm   r-   r-   r.   _init_weights  s|   

 


          
z!UMT5PreTrainedModel._init_weightsc                 C   sx   | j j}| j j}|d u rtd||j}|dd df  |ddd f< ||d< |d u r2td||dk| |S )Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information..r0   r   ).r   z1self.model.config.pad_token_id has to be defined.)rC   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r(   r   r  r  shifted_input_idsr-   r-   r.   _shift_right5  s    z UMT5PreTrainedModel._shift_rightN)r=   r>   r?   r   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr   r$   no_gradr	  r  r-   r-   r-   r.   r     s   
 


Br   c                       sD   e Zd Z fddZdd Z											dddZ  ZS )		UMT5Stackc                    sx   t    t j j| _ j| _t fddt	 j
D | _t j jd| _t j| _d| _|   d S )Nc                    s   g | ]}t  |d qS )r   )r   ).0irC   r-   r.   
<listcomp>P  s    z&UMT5Stack.__init__.<locals>.<listcomp>r^   F)r!   r"   r   ru   
vocab_sizerH   embed_tokensrf   r   range
num_layersblockr   ra   final_layer_normrL   rM   rN   gradient_checkpointing	post_initrQ   r+   r  r.   r"   L  s    zUMT5Stack.__init__c                 C   s
   || _ d S rR   )r!  r(   new_embeddingsr-   r-   r.   set_input_embeddingsX     
zUMT5Stack.set_input_embeddingsNc                  K   s  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|
d ur$|
n| j j}
|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|rtd
 d}|d u r| jd u rtd| |}|\}}|du r| jstd|  d| jr|r|d u r| j jrtt| j dt| j d}nt| j d}n| jsd }|d ur| nd}|d u rtj||| |jd}|d u rt s|| }tj|||jd}| jr	t| j ||||d}n&|d ur-|d d d d d d f }|j|jd}d| t|jj }nd }| jrV|d urV| \}}}||f}|d u rPtj||jd}| |}nd }|	r]dnd }|rddnd }|ro| jrodnd }| |}t| jD ]2\}}|	r||f }|||||||||d}|d }|r||d f7 }| jr||d f7 }q{|  |}| |}|	r||f }|
st!dd |||||fD S t"|||||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer0   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderr  r   )r   )rC   r.  r   r   r   )r7   r   r-   )r   r   r   r   r   r   r/   c                 s   s    | ]	}|d ur|V  qd S rR   r-   )r  rs   r-   r-   r.   	<genexpr>  s    z$UMT5Stack.forward.<locals>.<genexpr>)last_hidden_stater   r:   
attentionscross_attentions)#rC   r   r   output_hidden_statesuse_return_dictrf   r  ry   rz   r&  r   ro   rp   r!  is_encoder_decoderr   r
   r   r$   r   r   r   r%   r   r2   r7   r   r   invert_attention_maskrN   	enumerater$  r%  tupler   ) r(   r   r   r   r   r.  r   r   r   r3  return_dictr   kwargserr_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthcausal_maskencoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr:   r  layer_modulelayer_outputsr-   r-   r.   r<   [  s   








zUMT5Stack.forward)NNNNNNNNNNN)r=   r>   r?   r"   r*  r<   r@   r-   r-   r+   r.   r  K  s    r  c                       s   e Zd ZU dZdZeed< dddZ fddZdd	 Z	d
d Z
e													ddejdB dejdB dejdB dejdB deeej  dB dedB dejdB dejdB dedB dedB dedB dedB dejdB deej eB fddZ  ZS )r   ao  
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(inputs, return_tensors="pt")
    >>> labels = tokenizer(label=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```umt5rC   shared.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightc                    sl   t  | t|j|j| _t|}d|_	d|_
t|| _t|}d|_	|j|_t|| _|   d S NFT)r!   r"   r   ru   r   rH   r  copydeepcopyrf   r   r  encodernum_decoder_layersr#  decoderr'  r(   rC   encoder_configdecoder_configr+   r-   r.   r"     s   



zUMT5Model.__init__c                 C      | j S rR   r  r(   r-   r-   r.   get_input_embeddings(     zUMT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S rR   r  rR  r*  rT  r(  r-   r-   r.   r*  ,     zUMT5Model.set_input_embeddingsNr   r   r   r   encoder_outputsr   r.  decoder_inputs_embedsr   r   r3  r9  r   rx   c                 K   s   |	dur|	n| j j}	|dur|n| j j}|du r$| j||||
||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }| j|||||||	|
|||d}|sb|| S t|j	|j
|j|j|j|j	|j|jdS )	ah
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr   r   r.  r   r3  r9  r   r   r/   r0  r:   r1  r   r   r.  r   r   r   r   r   r3  r9  r   )r0  r   decoder_hidden_statesdecoder_attentionsr2  encoder_last_hidden_stater   encoder_attentions)rC   r   r4  rR  rS   r   lenrT  r   r0  r   r:   r1  r2  )r(   r   r   r   r   r`  r   r.  ra  r   r   r3  r9  r   r:  r:   decoder_outputsr-   r-   r.   r<   1  sV   CzUMT5Model.forwardNNNNNNNNNNNNN)r=   r>   r?   r   
model_typer   r  _tied_weights_keysr"   r[  r*  r   r$   
LongTensorFloatTensor
BoolTensorr8  r	   rT   boolr   r<   r@   r-   r-   r+   r.   r     sj   
 	
r   z<
    UMT5 Model with a `language modeling` head on top.
    )custom_introc                !       s  e Zd ZdZdZddddZ fddZdd Zd	d
 Ze															dde
jdB de
jdB de
jdB de
jdB deee
j  dB dedB de
jdB de
jdB de
jdB dedB dedB dedB dedB de
jdB dee
j eB fddZde
jfddZ  ZS ) r   a  
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```rJ  rK  )rM  rN  zlm_head.weightc                    s   t  | |j| _t|j|j| _t	|}d|_
d|_t|| _t	|}d|_
|j|_t|| _tj|j|jdd| _|   d S )NFTrE   )r!   r"   rH   	model_dimr   ru   r   r  rP  rQ  rf   r   r  rR  rS  r#  rT  rG   r   r'  rU  r+   r-   r.   r"     s   



z%UMT5ForConditionalGeneration.__init__c                 C   rX  rR   rY  rZ  r-   r-   r.   r[    r\  z1UMT5ForConditionalGeneration.get_input_embeddingsc                 C   r]  rR   r^  r(  r-   r-   r.   r*    r_  z1UMT5ForConditionalGeneration.set_input_embeddingsNr   r   r   r   r`  r   r.  ra  labelsr   r   r3  r9  r   rx   c                 K   s  |
dur|
n| j j}
|dur|n| j j}|du r$| j||||||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }|	dur]|du r]|du r]| |	}| j|||||||
||||d}|d }| j j	r||| j
d  }| |}d}|	durtd	d
}|	|j}	||d|d|	d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```Nrb  r   r   r/   rc  rd  r   r
  ignore_indexr0   	losslogitsr   re  rf  r2  rg  r   rh  )rC   r   r4  rR  rS   r   ri  r  rT  r  rs  r   r   r2   r   rz   ry   r   r   r:   r1  r2  r0  )r(   r   r   r   r   r`  r   r.  ra  rt  r   r   r3  r9  r   r:  r:   rj  sequence_output	lm_logitsrx  loss_fctoutputr-   r-   r.   r<     sp   G


z$UMT5ForConditionalGeneration.forwardc                 C   s
   |  |S rR   )r  )r(   rt  r-   r-   r.   %prepare_decoder_input_ids_from_labelsv  r+  zBUMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNN)r=   r>   r?   r   rl  rm  r"   r[  r*  r   r$   rn  ro  rp  r8  rT   r	   rq  r   r<   r~  r@   r-   r-   r+   r.   r     st    	
 r   c                       s   e Zd ZdZdZddiZ fddZdd Zd	d
 Ze							dde
jdB de
jdB de
jdB dedB dedB dedB dee
j eB fddZ  ZS )r   a  
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```rJ  rM  rK  c                    sJ   t  | t|j|j| _t|}d|_	d|_
t|| _|   d S )NF)r!   r"   r   ru   r   rH   r  rP  rQ  r   r5  r  rR  r'  )r(   rC   rV  r+   r-   r.   r"     s   

zUMT5EncoderModel.__init__c                 C   rX  rR   rY  rZ  r-   r-   r.   r[    r\  z%UMT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S rR   )r  rR  r*  r(  r-   r-   r.   r*    s   z%UMT5EncoderModel.set_input_embeddingsNr   r   r.  r   r3  r9  rx   c           	      K   s.   |dur|n| j j}| j||||||d}|S )aQ  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nrb  )rC   r4  rR  )	r(   r   r   r.  r   r3  r9  r:  r`  r-   r-   r.   r<     s   #	zUMT5EncoderModel.forward)NNNNNN)r=   r>   r?   r   rl  rm  r"   r[  r*  r   r$   rn  ro  rq  r8  r   r<   r@   r-   r-   r+   r.   r   z  s<    	r   z
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                       s   e Zd ZdgZdef fddZe												ddejdB dej	dB dejdB d	ejdB d
e
ej dB dejdB dejdB dejdB dedB dedB dedB dedB deeB fddZ  ZS )UMT5ForSequenceClassificationFdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightrC   c                    s,   t  | t|| _t|| _|   d S rR   )r!   r"   r   r   r   classification_headr'  rQ   r+   r-   r.   r"     s   

z&UMT5ForSequenceClassification.__init__Nr   r   r   r   r`  r.  ra  rt  r   r   r3  r9  rx   c                 K   s`  |dur|n| j j}|durd}	|du r!|dur!td| jj |du r6|du r6|du r1td| |}| j||||||||	|
||d}|d }|| j j	
|j}tt|d dkd |j\}}}||ddf |d	|ddd	ddf }| |}d}|dur|
|j}| j jdu r| j jdkrd
| j _n| j jdkr|jtjks|jtjkrd| j _nd| j _| j jd
krt }| j jdkr|| | }n-|||}n'| j jdkrt }||d	| j j|d	}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|j|j |j!|j"|j#|j$d	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)
r   r   r   r`  r.  ra  r   r   r3  r9  r   r   z7All examples must have the same number of <eos> tokens.r0   
regressionsingle_label_classificationmulti_label_classificationrw  )%rC   r4  NotImplementedErrorr,   r=   r  r  r   eqeos_token_idr2   r   r   r$   unique_consecutivesumnumelr   rz   r  problem_typer   r7   r   r   r   squeezer   r   r   r   re  rf  r2  rg  r   rh  )r(   r   r   r   r   r`  r.  ra  rt  r   r   r3  r9  r:  r   rz  eos_maskr   rB  r)   sentence_representationry  rx  r|  r}  r-   r-   r.   r<     s   0
,


$

z%UMT5ForSequenceClassification.forward)NNNNNNNNNNNN)r=   r>   r?   "_keys_to_ignore_on_load_unexpectedr   r"   r   r$   rn  rT   listro  rq  r8  r   r<   r@   r-   r-   r+   r.   r    sV    	
r  c                       s   e Zd ZdgZdef fddZe							ddejdB dejdB dejdB d	ejdB d
e	dB de	dB de	dB de
ej eB fddZ  ZS )r  r  rC   c                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S rR   )r!   r"   r   r   r   r   rL   r   rN   rG   r)   r   r'  rQ   r+   r-   r.   r"   r  s   
z#UMT5ForTokenClassification.__init__Nr   r   r.  rt  r   r3  r9  rx   c                 K   s   |dur|n| j j}| j||||||d}	|	d }
| |
}
| |
}d}|dur9t }||d| j|d}|sN||	dd f}|durL|f| S |S t|||	j	|	j
dS )aB  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   r.  r   r3  r9  r   r0   r/   )rx  ry  r:   r1  )rC   r4  r   rN   r   r   rz   r   r   r:   r1  )r(   r   r   r.  rt  r   r3  r9  r:  r   r:   ry  rx  r|  r}  r-   r-   r.   r<   }  s2   	

z"UMT5ForTokenClassification.forward)NNNNNNN)r=   r>   r?   r  r   r"   r   r$   rT   rq  r8  r   r<   r@   r-   r-   r+   r.   r  m  s8    
r  c                       s   e Zd ZdddZ fddZdd Zdd Ze																										dd
ej	d	B dej
d	B dej	d	B dejd	B deeej  d	B dej	d	B dej	d	B dej
d	B dej
d	B ded	B ded	B ded	B ded	B deej
 eB fddZ  ZS )r   rK  rL  c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_t|| _t	|}d|_
|j|_t|| _|j| _t|j|j| _|   d S rO  )r!   r"   rH   rs  r   ru   r   r  rP  rQ  rf   r   r  rR  rS  r#  rT  r   rG   r   r'  rU  r+   r-   r.   r"     s   



z!UMT5ForQuestionAnswering.__init__c                 C   rX  rR   rY  rZ  r-   r-   r.   r[    r\  z-UMT5ForQuestionAnswering.get_input_embeddingsc                 C   r]  rR   r^  r(  r-   r-   r.   r*    r_  z-UMT5ForQuestionAnswering.set_input_embeddingsNr   r   r   r   r`  start_positionsend_positionsr.  ra  r   r   r3  r9  rx   c                 K   sd  |dur|n| j j}|
dur|
n| j j}
|dur|durd}
|du r3|	du r3|du r.td| |}|
dur9|
n| j j}
|durC|n| j j}|du rW| j||||||d}n$|r{t|ts{t|d t|dkrl|d ndt|dkrw|d ndd}|d }| j	|||	d|||
|||d	
}|d }| 
|}|jdd
d\}}|d
 }|d
 }d}|dur|durt| dkr|d
|j}t| dkr|d
|j}|d}|d|}|d|}t|d}|||}|||}|| d }|s||f|dd  | }|dur|f| S |S t||||j|j|j|j|j|j|jd
S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        NFr  rb  r   r   r/   rc  )
r   r   r.  r   r   r   r   r   r3  r9  r0   r   ru  )
rx  start_logits
end_logitsr   re  rf  r2  rg  r   rh  )rC   r4  r   r  r  rR  rS   r   ri  rT  r   splitr  r   ry   r2   r   r   r   r   r   r:   r1  r2  r0  )r(   r   r   r   r   r`  r  r  r.  ra  r   r   r3  r9  r:  r:   rj  rz  ry  r  r  
total_lossignored_indexr|  
start_lossend_lossr}  r-   r-   r.   r<     s   .





z UMT5ForQuestionAnswering.forwardrk  )r=   r>   r?   rm  r"   r[  r*  r   r$   rn  ro  rp  r8  rT   rq  r   r<   r@   r-   r-   r+   r.   r     sd    	
r   )r   r   r   r  r  r   r   )Ar   rP  r   r$   r   torch.nnr   r   r   r-  r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   configuration_umt5r   
get_loggerr=   ro   Moduler   rB   rX   r]   rd   r   r   r   r   r   r  r   r   r   r  r  r   __all__r-   r-   r-   r.   <module>   sh   $	 
 9Fm 2 , N[ J 4