o
    i7c                    @   sz  d Z ddlZddlZddlmZmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- e& rddl.m/Z/ ddl0m1Z1 e)2e3Z4G dd dej5Z6G dd dej5Z7G dd dej5Z8G dd dej5Z9G dd dej5Z:G dd dej5Z;G d d! d!ej5Z<G d"d# d#eZ=G d$d% d%ej5Z>e%G d&d' d'e!Z?G d(d) d)e?Z@e%G d*d+ d+e?ZAe%d,d-G d.d/ d/e?eZBe%G d0d1 d1e?ZCe%d2d-G d3d4 d4e?ZDe%G d5d6 d6e?ZEe%G d7d8 d8e?ZFg d9ZGdS ):zPyTorch UMT5 model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )
UMT5Config)	BlockMask)make_flex_block_causal_maskc                       s&   e Zd Zd fdd	Zdd Z  ZS )UMT5LayerNormư>c                    s&   t    tt|| _|| _dS )ze
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/umt5/modeling_umt5.pyr'   ?   s   

zUMT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)keepdim)tor)   float32powmeanrsqrtr,   r+   dtypefloat16bfloat16)r-   hidden_statesvariancer2   r2   r3   forwardG   s
   
zUMT5LayerNorm.forward)r%   )__name__
__module____qualname__r'   rA   __classcell__r2   r2   r0   r3   r$   >   s    r$   c                       *   e Zd Zdef fddZdd Z  ZS )UMT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r&   r'   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr-   rH   r0   r2   r3   r'   Y   s
   
zUMT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)rO   rU   rS   
isinstancerP   r+   r)   Tensorr<   int8r7   r-   r?   r2   r2   r3   rA   `   s   



zUMT5DenseActDense.forwardrB   rC   rD   r!   r'   rA   rE   r2   r2   r0   r3   rG   X   s    rG   c                       rF   )UMT5DenseGatedActDenserH   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rI   )r&   r'   r   rL   rM   rN   wi_0wi_1rP   rQ   rR   rS   r	   rT   rU   rV   r0   r2   r3   r'   p   s   
zUMT5DenseGatedActDense.__init__c                 C   sz   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rW   )rU   r^   r_   rS   rX   rP   r+   r)   rY   r<   rZ   r7   )r-   r?   hidden_geluhidden_linearr2   r2   r3   rA   x   s   


zUMT5DenseGatedActDense.forwardr\   r2   r2   r0   r3   r]   o   s    r]   c                       rF   )UMT5LayerFFrH   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr/   )r&   r'   is_gated_actr]   DenseReluDenserG   r$   rM   layer_norm_epsilon
layer_normr   rQ   rR   rS   rV   r0   r2   r3   r'      s   

zUMT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rW   )rg   re   rS   )r-   r?   forwarded_statesr2   r2   r3   rA      s   

zUMT5LayerFF.forwardr\   r2   r2   r0   r3   rb      s    
rb   c                       s   e Zd ZdZddee f fddZdejdejfd	d
Z	dd Z
dddZedddd					ddejdeej dee deej deej deej fddZ  ZS )UMT5Attentionz7
    T5's attention using relative_attention_bias.
    FN	layer_idxc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _t | _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrJ   )r&   r'   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerM   d_kvkey_value_proj_dim	num_headsn_headsrR   rS   	inner_dimrj   loggerwarning_oncer1   rB   r   rL   qkvo	Embeddingrelative_attention_biassetpruned_heads)r-   rH   rl   rj   r0   r2   r3   r'      s,   
zUMT5Attention.__init__
projectionreturnc                 C   s6   |  d d | j| jf }||dddd}|S )Nr5   r   r4   r    r   )sizerr   rp   viewpermute)r-   r~   new_projection_shapenew_projectionr2   r2   r3   _shape   s   zUMT5Attention._shapec           	      C   s   d}| j }| j}| js!|d }||dktj| 7 }t|}n
t|t| }|d }||k }t	|
 | t	||  }|||  }||tj }t|t||d }|t|||7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r4   r    )rm   rn   rk   r7   r)   longabsmin
zeros_likelogfloatmath	full_likewhere)	r-   relative_positionrelative_bucketsnum_bucketsmax_distance	max_exactis_small	log_ratiorelative_position_if_larger2   r2   r3   _relative_position_bucket   s$    z'UMT5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf }tj|tj|ddddf }|| }| |}|  |}	|	g dd}	|	S )z%Compute binned relative position biasN)r<   device)r4   r   r    r   )	r{   r+   r   r)   aranger   r   r   	unsqueeze)
r-   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluesr2   r2   r3   compute_bias   s   
 

zUMT5Attention.compute_biaspast_key_valuepast_key_values4.58new_nameversionr?   encoder_hidden_statesattention_masklayer_head_maskr   c                 C   s  |j d d \}}|d u}	| |}
|
|d| j| jdd}
d}|d ur<t|tr<|j	| j
}|	r8|j}n|j}n|}|	rB|n|}|	r[|d ur[|r[|j| j
 j}|j| j
 j}nJ| |}| |}||d| j| jdd}||d| j| jdd}|d ur|	s|nd }|||| j
d|i\}}|	rt|trd|j| j
< t|
|dd}|d ur||  n|}|j d }| jstjd| j||f|j|jd	}n| j|||j|d
}|d d d d | d d d f }|d ur
|d d d d d d d |j d f }|| }| jr(t|j d }d|t| j< |d d | f }n|}||7 }tj j!|" dd#|}tj j$|| j$| j%d}|d urO|| }t||}|dd& }|||d}| '|}||fS )Nr4   r5   r    Fr   Tr   )r   r<   )r   r   r   dim)ptraining)(shaperv   r   rr   rp   	transposerX   r   
is_updatedgetrj   cross_attention_cacheself_attention_cachelayerskeysr   rw   rx   updater)   matmulget_seq_lengthrl   zerosr   r<   r   r}   r*   listboolr   
functionalsoftmaxr   type_asrS   r   
contiguousry   )r-   r?   r   r   r   r   r   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesscoresreal_seq_lengthr   position_biascausal_maskmaskposition_bias_maskedattn_weightsattn_outputr2   r2   r3   rA     sn   





"
&

zUMT5Attention.forward)FN)NNNNNNN)rB   rC   rD   __doc__r   intr'   r)   rY   r   r   r   r   r
   rA   rE   r2   r2   r0   r3   ri      s2    
/ri   c                       sH   e Zd Zddee f fddZedddd				dd	d
Z  ZS )UMT5LayerSelfAttentionNrj   c                    >   t    t|d|d| _t|j|jd| _t	|j
| _d S )NTrl   rj   rc   )r&   r'   ri   SelfAttentionr$   rM   rf   rg   r   rQ   rR   rS   r-   rH   rj   r0   r2   r3   r'   g     
zUMT5LayerSelfAttention.__init__r   r   r   r   c           	      C   sF   |  |}| j|||||d}|| |d  }|f|dd   }|S )Nr   r   r   r   r   r    )rg   r   rS   )	r-   r?   r   r   r   r   normed_hidden_statesattention_outputoutputsr2   r2   r3   rA   m  s   
	zUMT5LayerSelfAttention.forwardrW   )NNNN	rB   rC   rD   r   r   r'   r   rA   rE   r2   r2   r0   r3   r   f  s    r   c                       sJ   e Zd Zddee f fddZedddd					dd	d
Z  ZS )UMT5LayerCrossAttentionNrj   c                    r   )NFr   rc   )r&   r'   ri   EncDecAttentionr$   rM   rf   rg   r   rQ   rR   rS   r   r0   r2   r3   r'     r   z UMT5LayerCrossAttention.__init__r   r   r   r   c                 C   sH   |  |}| j||||||d}|| |d  }	|	f|dd   }
|
S )Nr   r   r   r   r   r   r    )rg   r   rS   )r-   r?   r   r   r   r   r   r   r   layer_outputr   r2   r2   r3   rA     s   

zUMT5LayerCrossAttention.forwardrW   r   r   r2   r2   r0   r3   r     s    r   c                       sR   e Zd Zddee f fddZedddd											dd
dZ  ZS )	UMT5BlockNrj   c                    s^   t    |j| _t | _| jt||d | jr%| jt||d | jt	| d S )Nrj   )
r&   r'   rk   r   
ModuleListlayerappendr   r   rb   r   r0   r2   r3   r'     s   

zUMT5Block.__init__r   r   r   r   Fc                 C   sP  | j d |||||
d\}}|jtjkr2t|jj}tt| |d |}tj	|| |d}d }| j
o:|d u}|rp| j d ||||||
d\}}|jtjkrpt|jj}tt| |d |}tj	|| |d}| j d |}|jtjkrt|jj}tt| |d |}tj	|| |d}|f}|	r|||f7 }|S )Nr   r   i  )r   maxr    r   r5   )r   r<   r)   r=   finfor   r   isinfanyclamprk   )r-   r?   r   r   encoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacheoutput_attentionsr   self_attn_weights	max_dtypeclamp_valuecross_attn_weightsdo_cross_attentionr   r2   r2   r3   rA     sF   
	
	zUMT5Block.forwardrW   )	NNNNNNFFNr   r2   r2   r0   r3   r     s    
r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	UMT5ClassificationHeadz-Head for sentence-level classification tasks.rH   c                    sB   t    t|j|j| _tj|jd| _t|j|j	| _
d S )N)r   )r&   r'   r   rL   rM   denserQ   classifier_dropoutrS   
num_labelsout_projrV   r0   r2   r3   r'     s   
zUMT5ClassificationHead.__init__r?   r   c                 C   s6   |  |}| |}t|}|  |}| |}|S rW   )rS   r   r)   tanhr   r[   r2   r2   r3   rA     s   




zUMT5ClassificationHead.forward)
rB   rC   rD   r   r!   r'   r)   rY   rA   rE   r2   r2   r0   r3   r     s    r   c                   @   sJ   e Zd ZU eed< dZdZdZdgZdgZ	e
dd Zdd	 Zd
d ZdS )UMT5PreTrainedModelrH   transformerTr   rP   c                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r)   tensorr   r   )r-   r  
input_maskdummy_inputsr2   r2   r3   r  	  s   

z UMT5PreTrainedModel.dummy_inputsc                 C   s
  | j j}t|tr|jj|d  d	S t|ttt	t
fr^|jjjjd|d d t|dr>| j js>|jjjjd|d d t|dr\|jjjjd|| j jd  d |jjj  d	S d	S t|trt|dr}|jjjjd|d d |jjj  d	S d	S t|tr|jjjjd|| j jd  d t|jdr|jjd	ur|jjj  |jjjjd|| j jd  d t|jdr|jjd	ur|jjj  d	S d	S d	S t|tr$|jjjjd|| j jd  d t|jdr|jjd	ur|jjj  |jjjjd|| j jd  d t|jdr |jjd	ur"|jjj  d	S d	S d	S t|tr|jjjjd|| j jd  d t|jdrO|jjd	urO|jjj  |jjjjd|| j jd  d t|jdrt|jjd	urt|jjj  |jjjjd|| j jd  d t|jdr|jjd	ur|jjj  d	S d	S d	S t|t r| j j}| j j!}| j j"}|j#jjjd||| d  d |j$jjjd||d  d |j%jjjd||d  d |j&jjjd||| d  d |j'r|j(jjjd||d  d d	S d	S d	S )
zInitialize the weights      ?        )r:   stdlm_head
qa_outputs      
classifierrK   N))rH   initializer_factorrX   r$   r+   datafill_	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringsharednormal_hasattrtie_word_embeddingsr	  r
  rM   rK   zero_UMT5ForTokenClassificationr  r   r   r   rG   rO   rP   rN   r]   r^   r_   ri   ro   rq   rv   rw   rx   ry   rl   r{   )r-   modulefactorrM   rp   rr   r2   r2   r3   _init_weights  s|   

 


          
z!UMT5PreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u rtdt|r1t|jd d d |}tj||dd df gdd}n|	|j}|dd df 
 |ddd f< ||d< |d u rStd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information.r5   )r    .r   r    ).r   z1self.model.config.pad_token_id has to be defined.)rH   decoder_start_token_idpad_token_id
ValueErrorr   r)   fullr   cat	new_zerosclonemasked_fill_)r-   r  r  r  shifted_input_idsr2   r2   r3   _shift_rightV  s      z UMT5PreTrainedModel._shift_rightN)rB   rC   rD   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr  r  r'  r2   r2   r2   r3   r     s   
 

Br   c                       s   e Zd Zd fdd	Zdd Z													dddZ	dd	eejd
f dejdejde	de
f
ddZed	ejdededejdejdefddZ  ZS )	UMT5StackNc                    sl   t    || _ j| _t fddt jD | _t	 j
 jd| _t j| _d| _|   d S )Nc                    s   g | ]}t  |d qS )r   )r   ).0irH   r2   r3   
<listcomp>w  s    z&UMT5Stack.__init__.<locals>.<listcomp>rc   F)r&   r'   embed_tokensrk   r   r   range
num_layersblockr$   rM   rf   final_layer_normrQ   rR   rS   gradient_checkpointing	post_init)r-   rH   r4  r0   r2  r3   r'   s  s    zUMT5Stack.__init__c                 C   s
   || _ d S rW   )r4  r-   new_embeddingsr2   r2   r3   set_input_embeddings     
zUMT5Stack.set_input_embeddingsc           #      C   s  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d ur$|n| j j}|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|	rtd
 d}	|d u r| jd u rtd| |}|\}}|	du r| jstd|  d| jr|	r|d u r| j jrtt| j dt| j d}nt| j d}n| jsd }|d ur| nd}|d u rtj||| |jd}|d u rt s|| }tj|||jd}| jr| |||t|tr|jn||
}n&|d ur5|d d d d d d f }|j|jd}d| t|jj }nd }| jr^|d ur^| \}}}||f}|d u rXtj||jd}| |}nd }| || j j }| || j j }|rudnd }|
r|dnd }|
r| jrdnd }| !|}t"| j#D ]<\}}|| } || }!|r||f }|||||| |!||	|
|d
}"|"d }|
r||"d f7 }| jr||"d f7 }q| $|}| !|}|r||f }|st%dd |||||fD S t&|||||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer5   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderr2  r   r   )r<   r  r2   )r   r   r   r   r   r   r   r    r4   c                 s   s    | ]	}|d ur|V  qd S rW   r2   )r0  rx   r2   r2   r3   	<genexpr>  s    z$UMT5Stack.forward.<locals>.<genexpr>)last_hidden_stater   r?   
attentionscross_attentions)'rH   r   r   output_hidden_statesuse_return_dictrk   r   r   r   r9  r   rt   ru   r4  is_encoder_decoderr   r   r   r)   r   r   r   r*   _update_causal_maskrX   r   r7   r<   r   r   invert_attention_maskget_head_maskr6  rS   	enumerater7  r8  tupler   )#r-   r  r   r   r   rA  	head_maskcross_attn_head_maskr   r   r   rG  return_dictr   err_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr?   r1  layer_moduler   r   layer_outputsr2   r2   r3   rA     s   



	





zUMT5Stack.forwardFr   r"   input_tensorr   r   r   c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r  flex_attentionr   Fsdpa)rA  rT  is_trainingr    r5   )sequence_lengthtarget_lengthr<   r   r   )cudaxpunpu)rH   _attn_implementationr   rX   r)   rY   r#   r   is_compileabler   _ignore_causal_mask_sdpar   r<   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r   _unmask_unattended)r-   r   r`  r   r   r   past_seen_tokensusing_compilable_cacher<   re  rf  r   	min_dtyper2   r2   r3   rJ  .  sT   




zUMT5Stack._update_causal_maskre  rf  r<   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer<   r   r    )diagonalrB  r5   r   )r   r)   r   r   r!  r   triur   reshapeexpandr$  r   r7   masked_fill)r   re  rf  r<   r   r   kwargsr   rs  mask_lengthpadding_maskr2   r2   r3   rn  r  s,    $
6  z?UMT5Stack._prepare_4d_causal_attention_mask_with_cache_positionrW   )NNNNNNNNNNNNN)F)rB   rC   rD   r'   r=  rA   r   r)   rY   r
   r   rJ  staticmethodr   r<   rn  rE   r2   r2   r0   r3   r/  r  sX    
 3
Dr/  c                &       s>  e Zd ZU dZdZeed< ddgZ fddZdd	 Z	d
d Z
dd Zdd Zdd Ze																d&deej deej deej deej deej deej deej deeeej   dee deej deej dee dee d ee d!ee d"eej d#eeej ef f"d$d%Z  ZS )'r  ao  
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(inputs, return_tensors="pt")
    >>> labels = tokenizer(label=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```umt5rH   encoder.embed_tokens.weightdecoder.embed_tokens.weightc                    s   t  | t|j|j| _t|}d|_	d|_
d|_t|| j| _t|}d|_	d|_|j|_t|| j| _|   d S NFT)r&   r'   r   rz   
vocab_sizerM   r  copydeepcopyrk   r   tie_encoder_decoderr/  encodernum_decoder_layersr6  decoderr:  r-   rH   encoder_configdecoder_configr0   r2   r3   r'     s   

zUMT5Model.__init__c                 C      | j S rW   r  r-   r2   r2   r3   get_input_embeddings     zUMT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S rW   r  r  r=  r  r;  r2   r2   r3   r=       zUMT5Model.set_input_embeddingsc                 C   4   | j jr| | jj| j | | jj| j d S d S rW   rH   r  _tie_or_clone_weightsr  r4  r  r  r  r2   r2   r3   _tie_weights     zUMT5Model._tie_weightsc                 C   r  rW   r  r  r2   r2   r3   get_encoder  r  zUMT5Model.get_encoderc                 C   s*   |  D ]\}}| jj| j| qdS )
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   	attentionprune_headsr-   heads_to_pruner   headsr2   r2   r3   _prune_heads  s   zUMT5Model._prune_headsNr  r   r   r  rO  decoder_head_maskrP  encoder_outputsr   rA  decoder_inputs_embedsr   r   rG  rQ  r   r   c                 C   s   |dur|n| j j}|dur|n| j j}|du r%| j|||
||||d}n$|rIt|tsIt|d t|dkr:|d ndt|dkrE|d ndd}|d }| j||||	|||||||||d}|se|| S t|j	|j
|j|j|j|j	|j|jdS )	a+  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  r   rA  rO  r   rG  rQ  r   r    r4   rD  r?   rE  r  r   rA  r   r   r   rO  rP  r   r   rG  rQ  r   )rD  r   decoder_hidden_statesdecoder_attentionsrF  encoder_last_hidden_stater   encoder_attentions)rH   r   rH  r  rX   r   lenr  r   rD  r   r?   rE  rF  )r-   r  r   r   r  rO  r  rP  r  r   rA  r  r   r   rG  rQ  r   r?   decoder_outputsr2   r2   r3   rA     s\   Q	zUMT5Model.forwardNNNNNNNNNNNNNNNN)rB   rC   rD   r   
model_typer!   r(  _tied_weights_keysr'   r  r=  r  r  r  r   r   r)   
LongTensorFloatTensor
BoolTensorrY   rN  r
   r   r   r   rA   rE   r2   r2   r0   r3   r    s~   
 	
r  z<
    UMT5 Model with a `language modeling` head on top.
    )custom_introc                (       sH  e Zd ZdZdZg dZ fddZdd Zdd	 Zd
d Z	dd Z
e																	d%deej deej deej deej deej deej deej deeeej   dee deej deej deej dee dee dee dee deej d eeej ef f$d!d"Zdejfd#d$Z  ZS )&r  a  
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```r  )r  r  zlm_head.weightc                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _tj|j|jdd| _|   d S )NFTrJ   )r&   r'   rM   	model_dimr   rz   r  r  r  r  rk   r   r  r/  r  r  r6  r  rL   r	  r:  r  r0   r2   r3   r'     s   

z%UMT5ForConditionalGeneration.__init__c                 C   r  rW   r  r  r2   r2   r3   r    r  z1UMT5ForConditionalGeneration.get_input_embeddingsc                 C   r  rW   r  r;  r2   r2   r3   r=    r  z1UMT5ForConditionalGeneration.set_input_embeddingsc                 C   r  rW   r  r  r2   r2   r3   r    r  z)UMT5ForConditionalGeneration._tie_weightsc                 C   r  rW   r  r  r2   r2   r3   r    r  z(UMT5ForConditionalGeneration.get_encoderNr  r   r   r  rO  r  rP  r  r   rA  r  labelsr   r   rG  rQ  r   r   c                 C   s  |dur|n| j j}|dur|n| j j}|du r%| j|||
||||d}n$|rIt|tsIt|d t|dkr:|d ndt|dkrE|d ndd}|d }|dur^|du r^|du r^| |}| j||||	|||||||||d}|d }| j j	r|| j
d  }| |}d}|durtd	d
}||j}||d|d|d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )aK  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```Nr  r   r    r4   r  r  r  r  ignore_indexr5   	losslogitsr   r  r  rF  r  r   r  )rH   r   rH  r  rX   r   r  r'  r  r  r  r	  r   r7   r   r   r   r   r   r?   rE  rF  rD  )r-   r  r   r   r  rO  r  rP  r  r   rA  r  r  r   r   rG  rQ  r   r?   r  sequence_output	lm_logitsr  loss_fctoutputr2   r2   r3   rA     sv   U	


z$UMT5ForConditionalGeneration.forwardc                 C   s
   |  |S rW   )r'  )r-   r  r2   r2   r3   %prepare_decoder_input_ids_from_labelsa  r>  zBUMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNNNNN)rB   rC   rD   r   r  r  r'   r  r=  r  r  r   r   r)   r  r  r  rY   rN  r
   r   r   r   rA   r  rE   r2   r2   r0   r3   r  z  s    	
 #r  c                       s   e Zd ZdZdZdgZ fddZdd Zdd	 Zd
d Z	dd Z
dd Ze							ddeej deej deej deej dee dee dee deeej ef fddZ  ZS )r  a  
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```r  r  c                    sN   t  | t|j|j| _t|}d|_	d|_
t|| j| _|   d S NF)r&   r'   r   rz   r  rM   r  r  r  r   rI  r/  r  r:  )r-   rH   r  r0   r2   r3   r'   y  s   
zUMT5EncoderModel.__init__c                 C   r  rW   r  r  r2   r2   r3   r    r  z%UMT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S rW   )r  r  r=  r;  r2   r2   r3   r=    s   z%UMT5EncoderModel.set_input_embeddingsc                 C   s"   | j jr| | jj| j d S d S rW   )rH   r  r  r  r4  r  r  r2   r2   r3   r    s   zUMT5EncoderModel._tie_weightsc                 C   r  rW   r  r  r2   r2   r3   r    r  zUMT5EncoderModel.get_encoderc                 C   s0   |  D ]\}}| jj| jd j| qdS )r  r   N)r  r  r7  r   r   r  r  r2   r2   r3   r    s   zUMT5EncoderModel._prune_headsNr  r   rO  rA  r   rG  rQ  r   c           	   	   C   s0   |dur|n| j j}| j|||||||d}|S )aQ  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  )rH   rH  r  )	r-   r  r   rO  rA  r   rG  rQ  r  r2   r2   r3   rA     s   #
zUMT5EncoderModel.forward)NNNNNNN)rB   rC   rD   r   r  r  r'   r  r=  r  r  r  r   r   r)   r  r  r   r   rN  r   rA   rE   r2   r2   r0   r3   r  e  sF    	r  z
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                $       s   e Zd ZdgZddgZdef fddZe															ddee	j
 d	ee	j d
ee	j
 dee	j
 dee	j dee	j dee	j deee	j  dee	j dee	j dee	j
 dee dee dee dee deeef f ddZ  ZS )UMT5ForSequenceClassificationFdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightr  r  rH   c                    s2   t  | t|| _t|| _|   d| _d S r  )r&   r'   r  r   r   classification_headr:  model_parallelrV   r0   r2   r3   r'     s
   


z&UMT5ForSequenceClassification.__init__Nr  r   r   r  rO  r  rP  r  rA  r  r  r   r   rG  rQ  r   c                 C   sh  |dur|n| j j}|durd}|du r!|	dur!td| jj |du r6|
du r6|du r1td| |}| j|||||||||	|
||||d}|d }|| j j	
|j}tt|ddkrhtd|j\}}}||ddf |d	|ddd	ddf }| |}d}|dur|
|j}| j jdu r| j jdkrd
| j _n| j jdkr|jtjks|jtjkrd| j _nd| j _| j jd
krt }| j jdkr|| | }n-|||}n'| j jdkrt }||d	| j j|d	}n| j jdkrt }|||}|s |f|dd  }|dur|f| S |S t|||j|j|j|j |j!|j"|j#d	S )as
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   r   r  rO  r  rP  r  rA  r  r   r   rG  rQ  r   r    z7All examples must have the same number of <eos> tokens.r5   
regressionsingle_label_classificationmulti_label_classificationr  )$rH   rH  NotImplementedErrorr1   rB   r   r'  r   eqeos_token_idr7   r   r  r)   unique_consecutivesumr   r   r  problem_typer   r<   r   r   r   squeezer   r   r   r   r  r  rF  r  r   r  )r-   r  r   r   r  rO  r  rP  r  rA  r  r  r   r   rG  rQ  r   r  eos_maskr   rX  r.   sentence_representationr  r  r  r  r2   r2   r3   rA     s   >
,


$

z%UMT5ForSequenceClassification.forward)NNNNNNNNNNNNNNN)rB   rC   rD   "_keys_to_ignore_on_load_unexpectedr  r!   r'   r   r   r)   r  rY   r   r  r   r   rN  r   rA   rE   r2   r2   r0   r3   r    sj    
	

r  c                       s   e Zd ZdgZdgZdef fddZe								ddee	j
 dee	j
 d	ee	j
 d
ee	j
 dee	j
 dee dee dee deee	j
 ef fddZ  ZS )r  r  z'transformer.encoder.embed_tokens.weightrH   c                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S rW   )r&   r'   r   r  r   r   rQ   r   rS   rL   r.   r  r:  rV   r0   r2   r3   r'     s   
z#UMT5ForTokenClassification.__init__Nr  r   rO  rA  r  r   rG  rQ  r   c	              	   C   s   |dur|n| j j}| j|||||||d}	|	d }
| |
}
| |
}d}|dur:t }||d| j|d}|sO||	dd f}|durM|f| S |S t|||	j	|	j
dS )aB  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   rO  rA  r   rG  rQ  r   r5   r4   )r  r  r?   rE  )rH   rH  r   rS   r  r   r   r   r   r?   rE  )r-   r  r   rO  rA  r  r   rG  rQ  r   r?   r  r  r  r  r2   r2   r3   rA     s4   


z"UMT5ForTokenClassification.forward)NNNNNNNN)rB   rC   rD   r  r  r!   r'   r   r   r)   rY   r   r   rN  r   rA   rE   r2   r2   r0   r3   r  {  s@    	
r  c                &       s&  e Zd ZddgZ fddZdd Zdd Zd	d
 Zdd Ze																	d!de
ej de
ej de
ej de
ej de
ej de
ej de
ej de
eeej   de
ej de
ej de
ej de
ej de
e de
e de
e de
e deeej ef f"dd Z  ZS )"r  r  r  c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _|j| _t|j|j| _|   d S r  )r&   r'   rM   r  r   rz   r  r  r  r  rk   r   r  r/  r  r  r6  r  r   rL   r
  r:  r  r0   r2   r3   r'     s    

z!UMT5ForQuestionAnswering.__init__c                 C   r  rW   r  r  r2   r2   r3   r    r  z-UMT5ForQuestionAnswering.get_input_embeddingsc                 C   r  rW   r  r;  r2   r2   r3   r=    r  z-UMT5ForQuestionAnswering.set_input_embeddingsc                 C   r  rW   r  r  r2   r2   r3   r    r  z%UMT5ForQuestionAnswering._tie_weightsc                 C   r  rW   r  r  r2   r2   r3   r    r  z$UMT5ForQuestionAnswering.get_encoderNr  r   r   r  rO  r  rP  r  start_positionsend_positionsrA  r  r   r   rG  rQ  r   c                 C   sj  |dur|n| j j}|dur|n| j j}|	dur|
durd}|du r3|du r3|du r.td| |}|dur9|n| j j}|durC|n| j j}|du rX| j|||||||d}n$|r|t|ts|t|d t|dkrm|d ndt|dkrx|d ndd}|d }| j	|||d||||||||d	}|d }| 
|}|jdd
d\}}|d
 }|d
 }d}|	dur|
durt|	 dkr|	d
|j}	t|
 dkr|
d
|j}
|d}|	d|}	|
d|}
t|d}|||	}|||
}|| d }|s ||f|dd  | }|dur|f| S |S t||||j|j|j|j|j|j|jd
S )aI	  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NFr  r  r   r    r4   r  )r  r   rA  r   r   r   rO  rP  r   r   rG  rQ  r5   r   r  )
r  start_logits
end_logitsr   r  r  rF  r  r   r  )rH   rH  r   r   r'  r  rX   r   r  r  r
  splitr  r   r   r7   r   r   r   r   r   r?   rE  rF  rD  )r-   r  r   r   r  rO  r  rP  r  r  r  rA  r  r   r   rG  rQ  r?   r  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  r2   r2   r3   rA     s   <
	




z UMT5ForQuestionAnswering.forwardr  )rB   rC   rD   r  r'   r  r=  r  r  r   r   r)   r  r  r  rY   rN  r   r   r   rA   rE   r2   r2   r0   r3   r    sv    	
r  )r  r  r  r  r  r  r   )Hr   r  r   typingr   r   r)   r   torch.nnr   r   r   activationsr	   cache_utilsr
   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   utils.deprecationr   configuration_umt5r!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerrB   rt   Moduler$   rG   r]   rb   ri   r   r   r   r   r   r/  r  r  r  r  r  r  __all__r2   r2   r2   r3   <module>   sr   $	$	
 HKr  ; O gl $L N