o
    wii                    @   sn  d Z ddlZddlZddlmZmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ e& rddl,m-Z- ddl.m/Z/ e)0e1Z2G dd dej3Z4G dd dej3Z5G dd dej3Z6G dd dej3Z7G dd dej3Z8G dd dej3Z9G dd  d ej3Z:G d!d" d"eZ;G d#d$ d$ej3Z<e%G d%d& d&e!Z=G d'd( d(e=Z>e%G d)d* d*e=Z?e%d+d,G d-d. d.e=eZ@e%G d/d0 d0e=ZAe%d1d,G d2d3 d3e=ZBe%G d4d5 d5e=ZCe%G d6d7 d7e=ZDg d8ZEdS )9zPyTorch UMT5 model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )
UMT5Config)	BlockMask)make_flex_block_causal_maskc                       s&   e Zd Zd fdd	Zdd Z  ZS )UMT5LayerNormư>c                    s&   t    tt|| _|| _dS )ze
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/umt5/modeling_umt5.pyr&   >   s   

zUMT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)keepdim)tor(   float32powmeanrsqrtr+   r*   dtypefloat16bfloat16)r,   hidden_statesvariancer1   r1   r2   forwardF   s
   
zUMT5LayerNorm.forward)r$   )__name__
__module____qualname__r&   r@   __classcell__r1   r1   r/   r2   r#   =   s    r#   c                       *   e Zd Zdef fddZdd Z  ZS )UMT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r%   r&   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr,   rG   r/   r1   r2   r&   X   s
   
zUMT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)rN   rT   rR   
isinstancerO   r*   r(   Tensorr;   int8r6   r,   r>   r1   r1   r2   r@   _   s   



zUMT5DenseActDense.forwardrA   rB   rC   r    r&   r@   rD   r1   r1   r/   r2   rF   W   s    rF   c                       rE   )UMT5DenseGatedActDenserG   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rH   )r%   r&   r   rK   rL   rM   wi_0wi_1rO   rP   rQ   rR   r	   rS   rT   rU   r/   r1   r2   r&   o   s   
zUMT5DenseGatedActDense.__init__c                 C   sz   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rV   )rT   r]   r^   rR   rW   rO   r*   r(   rX   r;   rY   r6   )r,   r>   hidden_geluhidden_linearr1   r1   r2   r@   w   s   


zUMT5DenseGatedActDense.forwardr[   r1   r1   r/   r2   r\   n   s    r\   c                       rE   )UMT5LayerFFrG   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr.   )r%   r&   is_gated_actr\   DenseReluDenserF   r#   rL   layer_norm_epsilon
layer_normr   rP   rQ   rR   rU   r/   r1   r2   r&      s   

zUMT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rV   )rf   rd   rR   )r,   r>   forwarded_statesr1   r1   r2   r@      s   

zUMT5LayerFF.forwardr[   r1   r1   r/   r2   ra      s    
ra   c                       s   e Zd ZdZddee f fddZdejdejfd	d
Z	dd Z
dddZ					ddejdeej deeej  deej deej deej fddZ  ZS )UMT5Attentionz7
    T5's attention using relative_attention_bias.
    FN	layer_idxc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _t | _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrI   )r%   r&   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerL   d_kvkey_value_proj_dim	num_headsn_headsrQ   rR   	inner_dimri   loggerwarning_oncer0   rA   r   rK   qkvo	Embeddingrelative_attention_biassetpruned_heads)r,   rG   rk   ri   r/   r1   r2   r&      s,   
zUMT5Attention.__init__
projectionreturnc                 C   s6   |  d d | j| jf }||dddd}|S )Nr4   r   r3   r   r   )sizerq   ro   viewpermute)r,   r}   new_projection_shapenew_projectionr1   r1   r2   _shape   s   zUMT5Attention._shapec           	      C   s   d}| j }| j}| js!|d }||dktj| 7 }t|}n
t|t| }|d }||k }t	|
 | t	||  }|||  }||tj }t|t||d }|t|||7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r3   r   )rl   rm   rj   r6   r(   longabsmin
zeros_likelogfloatmath	full_likewhere)	r,   relative_positionrelative_bucketsnum_bucketsmax_distance	max_exactis_small	log_ratiorelative_position_if_larger1   r1   r2   _relative_position_bucket   s$    z'UMT5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf }tj|tj|ddddf }|| }| |}|  |}	|	g dd}	|	S )z%Compute binned relative position biasN)r;   device)r3   r   r   r   )	rz   r*   r   r(   aranger   r   r   	unsqueeze)
r,   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluesr1   r1   r2   compute_bias   s   
 

zUMT5Attention.compute_biasr>   encoder_hidden_statespast_key_valueattention_masklayer_head_maskr   c                 C   s  |j d d \}}|d u}	| |}
|
|d| j| jdd}
|d ur4|j| j}|	r1|j	}n|j
}|	r8|n|}|	rO|d urO|rO|j| j }|j| j }nE| |}| |}||d| j| jdd}||d| j| jdd}|d ur|	s}|nd }|||| jd|i\}}|	rd|j| j< t|
|dd}|d ur||  n|}|j d }| jstjd| j||f|j|jd}n| j|||j|d	}|d d d d | d d d f }|d ur|d d d d d d d |j d f }|| }| jrt|j d }d
|t| j< |d d | f }n|}||7 }tjj| dd |}tjj!|| j!| j"d}|d ur=|| }t||}|dd# }|||d}| $|}|||fS )Nr3   r4   r   r   Tr   )r   r;   )r   r   r   dim)ptraining)%shaperu   r   rq   ro   	transpose
is_updatedgetri   cross_attention_cacheself_attention_cache	key_cachevalue_cacherv   rw   updater(   matmulget_seq_lengthrk   zerosr   r;   r   r|   r)   listboolr   
functionalsoftmaxr   type_asrR   r   
contiguousrx   )r,   r>   r   r   r   r   r   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesscoresreal_seq_lengthr   position_biascausal_maskmaskposition_bias_maskedattn_weightsattn_outputr1   r1   r2   r@     sj   	




"&


zUMT5Attention.forward)FN)NNNNNNN)rA   rB   rC   __doc__r   intr&   r(   rX   r   r   r   tupler@   rD   r1   r1   r/   r2   rh      s0    
/rh   c                       s:   e Zd Zddee f fddZ				dddZ  ZS )	UMT5LayerSelfAttentionNri   c                    >   t    t|d|d| _t|j|jd| _t	|j
| _d S )NTrk   ri   rb   )r%   r&   rh   SelfAttentionr#   rL   re   rf   r   rP   rQ   rR   r,   rG   ri   r/   r1   r2   r&   a     
zUMT5LayerSelfAttention.__init__c           	      C   sF   |  |}| j|||||d}|| |d  }|f|dd   }|S )Nr   r   r   r   r   r   )rf   r   rR   )	r,   r>   r   r   r   r   normed_hidden_statesattention_outputoutputsr1   r1   r2   r@   g  s   
zUMT5LayerSelfAttention.forwardrV   )NNNNrA   rB   rC   r   r   r&   r@   rD   r1   r1   r/   r2   r   `  s    	r   c                       s<   e Zd Zddee f fddZ					dddZ  ZS )	UMT5LayerCrossAttentionNri   c                    r   )NFr   rb   )r%   r&   rh   EncDecAttentionr#   rL   re   rf   r   rP   rQ   rR   r   r/   r1   r2   r&   }  r   z UMT5LayerCrossAttention.__init__c                 C   sH   |  |}| j||||||d}|| |d  }	|	f|dd   }
|
S )Nr   r   r   r   r   r   r   )rf   r   rR   )r,   r>   r   r   r   r   r   r   r   layer_outputr   r1   r1   r2   r@     s   
	zUMT5LayerCrossAttention.forwardrV   r   r   r1   r1   r/   r2   r   |  s    	r   c                       sD   e Zd Zddee f fddZ									d	ddZ  ZS )
	UMT5BlockNri   c                    s^   t    |j| _t | _| jt||d | jr%| jt||d | jt	| d S )Nri   )
r%   r&   rj   r   
ModuleListlayerappendr   r   ra   r   r/   r1   r2   r&     s   

zUMT5Block.__init__Fc                 C   sV  | j d |||||
d\}}}|jtjkr3t|jj}tt| |d |}tj	|| |d}d }| j
o;|d u}|rr| j d ||||||
d\}}}|jtjkrrt|jj}tt| |d |}tj	|| |d}| j d |}|jtjkrt|jj}tt| |d |}tj	|| |d}||f}|	r|||f7 }|S )Nr   r   i  )r   maxr   r   r4   )r   r;   r(   r<   finfor   r   isinfanyclamprj   )r,   r>   r   r   encoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacheoutput_attentionsr   self_attn_weights	max_dtypeclamp_valuecross_attn_weightsdo_cross_attentionr   r1   r1   r2   r@     sJ   		zUMT5Block.forwardrV   )	NNNNNNFFNr   r1   r1   r/   r2   r     s    r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	UMT5ClassificationHeadz-Head for sentence-level classification tasks.rG   c                    sB   t    t|j|j| _tj|jd| _t|j|j	| _
d S )N)r   )r%   r&   r   rK   rL   denserP   classifier_dropoutrR   
num_labelsout_projrU   r/   r1   r2   r&     s   
zUMT5ClassificationHead.__init__r>   r~   c                 C   s6   |  |}| |}t|}|  |}| |}|S rV   )rR   r   r(   tanhr   rZ   r1   r1   r2   r@     s   




zUMT5ClassificationHead.forward)
rA   rB   rC   r   r    r&   r(   rX   r@   rD   r1   r1   r/   r2   r     s    r   c                   @   sH   e Zd ZeZdZdZdZdZdgZ	dgZ
edd Zdd Zd	d
 ZdS )UMT5PreTrainedModeltransformerTr   rO   c                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r(   tensorr   r   )r,   r   
input_maskdummy_inputsr1   r1   r2   r     s   

z UMT5PreTrainedModel.dummy_inputsc                 C   s
  | j j}t|tr|jj|d  d	S t|ttt	t
fr^|jjjjd|d d t|dr>| j js>|jjjjd|d d t|dr\|jjjjd|| j jd  d |jjj  d	S d	S t|trt|dr}|jjjjd|d d |jjj  d	S d	S t|tr|jjjjd|| j jd  d t|jdr|jjd	ur|jjj  |jjjjd|| j jd  d t|jdr|jjd	ur|jjj  d	S d	S d	S t|tr$|jjjjd|| j jd  d t|jdr|jjd	ur|jjj  |jjjjd|| j jd  d t|jdr |jjd	ur"|jjj  d	S d	S d	S t|tr|jjjjd|| j jd  d t|jdrO|jjd	urO|jjj  |jjjjd|| j jd  d t|jdrt|jjd	urt|jjj  |jjjjd|| j jd  d t|jdr|jjd	ur|jjj  d	S d	S d	S t|t r| j j}| j j!}| j j"}|j#jjjd||| d  d |j$jjjd||d  d |j%jjjd||d  d |j&jjjd||| d  d |j'r|j(jjjd||d  d d	S d	S d	S )
zInitialize the weights      ?        )r9   stdlm_head
qa_outputs      
classifierrJ   N))rG   initializer_factorrW   r#   r*   datafill_	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringsharednormal_hasattrtie_word_embeddingsr  r  rL   rJ   zero_UMT5ForTokenClassificationr  r   r   r   rF   rN   rO   rM   r\   r]   r^   rh   rn   rp   ru   rv   rw   rx   rk   rz   )r,   modulefactorrL   ro   rq   r1   r1   r2   _init_weights  s|   

 


          
z!UMT5PreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u rtdt|r1t|jd d d |}tj||dd df gdd}n|	|j}|dd df 
 |ddd f< ||d< |d u rStd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information.r4   )r   .r   r   ).r   z1self.model.config.pad_token_id has to be defined.)rG   decoder_start_token_idpad_token_id
ValueErrorr   r(   fullr   cat	new_zerosclonemasked_fill_)r,   r   r  r  shifted_input_idsr1   r1   r2   _shift_rightP  s      z UMT5PreTrainedModel._shift_rightN)rA   rB   rC   r    config_classbase_model_prefixsupports_gradient_checkpointing_supports_cache_class_supports_static_cache_no_split_modules_keep_in_fp32_modulespropertyr   r  r"  r1   r1   r1   r2   r     s    

Br   c                       s   e Zd Zd fdd	Zdd Zdd Z													ddd	Z	
ddeej	df dej	dej	de
def
ddZedej	dededejdej	defddZ  ZS )	UMT5StackNc                    sl   t    || _ j| _t fddt jD | _t	 j
 jd| _t j| _d| _|   d S )Nc                    s   g | ]}t  |d qS )r   )r   ).0irG   r1   r2   
<listcomp>q  s    z&UMT5Stack.__init__.<locals>.<listcomp>rb   F)r%   r&   embed_tokensrj   r   r   range
num_layersblockr#   rL   re   final_layer_normrP   rQ   rR   gradient_checkpointing	post_init)r,   rG   r0  r/   r.  r2   r&   m  s    zUMT5Stack.__init__c                 C      | j S rV   r0  r,   r1   r1   r2   get_input_embeddingsy     zUMT5Stack.get_input_embeddingsc                 C   
   || _ d S rV   r8  r,   new_embeddingsr1   r1   r2   set_input_embeddings|     
zUMT5Stack.set_input_embeddingsc           '      C   sj  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d ur$|n| j j}|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|	rtd
 d}	|d u r| jd u rtd| |}|\}}|	du r| jstd|  dd}d}| jr|	s|d urt|trt|tsd}t|t }n#t|tsd}td t|}n|d u rtt t }n| jsd }|d ur| nd}|d u rtj||| |jd}|d u rt s|| }tj|||jd}| jr/| ||||d ur*|jnd |
}n&|d urS|d d d d d d f }|j|jd}d| t|jj }nd }| jr||d ur|| \}}}||f}|d u rvtj||jd}| |}nd }|  || j j!}|  || j j!}|rdnd }|
rdnd }|
r| jrdnd }| "|}t#| j$D ]C\} }!||  }"||  }#|r||f }|!|||||"|#||	|
|d
}$|$d }|	r|$d }%|
r||$d f7 }| jr||$d f7 }q| %|}| "|}|r||f }|	r|%nd }&|r|j}&|r|& }&|s,t'dd ||&|||fD S t(||&|||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer4   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r   )r;   r  r1   )r   r   r   r   r   r   r   r   r3   r   c                 s   s    | ]	}|d ur|V  qd S rV   r1   )r,  rw   r1   r1   r2   	<genexpr>&  s    z$UMT5Stack.forward.<locals>.<genexpr>)last_hidden_statepast_key_valuesr>   
attentionscross_attentions))rG   r   r   output_hidden_statesuse_return_dictrj   r  r   r   r5  r   rs   rt   r0  rW   r
   r   r   from_legacy_cacher   r(   r   r   r   r)   _update_causal_maskr   r6   r;   r   r   invert_attention_maskget_head_maskr2  rR   	enumerater3  r4  to_legacy_cacher   r   )'r,   r   r   r   r   rC  	head_maskcross_attn_head_maskrG  r   r   rJ  return_dictr   err_msg_prefixinput_shaper   r   return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr>   r-  layer_moduler   r   layer_outputsnext_decoder_cache
next_cacher1   r1   r2   r@     s  










zUMT5Stack.forwardFr   r!   input_tensorr   rG  r   c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r  flex_attentionr   Fsdpa)rC  rY  is_trainingr   r4   )sequence_lengthtarget_lengthr;   r   r   )cudaxpunpu)rG   _attn_implementationr   rW   r(   rX   r"   r   is_compileabler   _ignore_causal_mask_sdpar   r;   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r   _unmask_unattended)r,   r   rg  r   rG  r   past_seen_tokensusing_compilable_cacher;   rl  rm  r   	min_dtyper1   r1   r2   rM  :  sT   




zUMT5Stack._update_causal_maskrl  rm  r;   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer;   r   r   )diagonalrD  r4   r   )r   r(   r   r   r  r   triur   reshapeexpandr  r   r6   masked_fill)r   rl  rm  r;   r   r   kwargsr   rz  mask_lengthpadding_maskr1   r1   r2   ru  ~  s,    $
6  z?UMT5Stack._prepare_4d_causal_attention_mask_with_cache_positionrV   )NNNNNNNNNNNNN)F)rA   rB   rC   r&   r:  r?  r@   r   r(   rX   r
   r   rM  staticmethodr   r;   ru  rD   r1   r1   r/   r2   r+  l  sZ    
 B
Dr+  c                &       sJ  e Zd ZdZdZeZddgZ fddZdd Z	d	d
 Z
dd Zdd Zdd Zdd Ze																d'deej deej deej deej deej deej deej deeeej   deeeej   deej deej dee d ee d!ee d"ee d#eej d$eeej ef f"d%d&Z  ZS )(r  ao  
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(inputs, return_tensors="pt")
    >>> labels = tokenizer(label=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```umt5encoder.embed_tokens.weightdecoder.embed_tokens.weightc                    s   t  | t|j|j| _t|}d|_	d|_
d|_t|| j| _t|}d|_	d|_|j|_t|| j| _|   d S NFT)r%   r&   r   ry   
vocab_sizerL   r  copydeepcopyrj   r   is_encoder_decoderr+  encodernum_decoder_layersr2  decoderr6  r,   rG   encoder_configdecoder_configr/   r1   r2   r&     s   

zUMT5Model.__init__c                 C   r7  rV   r  r9  r1   r1   r2   r:    r;  zUMT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S rV   r  r  r?  r  r=  r1   r1   r2   r?       zUMT5Model.set_input_embeddingsc                 C   4   | j jr| | jj| j | | jj| j d S d S rV   rG   r  _tie_or_clone_weightsr  r0  r  r  r9  r1   r1   r2   _tie_weights     zUMT5Model._tie_weightsc                 C   r7  rV   r  r9  r1   r1   r2   get_encoder  r;  zUMT5Model.get_encoderc                 C   r7  rV   r  r9  r1   r1   r2   get_decoder  r;  zUMT5Model.get_decoderc                 C   s*   |  D ]\}}| jj| j| qdS )
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   	attentionprune_headsr,   heads_to_pruner   headsr1   r1   r2   _prune_heads  s   zUMT5Model._prune_headsNr   r   r   r   rR  decoder_head_maskrS  encoder_outputsrG  rC  decoder_inputs_embedsr   r   rJ  rT  r   r~   c                 C   s   |dur|n| j j}|dur|n| j j}|du r%| j|||
||||d}n$|rIt|tsIt|d t|dkr:|d ndt|dkrE|d ndd}|d }| j||||	|||||||||d}|se|| S t|j	|j
|j|j|j|j	|j|jdS )	a+  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr   r   rC  rR  r   rJ  rT  r   r   r3   rF  r>   rH  r   r   rC  rG  r   r   rR  rS  r   r   rJ  rT  r   )rF  rG  decoder_hidden_statesdecoder_attentionsrI  encoder_last_hidden_stater   encoder_attentions)rG   r   rK  r  rW   r   lenr  r   rF  rG  r>   rH  rI  )r,   r   r   r   r   rR  r  rS  r  rG  rC  r  r   r   rJ  rT  r   r>   decoder_outputsr1   r1   r2   r@     s\   Q	zUMT5Model.forwardNNNNNNNNNNNNNNNN)rA   rB   rC   r   
model_typer    r#  _tied_weights_keysr&   r:  r?  r  r  r  r  r   r   r(   
LongTensorFloatTensor
BoolTensorrX   r   r   r   r   r@   rD   r1   r1   r/   r2   r    s    	
r  z<
    UMT5 Model with a `language modeling` head on top.
    )custom_introc                (       sv  e Zd ZdZdZg dZ fddZdd Zdd	 Zd
d Z	dd Z
dd Zdd Zdd Ze																	d-deej deej deej deej deej deej deej deeeej   deeeej   deej deej d eej d!ee d"ee d#ee d$ee d%eej d&eeej ef f$d'd(Zd ejfd)d*Zed+d, Z  ZS ).r  a  
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```r  )r  r  zlm_head.weightc                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _tj|j|jdd| _|   d S )NFTrI   )r%   r&   rL   	model_dimr   ry   r  r  r  r  rj   r   r  r+  r  r  r2  r  rK   r  r6  r  r/   r1   r2   r&     s   

z%UMT5ForConditionalGeneration.__init__c                 C   r7  rV   r  r9  r1   r1   r2   r:    r;  z1UMT5ForConditionalGeneration.get_input_embeddingsc                 C   r  rV   r  r=  r1   r1   r2   r?    r  z1UMT5ForConditionalGeneration.set_input_embeddingsc                 C   r  rV   r  r9  r1   r1   r2   r    r  z)UMT5ForConditionalGeneration._tie_weightsc                 C   r<  rV   r  r=  r1   r1   r2   set_output_embeddings  r@  z2UMT5ForConditionalGeneration.set_output_embeddingsc                 C   r7  rV   r  r9  r1   r1   r2   get_output_embeddings  r;  z2UMT5ForConditionalGeneration.get_output_embeddingsc                 C   r7  rV   r  r9  r1   r1   r2   r    r;  z(UMT5ForConditionalGeneration.get_encoderc                 C   r7  rV   r  r9  r1   r1   r2   r    r;  z(UMT5ForConditionalGeneration.get_decoderNr   r   r   r   rR  r  rS  r  rG  rC  r  labelsr   r   rJ  rT  r   r~   c                 C   s  |dur|n| j j}|dur|n| j j}|du r%| j|||
||||d}n$|rIt|tsIt|d t|dkr:|d ndt|dkrE|d ndd}|d }|dur^|du r^|du r^| |}| j||||	|||||||||d}|d }| j j	r|| j
d  }| |}d}|durtd	d
}||j}||d|d|d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )aK  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```Nr  r   r   r3   r  r  r  r  ignore_indexr4   	losslogitsrG  r  r  rI  r  r   r  )rG   r   rK  r  rW   r   r  r"  r  r  r  r  r   r6   r   r   r   r   rG  r>   rH  rI  rF  )r,   r   r   r   r   rR  r  rS  r  rG  rC  r  r  r   r   rJ  rT  r   r>   r  sequence_output	lm_logitsr  loss_fctoutputr1   r1   r2   r@     sv   U	


z$UMT5ForConditionalGeneration.forwardc                 C   s
   |  |S rV   )r"  )r,   r  r1   r1   r2   %prepare_decoder_input_ids_from_labels}  r@  zBUMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr1   c                 3   s$    | ]}| d  |jV  qdS )r   N)index_selectr6   r   )r,  
past_statebeam_idxr1   r2   rE    s   " z>UMT5ForConditionalGeneration._reorder_cache.<locals>.<genexpr>)r   )rG  r  reordered_past
layer_pastr1   r  r2   _reorder_cache  s   z+UMT5ForConditionalGeneration._reorder_cache)NNNNNNNNNNNNNNNNN)rA   rB   rC   r   r  r  r&   r:  r?  r  r  r  r  r  r   r   r(   r  r  r  rX   r   r   r   r   r@   r  r  r  rD   r1   r1   r/   r2   r    s    	
 #r  c                       s   e Zd ZdZdZdgZ fddZdd Zdd	 Zd
d Z	dd Z
dd Ze							ddeej deej deej deej dee dee dee deeej ef fddZ  ZS )r  a  
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```r  r  c                    sN   t  | t|j|j| _t|}d|_	d|_
t|| j| _|   d S NF)r%   r&   r   ry   r  rL   r  r  r  r   r  r+  r  r6  )r,   rG   r  r/   r1   r2   r&     s   
zUMT5EncoderModel.__init__c                 C   r7  rV   r  r9  r1   r1   r2   r:    r;  z%UMT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S rV   )r  r  r?  r=  r1   r1   r2   r?    s   z%UMT5EncoderModel.set_input_embeddingsc                 C   s"   | j jr| | jj| j d S d S rV   )rG   r  r  r  r0  r  r9  r1   r1   r2   r    s   zUMT5EncoderModel._tie_weightsc                 C   r7  rV   r  r9  r1   r1   r2   r    r;  zUMT5EncoderModel.get_encoderc                 C   s0   |  D ]\}}| jj| jd j| qdS )r  r   N)r  r  r3  r   r   r  r  r1   r1   r2   r    s   zUMT5EncoderModel._prune_headsNr   r   rR  rC  r   rJ  rT  r~   c           	   	   C   s0   |dur|n| j j}| j|||||||d}|S )aQ  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  )rG   rK  r  )	r,   r   r   rR  rC  r   rJ  rT  r  r1   r1   r2   r@     s   #
zUMT5EncoderModel.forward)NNNNNNN)rA   rB   rC   r   r  r  r&   r:  r?  r  r  r  r   r   r(   r  r  r   r   r   r   r@   rD   r1   r1   r/   r2   r    sF    	r  z
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                $       s   e Zd ZdgZddgZdef fddZe															ddee	j
 d	ee	j d
ee	j
 dee	j
 dee	j dee	j dee	j deee	j  dee	j dee	j dee	j
 dee dee dee dee deeef f ddZ  ZS )UMT5ForSequenceClassificationFdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightr  r  rG   c                    s2   t  | t|| _t|| _|   d| _d S r  )r%   r&   r  r   r   classification_headr6  model_parallelrU   r/   r1   r2   r&     s
   


z&UMT5ForSequenceClassification.__init__Nr   r   r   r   rR  r  rS  r  rC  r  r  r   r   rJ  rT  r~   c                 C   sh  |dur|n| j j}|durd}|du r!|	dur!td| jj |du r6|
du r6|du r1td| |}| j|||||||||	|
||||d}|d }|| j j	
|j}tt|ddkrhtd|j\}}}||ddf |d	|ddd	ddf }| |}d}|dur|
|j}| j jdu r| j jdkrd
| j _n| j jdkr|jtjks|jtjkrd| j _nd| j _| j jd
krt }| j jdkr|| | }n-|||}n'| j jdkrt }||d	| j j|d	}n| j jdkrt }|||}|s |f|dd  }|dur|f| S |S t|||j|j|j|j |j!|j"|j#d	S )as
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   r   r   rR  r  rS  r  rC  r  r   r   rJ  rT  r   r   z7All examples must have the same number of <eos> tokens.r4   
regressionsingle_label_classificationmulti_label_classificationr  )$rG   rK  NotImplementedErrorr0   rA   r  r"  r   eqeos_token_idr6   r   r  r(   unique_consecutivesumr   r   r  problem_typer   r;   r   r   r   squeezer   r   r   rG  r  r  rI  r  r   r  )r,   r   r   r   r   rR  r  rS  r  rC  r  r  r   r   rJ  rT  r   r  eos_maskr   r]  r-   sentence_representationr  r  r  r  r1   r1   r2   r@     s   >
,


$

z%UMT5ForSequenceClassification.forward)NNNNNNNNNNNNNNN)rA   rB   rC   "_keys_to_ignore_on_load_unexpectedr  r    r&   r   r   r(   r  rX   r   r  r   r   r   r   r@   rD   r1   r1   r/   r2   r    sj    
	

r  c                       s   e Zd ZdgZdgZdef fddZe								ddee	j
 dee	j
 d	ee	j
 d
ee	j
 dee	j
 dee dee dee deee	j
 ef fddZ  ZS )r  r  z'transformer.encoder.embed_tokens.weightrG   c                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S rV   )r%   r&   r   r  r   r   rP   r   rR   rK   r-   r  r6  rU   r/   r1   r2   r&     s   
z#UMT5ForTokenClassification.__init__Nr   r   rR  rC  r  r   rJ  rT  r~   c	              	   C   s   |dur|n| j j}| j|||||||d}	|	d }
| |
}
| |
}d}|dur:t }||d| j|d}|sO||	dd f}|durM|f| S |S t|||	j	|	j
dS )aB  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   rR  rC  r   rJ  rT  r   r4   r3   )r  r  r>   rH  )rG   rK  r   rR   r  r   r   r   r   r>   rH  )r,   r   r   rR  rC  r  r   rJ  rT  r   r>   r  r  r  r  r1   r1   r2   r@     s4   


z"UMT5ForTokenClassification.forward)NNNNNNNN)rA   rB   rC   r  r  r    r&   r   r   r(   rX   r   r   r   r   r@   rD   r1   r1   r/   r2   r    s@    	
r  c                &       s.  e Zd ZddgZ fddZdd Zdd Zd	d
 Zdd Zdd Z	e
																d#deej deej deej deej deej deej deej deeeej   deej deej deej deej dee dee dee dee d eeej ef f"d!d"Z  ZS )$r  r  r  c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _|j| _t|j|j| _|   d S r  )r%   r&   rL   r  r   ry   r  r  r  r  rj   r   r  r+  r  r  r2  r  r   rK   r  r6  r  r/   r1   r2   r&     s    

z!UMT5ForQuestionAnswering.__init__c                 C   r7  rV   r  r9  r1   r1   r2   r:  
  r;  z-UMT5ForQuestionAnswering.get_input_embeddingsc                 C   r  rV   r  r=  r1   r1   r2   r?    r  z-UMT5ForQuestionAnswering.set_input_embeddingsc                 C   r  rV   r  r9  r1   r1   r2   r    r  z%UMT5ForQuestionAnswering._tie_weightsc                 C   r7  rV   r  r9  r1   r1   r2   r    r;  z$UMT5ForQuestionAnswering.get_encoderc                 C   r7  rV   r  r9  r1   r1   r2   r    r;  z$UMT5ForQuestionAnswering.get_decoderNr   r   r   r   rR  r  rS  r  start_positionsend_positionsrC  r  r   r   rJ  rT  r~   c                 C   sj  |dur|n| j j}|dur|n| j j}|	dur|
durd}|du r3|du r3|du r.td| |}|dur9|n| j j}|durC|n| j j}|du rX| j|||||||d}n$|r|t|ts|t|d t|dkrm|d ndt|dkrx|d ndd}|d }| j	|||d||||||||d	}|d }| 
|}|jdd
d\}}|d
 }|d
 }d}|	dur|
durt|	 dkr|	d
|j}	t|
 dkr|
d
|j}
|d}|	d|}	|
d|}
t|d}|||	}|||
}|| d }|s ||f|dd  | }|dur|f| S |S t||||j|j|j|j|j|j|jd
S )aI	  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NFr  r  r   r   r3   r  )r   r   rC  rG  r   r   rR  rS  r   r   rJ  rT  r4   r   r  )
r  start_logits
end_logitsrG  r  r  rI  r  r   r  )rG   rK  r   r  r"  r  rW   r   r  r  r  splitr  r   r   r6   r   r   r   r   rG  r>   rH  rI  rF  )r,   r   r   r   r   rR  r  rS  r  r  r  rC  r  r   r   rJ  rT  r>   r  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  r1   r1   r2   r@   !  s   <
	




z UMT5ForQuestionAnswering.forwardr  )rA   rB   rC   r  r&   r:  r?  r  r  r  r   r   r(   r  r  r  rX   r   r   r   r   r@   rD   r1   r1   r/   r2   r    sx    	
r  )r  r  r  r  r  r  r   )Fr   r  r   typingr   r   r(   r   torch.nnr   r   r   activationsr	   cache_utilsr
   r   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   configuration_umt5r    !torch.nn.attention.flex_attentionr!   integrations.flex_attentionr"   
get_loggerrA   rs   Moduler#   rF   r\   ra   rh   r   r   r   r   r   r+  r  r  r  r  r  r  __all__r1   r1   r1   r2   <module>   sp   $	$	
 CMr  M S |l $L R