o
    i                    @   s  d Z ddlZddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- e& rddl.m/Z/ ddl0m1Z1 e)2e3Z4d^dej5de6de6de6dej5f
ddZ7dej5de6de6dej5fddZ8d^dej5de6de6de6dej5f
d d!Z9de6dej5fd"d#Z:d$ej5de6dej5fd%d&Z;d'ej5de6d(ej<dej5fd)d*Z=d'ej5d+e6de>ej5ej5f fd,d-Z?d'ej5d+e6dej5fd.d/Z@d0ej5d1ej5d2e6dej5fd3d4ZAG d5d6 d6e	jBZCzdd7lDmEZE eEZCe4Fd8 W n eGyA   Y n eHyO   e4Id9 Y nw G d:d; d;e	jBZJG d<d= d=e	jBZKG d>d? d?e	jBZLG d@dA dAe	jBZMG dBdC dCe	jBZNG dDdE dEe	jBZOG dFdG dGe	jBZPG dHdI dIe	jBZQG dJdK dKe	jBZRG dLdM dMe	jBZSG dNdO dOeZTe%G dPdQ dQeZUG dRdS dSeUZVdTZWe%G dUdV dVeUZXe%dWdXG dYdZ dZeUeZYe%G d[d\ d\eUZZg d]Z[dS )_zPyTorch LongT5 model.    N)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )LongT5Config)	BlockMask)make_flex_block_causal_maskx	block_lendim	pad_valuereturnc                 C   s   | j |  | }t| j s"t| j }||  |7  < tj|| jdS dg| j }d|f||< t|ddd d}tj	j
| |d|d} | S )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr(   ndimsumr   
functionalr.   )r"   r#   r$   r%   pad_len	new_shaper.   r+   r+   ^/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multiple@   s   

r<   c                 C   s~   | j | | dkrt| ||dd} | j | | }| j d| ||f | j |d d  }d|v r:tj|| j| jdS | |S )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    r   )r%   Nr   r(   device)r1   r<   r4   emptyr(   r>   reshape)r"   r#   r$   
num_blocksoutput_shaper+   r+   r;   _split_into_blocksP   s   (
rC   	block_dimsequence_dimc           	      C   s   | j | }dg| j }d||< t|ddd d}tjj| |d|d} g }tdD ]}td	dg| j }t||| ||< t|}|	| |  q)t
j||d
S )zConcatenate three consecutive blocks for each input block for local attentiont.

    For more information, see: https://huggingface.co/papers/2112.07916.
    r)   )r   r   Nr*   r+   r,   r-   r   r   r$   )r1   r6   r7   r   r8   r.   rangeslicetupleappendr4   cat)	r"   rD   rE   r%   rA   r.   blocks_listiindicesr+   r+   r;   _concatenate_3_blocks_   s   
rO   c                 C   s:   t jd|  t jd}|| |   }|d|d }|S )z:Makes 3-blocked relative position ids for local attention.r   r'   r   r   )r4   arangeint32	unsqueeze)r#   position_idscenter_position_idsrelative_position_idsr+   r+   r;   "_make_3block_relative_position_idsx   s   rV   local_attention_maskc                 C   sF   t |}t||k }|ddddddf }|| j}t| |S )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)rV   r4   abstor>   logical_and)rW   r#   rU   locality_maskr+   r+   r;   _mask_local_attention_mask   s
   r\   attention_maskr>   c                 C   sV   t | |dd}t|ddd}|d}|d}t||}t||}|d|S )z;Prepare attention mask to be applied for a local attention.r   rF      rD   rE   r*   )rC   rO   rR   r4   rZ   r\   rY   )r]   r#   r>   _blocked_attention_mask_3blocked_attention_maskrW   r+   r+   r;   _get_local_attention_mask   s   


rc   global_block_sizec                    s^  | j dd \}dtjdtjf fdd}tj| | jd  }tj|dd	| }t| d
kdd| j}t	|| d | j}tj
d|j|jd}t||k||}||  | d  }||}  }|dkr|tj|ddj|ddd}	ntj|d|j|jd}	tjt||ddd }
|
| j}
t|
|	kdd}
|tj|
tjfS )a  Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformr implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    Nr^   	block_idsr&   c                    sd   t    d k}|| j}t || dk}|dd| jd }t 	| |k | |} | S )Nr   r   r*   )
r4   rP   rY   r>   rZ   r7   rR   typer(   where)re   
block_endstrue_block_endsfull_blocksrd   seq_lenr+   r;   handle_orphan_tokens   s   z:_make_global_fixed_block_ids.<locals>.handle_orphan_tokensr>   r   )axis              ?g     @r*   r=   r   rF   )r1   r4   Tensor	ones_liker>   cumsumrg   rf   r(   floortensormaxvaluesrepeat	transposer5   onesrY   int)r]   rd   
batch_sizerm   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsr+   rk   r;   _make_global_fixed_block_ids   s,   
"r   c                 C   s@   t | |\}}|jd }tj||jd}||d  }|tjS )zBCreate the relative position tensor for local -> global attention.r*   rn   .N)r   r1   r4   rP   r>   rf   int64)r]   rd   re   r   global_seq_lenglobal_positionsside_relative_positionr+   r+   r;    _make_side_relative_position_ids   s
   
r   hidden_statesre   r   c                 C   sf   | |dktj||j|jd}tj|tj	|d ddddddf }t
d| || jS )zFCompute individual block aggregates by summing over individual blocks.r   r=   r   Nr*   z...nd,...ng->...gd)rg   r4   rv   r(   r>   r   r8   one_hotrf   r   einsum)r   re   r   one_hot_block_idsr+   r+   r;   _create_global_aggregates   s
   0r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )LongT5LayerNormư>c                    s&   t    tt|| _|| _dS )zg
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parameterr4   r{   weightvariance_epsilon)selfhidden_sizeeps	__class__r+   r;   r      s   

zLongT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )Nr^   r*   T)keepdim)rY   r4   float32powmeanrsqrtr   r   r(   float16bfloat16)r   r   variancer+   r+   r;   forward   s
   
zLongT5LayerNorm.forward)r   )__name__
__module____qualname__r   r   __classcell__r+   r+   r   r;   r      s    r   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                       *   e Zd Zdef fddZdd Z  ZS )LongT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr   r   r   r+   r;   r   
  s
   
zLongT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)r   r   r   
isinstancer   r   r4   rr   r(   int8rY   )r   r   r+   r+   r;   r     s   



zLongT5DenseActDense.forwardr   r   r   r   r   r   r   r+   r+   r   r;   r   	  s    r   c                       r   )LongT5DenseGatedActDenser   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r   r   r   r   r   r+   r;   r      s   
z!LongT5DenseGatedActDense.__init__c                 C   s:   |  | |}| |}|| }| |}| |}|S r   )r   r   r   r   r   )r   r   hidden_geluhidden_linearr+   r+   r;   r   (  s   


z LongT5DenseGatedActDense.forwardr   r+   r+   r   r;   r     s    r   c                       r   )LongT5LayerFFr   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   r   r+   r;   r   3  s   

zLongT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S r   )r   r   r   )r   r   forwarded_statesr+   r+   r;   r   =  s   

zLongT5LayerFF.forwardr   r+   r+   r   r;   r   2  s    
r   c                       sz   e Zd Z		ddedee f fddZdd ZedddZ	dddZ
edddd									dddZ  ZS )LongT5AttentionFNr   	layer_idxc                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _t | _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr   r   r   r   r   r+   r;   r   F  s.   

zLongT5Attention.__init__c                 C      t |dkrd S t|| j| j| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt | | _| j| j | _
| j|| _d S Nr   r   rF   lenr   r   r   r   r   r   r   r   r   r   unionr   headsindexr+   r+   r;   prune_headsi     zLongT5Attention.prune_headsT       c                 C      d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r^   r   rY   r4   longrX   min
zeros_likelogfloatmath	full_likerg   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larger+   r+   r;   _relative_position_buckety  s*   z)LongT5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|| j | j	| j
d}|  |}	|	g dd}	|	S )%Compute binned relative position biasNr=   r   r   r   r^   r   r   r   )r   r   r>   r4   rP   r   rY   r  r   r   r   permuterR   )
r   query_length
key_lengthr>   cache_positioncontext_positionmemory_positionr   relative_position_bucketrx   r+   r+   r;   compute_bias  s    
 
zLongT5Attention.compute_biaspast_key_valuepast_key_values4.58new_nameversionc                 C   s  |j dd \}}|du}| |}||d| j| jdd}d}t|tr8|j	| j
}|r4|j}n|j}n|}|r>|n|}|rW|durW|rW|j| j
 j}|j| j
 j}nJ| |}| |}||d| j| jdd}||d| j| jdd}|dur|s|
nd}
|||| j
d|
i\}}|rt|trd|j| j
< t||dd}|du r|j d	 }|dur|n|
d d }| jstjd| j||f|j|jd
}| jr| jrd|_n| j|||j|
d}|dddd| dddf }|dur|ddddddd|j d	 f }|| }| jr2t|j d }d|t| j< |dd|  f }n|}||7 }t!j"j#|$ dd%|}t!j"j&|| j&| jd}|durY|| }t||}|dd' }||d| j(}| )|}||f}|	r||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr^   r*   r   Fr	  Tr   r`   r>   r(   )r>   r	  r   rF   ptraining)*r1   r   viewr   r   rz   r   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysrx   r   r   updater4   matmulr   r5   r>   r(   r   r  requires_gradr  r   r{   r3   boolr   r8   softmaxr   type_asr   
contiguousr   r   )r   r   r   key_value_statesposition_biasr  layer_head_maskr  	use_cacheoutput_attentionsr	  r}   
seq_lengthis_cross_attentionquery_statesr  curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr  real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputsr+   r+   r;   r     s|   






"
&


zLongT5Attention.forwardFNTr   r   )NN)	NNNNNNFFN)r   r   r   r   r   r|   r   r   staticmethodr  r  r   r   r   r+   r+   r   r;   r   E  s.    #
/r   c                       sb   e Zd Zddededdf fddZdd	 ZedddZde	fddZ
				dddZ  ZS )LongT5LocalAttentionFr   r   r&   Nc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrmt| j| j
| _t | _d| _d S )Nr   Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr#   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   r;   r   /  s(   

zLongT5LocalAttention.__init__c                 C   r   r   r   r   r+   r+   r;   r   I  r   z LongT5LocalAttention.prune_headsTr   r   c                 C   r   r   r   r   r+   r+   r;   r  Y  *   z.LongT5LocalAttention._relative_position_bucketblock_lengthc                 C      | j jjjdkr| j jjnd}tjd| tj|d}|||  }|dddf |dddf  }| j|| j | j	| j
d}|  |}|g ddd}|S r  metaNr   r=   r  r  r   r   r   r>   rf   r4   rP   r   r  r   r   r   r  rR   r   r@  target_devicer  r
  r   r  rx   r+   r+   r;   r        
 
z!LongT5LocalAttention.compute_biasc                    s  |j d d \ } fdd} fdd}||}	||}
||}t|	jdd}	t|
jdd}
t|jdd}t|
ddd}
t|ddd}td	|	|
}|d u rj	s~tj
ddjjd
j f|j|jd}jr}jr}d|_nj}|d urt|dkdd}||dd }||7 }tjj| dd|}tjj|jjd}|d ur|| }||j}|td||}|d d d |d d f }|}||f}|r||f }|S )Nr^   c                       |   djjS 
projectionr*   r  r   r   statesr}   r   r+   r;   r1        z+LongT5LocalAttention.forward.<locals>.shapec                       |    djS r@   r*   r%  r  r   rL  rN  r+   r;   unshape  rO  z-LongT5LocalAttention.forward.<locals>.unshaper   rF   r_   ...qhd,...khd->...hqkr   r  Tr   rp       _r*   r  ...hqk,...khd->...qhd)r1   r   r   r   rC   r#   rO   r4   r   r   r5   r   r>   r(   r   r  r!  r  rg   rz   r   r8   r#  r   r$  r   rf   r   )r   r   r   r'  r(  r*  r+  r1   rS  r-  r0  r1  r2  r6  r7  r8  r+   rN  r;   r     sR   

zLongT5LocalAttention.forwardFr:  NNNF)r   r   r   r   r"  r   r   r;  r  r|   r  r   r   r+   r+   r   r;   r<  .  s    /r<  c                       s~   e Zd Zddededdf fddZdd	 ZedddZde	fddZ
dejdejdejfddZ				dddZ  ZS )LongT5TransientGlobalAttentionFr   r   r&   Nc                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrqt| j| j
| _t | _| jrt| j| j
| _t|j|jd| _d S )Nr   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   r=  r#   rd   r   r   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasr   r   global_input_layer_normr>  r   r+   r;   r     s.   
z'LongT5TransientGlobalAttention.__init__c                 C   r   r   r   r   r+   r+   r;   r     r   z*LongT5TransientGlobalAttention.prune_headsTr   r   c                 C   r   r   r   r   r+   r+   r;   r     r?  z8LongT5TransientGlobalAttention._relative_position_bucketr@  c                 C   rA  rB  rD  rE  r+   r+   r;   r  Q  rG  z+LongT5TransientGlobalAttention.compute_biasr   r   c                 C   s   t |d |d d d d d f d d d df }t |dkdd}t|| j}| j|| j | j| jd}| 	|}|
g d}|| }|S )Nr   .r   rp   rU  r  )r   r   r   r^   )r4   eqrg   r   rd   r  r   r   r   rZ  r  )r   r   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biasr+   r+   r;   compute_side_biasi  s   0
z0LongT5TransientGlobalAttention.compute_side_biasc                    s2  |j d d \ } fdd} fdd}t|d ur|n	t|j d d j\}	}
|
j d }t||	|}|}||}||}|	|}||}|	|}t
|jdd}t
|jdd}t
|jdd}t|ddd	}t|ddd	}dg|jd  }|j d |d< |d|}|d|}tj||gdd}tj||gdd}td
||}|d urt|j|j}t|dkdd}nd }|d u rEjstjddjjdj f|j|jd}jrjrd|_nj}|d ur||dd }||j}|d u r t |}||
}t
|jdddd}||j |j}tj||gdd}||7 }t!j"j#|$ dd%|}t!j"j&|j&jd}|d urj|| }||j}|td||}|d d d |d d f }'|}||f}|r||f }|S )Nr^   c                    rH  rI  rK  rL  rN  r+   r;   r1     rO  z5LongT5TransientGlobalAttention.forward.<locals>.shapec                    rP  rQ  rR  rL  rN  r+   r;   rS    rO  z7LongT5TransientGlobalAttention.forward.<locals>.unshaper*   r   rF   r_   rT  r   rp   rU  r   r  Tr`   r  rV  )(r1   r   r4   r{   rd   r   r[  r   r   r   rC   r#   rO   r6   rR   ry   rK   r   rc   r>   rg   r   r5   r   r(   r   r  r!  r  rz   rf   ra  rY   r   r8   r#  r   r$  r   r   )r   r   r   r'  r(  r*  r+  r1   rS  re   r   _global_seq_lenglobal_inputsr-  r0  r1  side_key_statesside_value_statesrepsr2  rW   side_position_biasr6  r7  r8  r+   rN  r;   r   ~  s   







z&LongT5TransientGlobalAttention.forwardrW  r:  rX  )r   r   r   r   r"  r   r   r;  r  r|   r  r4   rr   ra  r   r   r+   r+   r   r;   rY    s    /rY  c                       sN   e Zd Zddee f fddZedddd								dd
dZ  ZS )LongT5LayerSelfAttentionFNr   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r   r   r+   r;   r     s   
z!LongT5LayerSelfAttention.__init__r  r  r  r  c	              
   C   sL   |  |}	| j|	|||||||d}
|| |
d  }|f|
dd   }|S )N)r   r'  r(  r  r)  r*  r	  r   r   )r   rj  r   )r   r   r]   r'  r(  r  r)  r*  r	  normed_hidden_statesattention_outputr8  r+   r+   r;   r      s   

z LongT5LayerSelfAttention.forwardr9  )NNNNFFN	r   r   r   r   r|   r   r   r   r   r+   r+   r   r;   rh    s    rh  c                       D   e Zd ZdZd
dee f fddZ				ddefdd	Z  Z	S )LongT5LayerLocalSelfAttentionz$Local self attention used in encoderFNr   c                    <   t    t||d| _t|j|jd| _t	|j
| _d S N)r   r   )r   r   r<  LocalSelfAttentionr   r   r   r   r   r   r   r   r   r   r+   r;   r     s   
z&LongT5LayerLocalSelfAttention.__init__kwargsc           
      K   F   |  |}| j|||||d}|| |d  }|f|dd   }	|	S N)r   r'  r(  r*  r   r   )r   rr  r   
r   r   r]   r'  r(  r*  rs  rk  rl  r8  r+   r+   r;   r   %     
	z%LongT5LayerLocalSelfAttention.forwardr9  rX  
r   r   r   __doc__r   r|   r   r   r   r   r+   r+   r   r;   ro    s    	ro  c                       rn  )'LongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderFNr   c                    rp  rq  )r   r   rY  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r   r   r+   r;   r   >  s   
z0LongT5LayerTransientGlobalSelfAttention.__init__rs  c           
      K   rt  ru  )r   r{  r   rv  r+   r+   r;   r   F  rw  z/LongT5LayerTransientGlobalSelfAttention.forwardr9  rX  rx  r+   r+   r   r;   rz  ;  s    rz  c                       sP   e Zd Zddee f fddZedddd										dd
dZ  ZS )LongT5LayerCrossAttentionNr   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFri  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   r+   r;   r   ^  s   
z"LongT5LayerCrossAttention.__init__r  r  r  r  Fc                 C   sP   |  |}| j|||||||||	|
d
}|| |d  }|f|dd   }|S )N)	r   r&  r'  r(  r  r)  r  r*  r	  r   r   )r   r}  r   )r   r   r&  r]   r'  r(  r  r)  r  r*  r	  rk  rl  layer_outputr8  r+   r+   r;   r   d  s    
z!LongT5LayerCrossAttention.forwardr   )NNNNFNFNrm  r+   r+   r   r;   r|  ]  s    r|  c                       sX   e Zd Zddee f fddZedddd												
	dddZ  ZS )LongT5BlockFNr   c                    s   t    |j| _|jrt}n|jdkrt}n|jdkrt}n	td|j dt	 | _
| j
||||d | jrE| j
t||d | j
t| d S )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .ri  )r   )r   r   r   rh  encoder_attention_typero  rz  
ValueErrorr   
ModuleListlayerrJ   r|  r   )r   r   r   r   attention_layerr   r+   r;   r     s(   



zLongT5Block.__init__r  r  r  r  Tc                 C   sX  | j d |||||	|
||d}|d }|dd  }|jtjkr8t| r8t|jjd }tj|| |d}| j	o>|d u}|r| j d ||||||	|d d |
||d
}|d }|jtjkryt| ryt|jjd }tj|| |d}||dd   }| j d |}|jtjkrt| rt|jjd }tj|| |d}|f| S )Nr   )r]   r'  r(  r  r)  r*  r	  r   i  )r   rw   r*   )	r&  r]   r'  r(  r  r  r)  r*  r	  )
r  r(   r4   r   isinfanyfinforw   clampr   )r   r   r]   r'  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr(  cross_attn_layer_head_maskr  r)  r*  return_dictr	  self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr+   r+   r;   r     sP   

zLongT5Block.forwardr9  )NNNNNNNNFFTNrm  r+   r+   r   r;   r    s     r  c                       s`   e Zd ZU eed< dZdZdgZdZe	dd Z
dd	 Ze fd
dZdd Zdd Z  ZS )LongT5PreTrainedModelr   transformerTr  Fc                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r4   rv   r   r   )r   r  
input_maskdummy_inputsr+   r+   r;   r    s   

z"LongT5PreTrainedModel.dummy_inputsc                 C   sJ   | }| d}|dD ]}t||s d S t||}q| || j d S )Nz.weightr  )removesuffixsplithasattrgetattr_tie_or_clone_weightsshared)r   keymodulesub_keyr+   r+   r;   _try_load_missing_tied_module  s   

z3LongT5PreTrainedModel._try_load_missing_tied_modulec                    s   | dd}d|d< t j|i |\}}| dg }t|dr;t|dr;|D ]}td| d| d	 || q'|rA||fS |S )
Noutput_loading_infoFTmissing_keysr  _tied_weights_keysz!Recovering a missing tied weight z2 from a legacy LongT5 checkpoint. Consider saving zF in your checkpoint or updating the config (tie_word_embeddings=true).)r  r   from_pretrainedr  r   warningr  )r   argsrs  requested_loading_infomodelloading_infor  missing_keyr   r+   r;   r    s   z%LongT5PreTrainedModel.from_pretrainedc                 C   s  | j j}t|tr|jj|d  dS t|ttt	frC|j
jjjd|d d t|dr?| j jsA|jjjjd|d d dS dS dS t|tr|jjjjd|| j jd  d t|jdrk|jjdurk|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  dS dS dS t|tr|jjjjd|| j jd  d t|jdr|jjdur|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  |jjjjd|| j jd  d t|jdr|jjdur	|jjj  dS dS dS t|tttfr| j j}| j j}| j j}|jjjjd||| d  d |jjjjd||d  d |j jjjd||d  d |j!jjjd||| d  d |j"r|j#jjjd||d  d t|tr|j$jjjd||d  d dS dS dS dS )zInitialize the weightsrq   rp   )r   stdlm_head      r   N)%r   initializer_factorr   r   r   datafill_LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelr  normal_r  tie_word_embeddingsr  r   r   r   r   zero_r   r   r   r   r   r   r<  rY  r   r   r   r   r   r   r   r   rZ  )r   r  factorr   r   r   r+   r+   r;   _init_weights  sX   

       


z#LongT5PreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u rtdt|r1t|jd d d |}tj||dd df gdd}n|	|j}|dd df 
 |ddd f< ||d< |d u rStd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information.r*   )r   .rF   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr  r   r4   fullr1   rK   	new_zerosclonemasked_fill_)r   r  r  r  shifted_input_idsr+   r+   r;   _shift_rightG  s      z"LongT5PreTrainedModel._shift_right)r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_can_compile_fullgraphpropertyr  r  classmethodr  r  r  r   r+   r+   r   r;   r    s   
 


1r  c                       s   e Zd Zd fdd	Zdd Z													dddZ	dd	eejd
f dejdejde	de
f
ddZed	ejdededejdejdefddZ  ZS )LongT5StackNc                    s   t    t j j| _|d ur|j| j_ j| _ j	| _	| j	d | _
t fddt jD | _t j jd| _t j| _d| _|   d S )Nr   c                    s"   g | ]}t  t|d k|dqS )r   ri  )r  r"  ).0rM   r   r+   r;   
<listcomp>p  s    z(LongT5Stack.__init__.<locals>.<listcomp>r   F)r   r   r   r   
vocab_sizer   embed_tokensr   r   r=  r#   r  rG   
num_layersblockr   r   final_layer_normr   r   r   r   	post_init)r   r   r  r   r  r;   r   d  s    

zLongT5Stack.__init__c                 C   s
   || _ d S r   )r  r   new_embeddingsr+   r+   r;   set_input_embeddings~     
z LongT5Stack.set_input_embeddingsc           %      C   s  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d ur$|n| j j}|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|	rtd
 d}	|d u r| jd usJ d| |}|\}}| jr|	r|d u r| j jrtt| j dt| j d}nt| j d}n| jsd }|d ur| nd}|d u rtj||| |jd}|d u rt s|| }tj|||jd}| jr| |||t|tr|jn||
}n| j jdkrt|| j|j}n|}| jr:|d ur:| \}}}||f}|d u r4tj||jd}| |}nd }| || j j}| || j j}|rQdnd }|
rXdnd }|
rc| jrcdnd }d }d }|  |}t!| j"D ]U\} }!||  }"||  }#|r||f }|!|||||||"|#||	|
||d}$|$d }|$d }| jr|d ur|$|
rdnd }|
r||$d f }| jr||$d f }qs| #|}|  |}|r||f }|st$dd |||||fD S t%|||||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer*   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsr  r   rn   r  r+   )r(  r  r  r)  r*  r  r	  r   r   r^      c                 s   s    | ]	}|d ur|V  qd S r   r+   )r  r   r+   r+   r;   	<genexpr>!  s    z&LongT5Stack.forward.<locals>.<genexpr>)last_hidden_stater  r   
attentionscross_attentions)&r   r)  r*  output_hidden_statesuse_return_dictr   r  sizer  r   r  r   r   r  is_encoder_decoderr   r
   get_seq_lengthr4   rP   r>   r   r{   _update_causal_maskr   r  r  rc   r#   invert_attention_maskget_head_maskr  r   	enumerater  r  rI   r   )%r   r  r]   r  r  r  	head_maskcross_attn_head_maskr  r)  r*  r  r  r	  err_msg_prefixinput_shaper}   r+  past_key_values_lengthmask_seq_lengthr4  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr'  r  r   rM   layer_moduler(  r  layer_outputsr+   r+   r;   r     s   







zLongT5Stack.forwardFr]   r    input_tensorr	  r  r*  c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2rp   flex_attentionr   Fsdpa)r  r  is_trainingr   r*   )sequence_lengthtarget_lengthr(   r	  r}   )cudaxpunpu)r   _attn_implementationr  r   r4   rr   r!   r  is_compileabler   _ignore_causal_mask_sdpar  r(   r1   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr>   rf   r  r   _unmask_unattended)r   r]   r  r	  r  r*  past_seen_tokensusing_compilable_cacher(   r  r  r4  	min_dtyper+   r+   r;   r  5  sT   




zLongT5Stack._update_causal_maskr  r  r(   r}   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuer(   r>   r   )diagonalrn   r*   r   )r$   r4   r  r   r  r>   triurP   r@   expandr  r1   rY   masked_fill)r]   r  r  r(   r	  r}   rs  r4  r  mask_lengthpadding_maskr+   r+   r;   r  y  s,    $
6  zALongT5Stack._prepare_4d_causal_attention_mask_with_cache_positionr   )NNNNNNNNNNNNNrW  )r   r   r   r   r  r   r   r4   rr   r	   r"  r  r;  r|   r(   r  r   r+   r+   r   r;   r  c  sX    
 ;
Dr  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
c                &       s8  e Zd ZdgZddgZdef fddZdd Zd	d
 Zdd Z	dd Z
dd Ze																d%deej deej deej deej deej deej deej deeeej   dee deej deej dee dee dee d ee d!eej d"eeej ef f"d#d$Z  ZS )&r  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    s   t  | t|j|j| _t|}d|_	d|_
d|_t|| j| _t|}d|_	d|_|j|_t|| j| _|   d S )NFT)r   r   r   r   r  r   r  copydeepcopyr   r)  tie_encoder_decoderr  encodernum_decoder_layersr  decoderr  r   r   encoder_configdecoder_configr   r+   r;   r     s   

zLongT5Model.__init__c                 C      | j S r   r  r   r+   r+   r;   get_input_embeddings     z LongT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S r   r  r  r  r  r  r+   r+   r;   r       z LongT5Model.set_input_embeddingsc                 C   4   | j jr| | jj| j | | jj| j d S d S r   r   r  r  r  r  r  r  r%  r+   r+   r;   _tie_weights     zLongT5Model._tie_weightsc                 C   r#  r   r  r%  r+   r+   r;   get_encoder  r'  zLongT5Model.get_encoderc                 C   *   |  D ]\}}| jj| j| qdS z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        Nitemsr  r  	attentionr   r   heads_to_pruner  r   r+   r+   r;   _prune_heads     zLongT5Model._prune_headsNr  r]   r  r  r  decoder_head_maskr  encoder_outputsr  r  decoder_inputs_embedsr)  r*  r  r  r	  r&   c                 C   s"  |dur|n| j j}|dur|n| j j}|dur,|du r,| j j| j jkr,ttt |}|du r=| j	|||
||||d}n$|rat
|tsat|d t|dkrR|d ndt|dkr]|d ndd}|d }| j||||	|||||||||d}|s}|| S t|j|j|j|j|j|j|j|jdS )	ax  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  r]   r  r  r*  r  r  r   r   r^   r  r   r  r  r]   r  r  r  r  r  r  r)  r*  r  r  r	  )r  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)r   r)  r  r  r  warningswarn#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningr  r   r   r   r  r   r  r  r   r  r  )r   r  r]   r  r  r  r:  r  r;  r  r  r<  r)  r*  r  r  r	  r   decoder_outputsr+   r+   r;   r     sd   Q	zLongT5Model.forward)NNNNNNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpectedr  r   r   r&  r  r-  r0  r8  r   r   r4   
LongTensorFloatTensor
BoolTensorrr   rI   r	   r"  r   r   r   r   r+   r+   r   r;   r    s|    	
r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc                (       sL  e Zd ZdgZg dZdef fddZdd Zdd	 Zd
d Z	dd Z
e																	d%deej deej deej deej deej deej deej deeeej   dee deej deej deej dee dee dee dee deej d eeej ef f$d!d"Zdejfd#d$Z  ZS )&r  r  )r  r  zlm_head.weightr   c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _tj|j|jdd| _|   d S )NFTr   )r   r   r   	model_dimr   r   r  r  r  r  r   r)  r  r  r  r  r  r  r   r  r  r   r   r+   r;   r     s   

z'LongT5ForConditionalGeneration.__init__c                 C   r#  r   r$  r%  r+   r+   r;   r&    r'  z3LongT5ForConditionalGeneration.get_input_embeddingsc                 C   r(  r   r)  r  r+   r+   r;   r    r*  z3LongT5ForConditionalGeneration.set_input_embeddingsc                 C   r+  r   r,  r%  r+   r+   r;   r-    r.  z+LongT5ForConditionalGeneration._tie_weightsc                 C   r#  r   r/  r%  r+   r+   r;   r0    r'  z*LongT5ForConditionalGeneration.get_encoderNr  r]   r  r  r  r:  r  r;  r  r  r<  labelsr)  r*  r  r  r	  r&   c                 C   s  |dur|n| j j}|dur|n| j j}|dur,|du r,| j j| j jkr,ttt |}|du r=| j	|||
||||d}n$|rat
|tsat|d t|dkrR|d ndt|dkr]|d ndd}|d }|durv|du rv|du rv| |}| j||||	|||||||||d}|d }| j jr|| jd  }| |}d}|durtd	d
}||j}||d|d|d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nr=  r   r   r^   r>  r?  r  r  )ignore_indexr*   )	losslogitsr  r@  rA  r  rB  r  rC  )r   r)  r  r  r  rD  rE  6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGrG  r  r   r   r   r  r  r  rN  r  r   rY   r>   r  r  r   r  r   r  r  r  )r   r  r]   r  r  r  r:  r  r;  r  r  r<  rO  r)  r*  r  r  r	  r   rH  sequence_output	lm_logitsrQ  loss_fctoutputr+   r+   r;   r     s~   U	


z&LongT5ForConditionalGeneration.forwardc                 C   s
   |  |S r   )r  )r   rO  r+   r+   r;   %prepare_decoder_input_ids_from_labelsV  r  zDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNNNNN)r   r   r   rI  r  r   r   r&  r  r-  r0  r   r   r4   rJ  rK  rL  rr   rI   r	   r"  r   r   r   rX  r   r+   r+   r   r;   r  {  s    	
 )r  c                       s   e Zd ZdgZdgZdef fddZdd Zdd	 Zd
d Z	dd Z
dd Ze							ddeej deej deej deej dee dee dee deeej ef fddZ  ZS )r  r  r  r   c                    sN   t  | t|j|j| _t|}d|_	d|_
t|| j| _|   d S )NF)r   r   r   r   r  r   r  r  r  r)  r  r  r  r  )r   r   r!  r   r+   r;   r   _  s   
zLongT5EncoderModel.__init__c                 C   r#  r   r$  r%  r+   r+   r;   r&  k  r'  z'LongT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S r   )r  r  r  r  r+   r+   r;   r  n  s   z'LongT5EncoderModel.set_input_embeddingsc                 C   s"   | j jr| | jj| j d S d S r   )r   r  r  r  r  r  r%  r+   r+   r;   r-  r  s   zLongT5EncoderModel._tie_weightsc                 C   r#  r   r/  r%  r+   r+   r;   r0  v  r'  zLongT5EncoderModel.get_encoderc                 C   r1  r2  r3  r6  r+   r+   r;   r8  y  r9  zLongT5EncoderModel._prune_headsNr  r]   r  r  r*  r  r  r&   c           	   	   C   s0   |dur|n| j j}| j|||||||d}|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr=  )r   r  r  )	r   r  r]   r  r  r*  r  r  r;  r+   r+   r;   r     s   #
zLongT5EncoderModel.forward)NNNNNNN)r   r   r   r  rI  r   r   r&  r  r-  r0  r8  r   r   r4   rJ  rK  r"  r   rI   r   r   r   r+   r+   r   r;   r  Z  sD    	r  )r  r  r  r  )r   )\ry  r  r   rD  typingr   r   r   r4   r   torch.nnr   activationsr   cache_utilsr	   r
   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   r   utils.deprecationr   configuration_longt5r   !torch.nn.attention.flex_attentionr    integrations.flex_attentionr!   
get_loggerr   r   rr   r|   r<   rC   rO   rV   r\   r>   rc   rI   r   r   r   Moduler   apex.normalizationr   infoImportError	Exceptionr  r   r   r   r   r<  rY  rh  ro  rz  r|  r  r  r  __HEAD_MASK_WARNING_MSGr  r  r  __all__r+   r+   r+   r;   <module>   s   $	
$$	 	
1	

 j C  	%"'`~  R @ [X