o
    wi                    @   s  d Z ddlZddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ e& rddl,m-Z- ddl.m/Z/ e)0e1Z2d]dej3de4de4de4dej3f
ddZ5dej3de4de4dej3fddZ6d]dej3de4de4de4dej3f
dd Z7de4dej3fd!d"Z8d#ej3de4dej3fd$d%Z9d&ej3de4d'ej:dej3fd(d)Z;d&ej3d*e4de<ej3ej3f fd+d,Z=d&ej3d*e4dej3fd-d.Z>d/ej3d0ej3d1e4dej3fd2d3Z?G d4d5 d5e	j@ZAzdd6lBmCZC eCZAe2Dd7 W n eEy;   Y n eFyI   e2Gd8 Y nw G d9d: d:e	j@ZHG d;d< d<e	j@ZIG d=d> d>e	j@ZJG d?d@ d@e	j@ZKG dAdB dBe	j@ZLG dCdD dDe	j@ZMG dEdF dFe	j@ZNG dGdH dHe	j@ZOG dIdJ dJe	j@ZPG dKdL dLe	j@ZQG dMdN dNeZRe%G dOdP dPeZSG dQdR dReSZTdSZUe%G dTdU dUeSZVe%dVdWG dXdY dYeSeZWe%G dZd[ d[eSZXg d\ZYdS )^zPyTorch LongT5 model.    N)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )LongT5Config)	BlockMask)make_flex_block_causal_maskx	block_lendim	pad_valuereturnc                 C   s   | j |  | }t| j s"t| j }||  |7  < tj|| jdS dg| j }d|f||< t|ddd d}tj	j
| |d|d} | S )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr'   ndimsumr   
functionalr-   )r!   r"   r#   r$   pad_len	new_shaper-   r*   r*   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multiple?   s   

r;   c                 C   s~   | j | | dkrt| ||dd} | j | | }| j d| ||f | j |d d  }d|v r:tj|| j| jdS | |S )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    r   )r$   Nr   r'   device)r0   r;   r3   emptyr'   r=   reshape)r!   r"   r#   
num_blocksoutput_shaper*   r*   r:   _split_into_blocksO   s   (
rB   	block_dimsequence_dimc           	      C   s   | j | }dg| j }d||< t|ddd d}tjj| |d|d} g }tdD ]}td	dg| j }t||| ||< t|}|	| |  q)t
j||d
S )zConcatenate three consecutive blocks for each input block for local attentiont.

    For more information, see: https://huggingface.co/papers/2112.07916.
    r(   )r   r   Nr)   r*   r+   r,   r   r   r#   )r0   r5   r6   r   r7   r-   rangeslicetupleappendr3   cat)	r!   rC   rD   r$   r@   r-   blocks_listiindicesr*   r*   r:   _concatenate_3_blocks^   s   
rN   c                 C   s:   t jd|  t jd}|| |   }|d|d }|S )z:Makes 3-blocked relative position ids for local attention.r   r&   r   r   )r3   arangeint32	unsqueeze)r"   position_idscenter_position_idsrelative_position_idsr*   r*   r:   "_make_3block_relative_position_idsw   s   rU   local_attention_maskc                 C   sF   t |}t||k }|ddddddf }|| j}t| |S )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)rU   r3   abstor=   logical_and)rV   r"   rT   locality_maskr*   r*   r:   _mask_local_attention_mask   s
   r[   attention_maskr=   c                 C   sV   t | |dd}t|ddd}|d}|d}t||}t||}|d|S )z;Prepare attention mask to be applied for a local attention.r   rE      rC   rD   r)   )rB   rN   rQ   r3   rY   r[   rX   )r\   r"   r=   _blocked_attention_mask_3blocked_attention_maskrV   r*   r*   r:   _get_local_attention_mask   s   


rb   global_block_sizec                    s^  | j dd \}dtjdtjf fdd}tj| | jd  }tj|dd	| }t| d
kdd| j}t	|| d | j}tj
d|j|jd}t||k||}||  | d  }||}  }|dkr|tj|ddj|ddd}	ntj|d|j|jd}	tjt||ddd }
|
| j}
t|
|	kdd}
|tj|
tjfS )a  Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformr implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    Nr]   	block_idsr%   c                    sd   t    d k}|| j}t || dk}|dd| jd }t 	| |k | |} | S )Nr   r   r)   )
r3   rO   rX   r=   rY   r6   rQ   typer'   where)rd   
block_endstrue_block_endsfull_blocksrc   seq_lenr*   r:   handle_orphan_tokens   s   z:_make_global_fixed_block_ids.<locals>.handle_orphan_tokensr=   r   )axis              ?g     @r)   r<   r   rE   )r0   r3   Tensor	ones_liker=   cumsumrf   re   r'   floortensormaxvaluesrepeat	transposer4   onesrX   int)r\   rc   
batch_sizerl   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsr*   rj   r:   _make_global_fixed_block_ids   s,   
"r   c                 C   s@   t | |\}}|jd }tj||jd}||d  }|tjS )zBCreate the relative position tensor for local -> global attention.r)   rm   .N)r   r0   r3   rO   r=   re   int64)r\   rc   rd   r   global_seq_lenglobal_positionsside_relative_positionr*   r*   r:    _make_side_relative_position_ids   s
   
r   hidden_statesrd   r   c                 C   sf   | |dktj||j|jd}tj|tj	|d ddddddf }t
d| || jS )zFCompute individual block aggregates by summing over individual blocks.r   r<   r   Nr)   z...nd,...ng->...gd)rf   r3   ru   r'   r=   r   r7   one_hotre   r   einsum)r   rd   r   one_hot_block_idsr*   r*   r:   _create_global_aggregates   s
   0r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )LongT5LayerNormư>c                    s&   t    tt|| _|| _dS )zg
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parameterr3   rz   weightvariance_epsilon)selfhidden_sizeeps	__class__r*   r:   r      s   

zLongT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )Nr]   r)   T)keepdim)rX   r3   float32powmeanrsqrtr   r   r'   float16bfloat16)r   r   variancer*   r*   r:   forward   s
   
zLongT5LayerNorm.forward)r   )__name__
__module____qualname__r   r   __classcell__r*   r*   r   r:   r      s    r   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                       *   e Zd Zdef fddZdd Z  ZS )LongT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr   r   r   r*   r:   r   	  s
   
zLongT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)r   r   r   
isinstancer   r   r3   rq   r'   int8rX   )r   r   r*   r*   r:   r     s   



zLongT5DenseActDense.forwardr   r   r   r   r   r   r   r*   r*   r   r:   r     s    r   c                       r   )LongT5DenseGatedActDenser   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r   r   r   r   r   r*   r:   r     s   
z!LongT5DenseGatedActDense.__init__c                 C   s:   |  | |}| |}|| }| |}| |}|S r   )r   r   r   r   r   )r   r   hidden_geluhidden_linearr*   r*   r:   r   '  s   


z LongT5DenseGatedActDense.forwardr   r*   r*   r   r:   r     s    r   c                       r   )LongT5LayerFFr   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   r   r*   r:   r   2  s   

zLongT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S r   )r   r   r   )r   r   forwarded_statesr*   r*   r:   r   <  s   

zLongT5LayerFF.forwardr   r*   r*   r   r:   r   1  s    
r   c                       sl   e Zd Z		ddedee f fddZdd ZedddZ	dddZ
									dddZ  ZS )LongT5AttentionFNr   	layer_idxc                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _t | _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr   r   r   r   r   r*   r:   r   E  s.   

zLongT5Attention.__init__c                 C      t |dkrd S t|| j| j| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt | | _| j| j | _
| j|| _d S Nr   r   rE   lenr   r   r   r   r   r   r   r   r   r   unionr   headsindexr*   r*   r:   prune_headsh     zLongT5Attention.prune_headsT       c                 C      d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r]   r   rX   r3   longrW   min
zeros_likelogfloatmath	full_likerf   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larger*   r*   r:   _relative_position_bucketx  s*   z)LongT5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|| j | j	| j
d}|  |}	|	g dd}	|	S )%Compute binned relative position biasNr<   r   r   r   r]   r   r   r   )r   r   r=   r3   rO   r   rX   r  r   r   r   permuterQ   )
r   query_length
key_lengthr=   cache_positioncontext_positionmemory_positionr   relative_position_bucketrw   r*   r*   r:   compute_bias  s    
 
zLongT5Attention.compute_biasc                 C   s  |j dd \}}|du}| |}||d| j| jdd}|dur4|j| j}|r1|j	}n|j
}|r8|n|}|rO|durO|rO|j| j }|j| j }nE| |}| |}||d| j| jdd}||d| j| jdd}|dur|s}|
nd}
|||| jd|
i\}}|rd|j| j< t||dd}|du r|j d }|dur|n|
d d }| jstjd| j||f|j|jd	}| jr| jrd|_n| j|||j|
d
}|dddd| dddf }|dur|ddddddd|j d f }|| }| jr%t|j d }d|t| j< |dd| f }n|}||7 }tjj |! dd"|}tjj#|| j#| jd}|durL|| }t||}|dd$ }||d| j%}| &|}|||f}|	rt||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr]   r)   r   r  Tr   r_   r=   r'   )r=   r  r   rE   ptraining)'r0   r   viewr   r   ry   
is_updatedgetr   cross_attention_cacheself_attention_cache	key_cachevalue_cacher   r   updater3   matmulr   r4   r=   r'   r   r  requires_gradr  r   rz   r2   boolr   r7   softmaxr   type_asr   
contiguousr   r   )r   r   r~   key_value_statesposition_biaspast_key_valuelayer_head_maskr  	use_cacheoutput_attentionsr  r|   
seq_lengthis_cross_attentionquery_statesr  curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr  real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputsr*   r*   r:   r     sx   





"
&



zLongT5Attention.forwardFNTr   r   )NN)	NNNNNNFFN)r   r   r   r   r   r{   r   r   staticmethodr  r  r   r   r*   r*   r   r:   r   D  s,    #
/r   c                       sb   e Zd Zddededdf fddZdd	 ZedddZde	fddZ
				dddZ  ZS )LongT5LocalAttentionFr   r   r%   Nc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrmt| j| j
| _t | _d| _d S )Nr   Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr"   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   r:   r   )  s(   

zLongT5LocalAttention.__init__c                 C   r   r   r   r   r*   r*   r:   r   C  r   z LongT5LocalAttention.prune_headsTr   r   c                 C   r   r   r   r   r*   r*   r:   r  S  *   z.LongT5LocalAttention._relative_position_bucketblock_lengthc                 C      | j jjjdkr| j jjnd}tjd| tj|d}|||  }|dddf |dddf  }| j|| j | j	| j
d}|  |}|g ddd}|S r  metaNr   r<   r  r  r   r   r   r=   re   r3   rO   r   r  r   r   r   r  rQ   r   r:  target_devicer
  r	  r   r  rw   r*   r*   r:   r        
 
z!LongT5LocalAttention.compute_biasc                    s  |j d d \ } fdd} fdd}||}	||}
||}t|	jdd}	t|
jdd}
t|jdd}t|
ddd}
t|ddd}td	|	|
}|d u rj	s~tj
ddjjd
j f|j|jd}jr}jr}d|_nj}|d urt|dkdd}||dd }||7 }tjj| dd|}tjj|jjd}|d ur|| }||j}|td||}|d d d |d d f }|}d }|f|f |f }|r||f }|S )Nr]   c                       |   djjS 
projectionr)   r  r   r   statesr|   r   r*   r:   r0        z+LongT5LocalAttention.forward.<locals>.shapec                       |    djS r?   r)   r  r  r   rF  rH  r*   r:   unshape  rI  z-LongT5LocalAttention.forward.<locals>.unshaper   rE   r^   ...qhd,...khd->...hqkr   r  Tr   ro       _r)   r  ...hqk,...khd->...qhd)r0   r   r   r   rB   r"   rN   r3   r   r   r4   r   r=   r'   r   r  r  r  rf   ry   r   r7   r  r   r  r   re   r   )r   r   r~   r   r"  r$  r%  r0   rM  r'  r*  r+  r,  r0  r1  present_key_value_stater2  r*   rH  r:   r     sP   

zLongT5LocalAttention.forwardFr4  NNNF)r   r   r   r   r  r   r   r5  r  r{   r  r   r   r*   r*   r   r:   r6  (  s    /r6  c                       s~   e Zd Zddededdf fddZdd	 ZedddZde	fddZ
dejdejdejfddZ				dddZ  ZS )LongT5TransientGlobalAttentionFr   r   r%   Nc                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrqt| j| j
| _t | _| jrt| j| j
| _t|j|jd| _d S )Nr   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   r7  r"   rc   r   r   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasr   r   global_input_layer_normr8  r   r*   r:   r     s.   
z'LongT5TransientGlobalAttention.__init__c                 C   r   r   r   r   r*   r*   r:   r     r   z*LongT5TransientGlobalAttention.prune_headsTr   r   c                 C   r   r   r   r   r*   r*   r:   r    r9  z8LongT5TransientGlobalAttention._relative_position_bucketr:  c                 C   r;  r<  r>  r?  r*   r*   r:   r  I  rA  z+LongT5TransientGlobalAttention.compute_biasr~   r   c                 C   s   t |d |d d d d d f d d d df }t |dkdd}t|| j}| j|| j | j| jd}| 	|}|
g d}|| }|S )Nr   .r   ro   rO  r  )r   r   r   r]   )r3   eqrf   r   rc   r  r   r   r   rU  r  )r   r~   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biasr*   r*   r:   compute_side_biasa  s   0
z0LongT5TransientGlobalAttention.compute_side_biasc                    s@  |j d d \ } fdd} fdd}t|d ur|n	t|j d d j\}	}
|
j d }t||	|}|}||}||}|	|}||}|	|}t
|jdd}t
|jdd}t
|jdd}t|ddd	}t|ddd	}dg|jd  }|j d |d< |d|}|d|}tj||gdd}tj||gdd}td
||}|d urt|j|j}t|dkdd}nd }|d u rEjstjddjjdj f|j|jd}jrjrd|_nj}|d ur||dd }||j}|d u r t |}||
}t
|jdddd}||j |j}tj||gdd}||7 }t!j"j#|$ dd%|}t!j"j&|j&jd}|d urj|| }||j}|td||}|d d d |d d f }'|}d }|f|f |f }|r||f }|S )Nr]   c                    rB  rC  rE  rF  rH  r*   r:   r0     rI  z5LongT5TransientGlobalAttention.forward.<locals>.shapec                    rJ  rK  rL  rF  rH  r*   r:   rM    rI  z7LongT5TransientGlobalAttention.forward.<locals>.unshaper)   r   rE   r^   rN  r   ro   rO  r   r  Tr_   r  rP  )(r0   r   r3   rz   rc   r   rV  r   r   r   rB   r"   rN   r5   rQ   rx   rJ   r   rb   r=   rf   r   r4   r   r'   r   r  r  r  ry   re   r\  rX   r   r7   r  r   r  r   r   )r   r   r~   r   r"  r$  r%  r0   rM  rd   r   _global_seq_lenglobal_inputsr'  r*  r+  side_key_statesside_value_statesrepsr,  rV   side_position_biasr0  r1  rQ  r2  r*   rH  r:   r   v  s   







z&LongT5TransientGlobalAttention.forwardrR  r4  rS  )r   r   r   r   r  r   r   r5  r  r{   r  r3   rq   r\  r   r   r*   r*   r   r:   rT    s    /rT  c                       s@   e Zd Zddee f fddZ							d	ddZ  ZS )
LongT5LayerSelfAttentionFNr   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r   r   r*   r:   r     s   
z!LongT5LayerSelfAttention.__init__c	              
   C   sL   |  |}	| j|	|||||||d}
|| |
d  }|f|
dd   }|S )N)r~   r   r"  r!  r#  r$  r  r   r   )r   re  r   )r   r   r\   r   r"  r!  r#  r$  r  normed_hidden_statesattention_outputr2  r*   r*   r:   r     s   

z LongT5LayerSelfAttention.forwardr3  )NNNNFFNr   r   r   r   r{   r   r   r   r*   r*   r   r:   rc    s    rc  c                       D   e Zd ZdZd
dee f fddZ				ddefdd	Z  Z	S )LongT5LayerLocalSelfAttentionz$Local self attention used in encoderFNr   c                    <   t    t||d| _t|j|jd| _t	|j
| _d S N)r   r   )r   r   r6  LocalSelfAttentionr   r   r   r   r   r   r   r   r   r   r*   r:   r     s   
z&LongT5LayerLocalSelfAttention.__init__kwargsc           
      K   F   |  |}| j|||||d}|| |d  }|f|dd   }	|	S N)r~   r   r"  r$  r   r   )r   rm  r   
r   r   r\   r   r"  r$  rn  rf  rg  r2  r*   r*   r:   r        
	z%LongT5LayerLocalSelfAttention.forwardr3  rS  
r   r   r   __doc__r   r{   r   r   r   r   r*   r*   r   r:   rj    s    	rj  c                       ri  )'LongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderFNr   c                    rk  rl  )r   r   rT  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r   r   r*   r:   r   6  s   
z0LongT5LayerTransientGlobalSelfAttention.__init__rn  c           
      K   ro  rp  )r   rv  r   rq  r*   r*   r:   r   >  rr  z/LongT5LayerTransientGlobalSelfAttention.forwardr3  rS  rs  r*   r*   r   r:   ru  3  s    ru  c                       sB   e Zd Zddee f fddZ								d	ddZ  ZS )
LongT5LayerCrossAttentionNr   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFrd  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   r*   r:   r   V  s   
z"LongT5LayerCrossAttention.__init__Fc                 C   sP   |  |}| j|||||||||	|
d
}|| |d  }|f|dd   }|S )N)	r~   r  r   r"  r!  r#  r  r$  r  r   r   )r   rx  r   )r   r   r  r\   r   r"  r!  r#  r  r$  r  rf  rg  layer_outputr2  r*   r*   r:   r   \  s    
z!LongT5LayerCrossAttention.forwardr   )NNNNFNFNrh  r*   r*   r   r:   rw  U  s    
rw  c                       sJ   e Zd Zd	dee f fddZ												d
ddZ  ZS )LongT5BlockFNr   c                    s   t    |j| _|jrt}n|jdkrt}n|jdkrt}n	td|j dt	 | _
| j
||||d | jrE| j
t||d | j
t| d S )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .rd  )r   )r   r   r   rc  encoder_attention_typerj  ru  
ValueErrorr   
ModuleListlayerrI   rw  r   )r   r   r   r   attention_layerr   r*   r:   r   |  s(   



zLongT5Block.__init__Tc                 C   s  | j d |||||	|
||d}|d d \}}	|dd  }|jtjkr<t| r<t|jjd }tj|| |d}| j	oB|d u}|r| j d ||||||	|d d |
||d
}|d d \}}	|jtjkrt| rt|jjd }tj|| |d}||dd   }| j d |}|jtjkrt| rt|jjd }tj|| |d}|f}|
r||	f | }|S || }|S )	Nr   )r\   r   r"  r!  r#  r$  r  r]   i  )r   rv   r   r)   )	r  r\   r   r"  r!  r  r#  r$  r  )
r  r'   r3   r   isinfanyfinforv   clampr   )r   r   r\   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr"  cross_attn_layer_head_maskr!  r#  r$  return_dictr  self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr2  r*   r*   r:   r     sX   

zLongT5Block.forwardr3  )NNNNNNNNFFTNrh  r*   r*   r   r:   rz  {  s    rz  c                   @   sB   e Zd ZeZdZdZdgZdZdZ	e
dd Zdd Zd	d
 ZdS )LongT5PreTrainedModeltransformerTrz  Fc                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r3   ru   r   r   )r   r  
input_maskdummy_inputsr*   r*   r:   r    s   

z"LongT5PreTrainedModel.dummy_inputsc                 C   s  | j j}t|tr|jj|d  dS t|ttt	frC|j
jjjd|d d t|dr?| j jsA|jjjjd|d d dS dS dS t|tr|jjjjd|| j jd  d t|jdrk|jjdurk|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  dS dS dS t|tr|jjjjd|| j jd  d t|jdr|jjdur|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  |jjjjd|| j jd  d t|jdr|jjdur	|jjj  dS dS dS t|tttfr| j j}| j j}| j j}|jjjjd||| d  d |jjjjd||d  d |j jjjd||d  d |j!jjjd||| d  d |j"r|j#jjjd||d  d t|tr|j$jjjd||d  d dS dS dS dS )zInitialize the weightsrp   ro   )r   stdlm_head      r   N)%r   initializer_factorr   r   r   datafill_LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelsharednormal_hasattrtie_word_embeddingsr  r   r   r   r   zero_r   r   r   r   r   r   r6  rT  r   r   r   r   r   r   r   r   rU  )r   modulefactorr   r   r   r*   r*   r:   _init_weights  sX   

       


z#LongT5PreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u rtdt|r1t|jd d d |}tj||dd df gdd}n|	|j}|dd df 
 |ddd f< ||d< |d u rStd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information.r)   )r   .rE   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr~  r   r3   fullr0   rJ   	new_zerosclonemasked_fill_)r   r  r  r  shifted_input_idsr*   r*   r:   _shift_right%  s      z"LongT5PreTrainedModel._shift_rightN)r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_cache_class_supports_static_cachepropertyr  r  r  r*   r*   r*   r:   r    s    

1r  c                       s   e Zd Zd fdd	Zdd Zdd Z													ddd	Z	
ddeej	df dej	dej	de
def
ddZedej	dededejdej	defddZ  ZS )LongT5StackNc                    s   t    t j j| _|d ur|j| j_ j| _ j	| _	| j	d | _
t fddt jD | _t j jd| _t j| _d| _|   d S )Nr   c                    s"   g | ]}t  t|d k|dqS )r   rd  )rz  r  ).0rL   r   r*   r:   
<listcomp>N  s    z(LongT5Stack.__init__.<locals>.<listcomp>r   F)r   r   r   r   
vocab_sizer   embed_tokensr   r   r7  r"   r  rF   
num_layersblockr   r   final_layer_normr   r   r   r   	post_init)r   r   r  r   r  r:   r   B  s    

zLongT5Stack.__init__c                 C      | j S r   r  r   r*   r*   r:   get_input_embeddings\     z LongT5Stack.get_input_embeddingsc                 C   
   || _ d S r   r  r   new_embeddingsr*   r*   r:   set_input_embeddings`     
z LongT5Stack.set_input_embeddingsc           )      C   s|  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d ur$|n| j j}|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|	rtd
 d}	|d u r| jd usJ d| |}|\}}d}d}| jr|	s|d urt|trt|tsd}t|t }n#t|tsd}td t|}n|d u rtt t }n| jsd }|d ur| nd}|d u rtj||| |jd}|d u r	t s	|| }tj|||jd}| jr| ||||d ur|jnd |
}n| j jdkr/t|| j|j}n|}| jrX|d urX| \}}}||f}|d u rRtj||jd}| |}nd }| || j j }| || j j }|rodnd }|
rvdnd }|
r| jrdnd }d }d } | !|}!t"| j#D ]l\}"}#||" }$||" }%|r||!f }|#|!||||| |$|%||	|
||d}&|	du r|&d d d |&dd   }&|&d d \}!}'|&d }| jr|d ur|&|
rdnd } |
r||&d f }| jr||&d f }q| $|!}!| !|!}!|r||!f }|	r|'nd }(|r|j}(|r$|% }(|s5t&dd |!|(|||fD S t'|!|(|||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer)   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   rm   r{  r*   )r"  r  r!  r#  r$  r  r  r   r   r]      r      c                 s   s    | ]	}|d ur|V  qd S r   r*   )r  r   r*   r*   r:   	<genexpr>  s    z&LongT5Stack.forward.<locals>.<genexpr>)last_hidden_statepast_key_valuesr   
attentionscross_attentions)(r   r#  r$  output_hidden_statesuse_return_dictr   r~  sizer  r   r  r   r   r  r   r	   r   r
   from_legacy_cacheget_seq_lengthr3   rO   r=   r   rz   _update_causal_maskr  r}  rb   r"   invert_attention_maskget_head_maskr  r   	enumerater  r  to_legacy_cacherH   r   ))r   r  r\   r  r  r  	head_maskcross_attn_head_maskr  r#  r$  r  r  r  err_msg_prefixinput_shaper|   r%  return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthr.  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr   r  r   rL   layer_moduler"  r  layer_outputsnext_decoder_cache
next_cacher*   r*   r:   r   c  s  








zLongT5Stack.forwardFr\   r   input_tensorr  r  r$  c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2ro   flex_attentionr   Fsdpa)r  r  is_trainingr   r)   )sequence_lengthtarget_lengthr'   r  r|   )cudaxpunpu)r   _attn_implementationr  r   r3   rq   r    r  is_compileabler   _ignore_causal_mask_sdpar  r'   r0   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr=   re   r  r   _unmask_unattended)r   r\   r  r  r  r$  past_seen_tokensusing_compilable_cacher'   r  r  r.  	min_dtyper*   r*   r:   r  &  sT   




zLongT5Stack._update_causal_maskr  r  r'   r|   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuer'   r=   r   )diagonalrm   r)   r   )r#   r3   r  r   r  r=   triurO   r?   expandr  r0   rX   masked_fill)r\   r  r  r'   r  r|   rn  r.  r  mask_lengthpadding_maskr*   r*   r:   r  j  s,    $
6  zALongT5Stack._prepare_4d_causal_attention_mask_with_cache_positionr   )NNNNNNNNNNNNNrR  )r   r   r   r   r  r  r   r   r3   rq   r	   r  r  r5  r{   r'   r  r   r*   r*   r   r:   r  A  sZ    
 J
Dr  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
c                &       sJ  e Zd ZdgZddgZdef fddZdd Zd	d
 Zdd Z	dd Z
dd Zdd Ze																d'deej deej deej deej deej deej deej deeeej   deeeej   deej deej dee d ee d!ee d"ee d#eej d$eeej ef f"d%d&Z  ZS )(r  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    s   t  | t|j|j| _t|}d|_	d|_
d|_t|| j| _t|}d|_	d|_|j|_t|| j| _|   d S )NFT)r   r   r   r   r  r   r  copydeepcopyr   r#  is_encoder_decoderr  encodernum_decoder_layersr  decoderr  r   r   encoder_configdecoder_configr   r*   r:   r     s   

zLongT5Model.__init__c                 C   r  r   r  r  r*   r*   r:   r    r  z LongT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S r   r  r  r  r  r  r*   r*   r:   r       z LongT5Model.set_input_embeddingsc                 C   4   | j jr| | jj| j | | jj| j d S d S r   r   r  _tie_or_clone_weightsr  r  r  r  r  r*   r*   r:   _tie_weights     zLongT5Model._tie_weightsc                 C   r  r   r  r  r*   r*   r:   get_encoder  r  zLongT5Model.get_encoderc                 C   r  r   r  r  r*   r*   r:   get_decoder  r  zLongT5Model.get_decoderc                 C   *   |  D ]\}}| jj| j| qdS z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        Nitemsr  r  	attentionr   r   heads_to_pruner  r   r*   r*   r:   _prune_heads     zLongT5Model._prune_headsNr  r\   r  r  r  decoder_head_maskr  encoder_outputsr  r  decoder_inputs_embedsr#  r$  r  r  r  r%   c                 C   s"  |dur|n| j j}|dur|n| j j}|dur,|du r,| j j| j jkr,ttt |}|du r=| j	|||
||||d}n$|rat
|tsat|d t|dkrR|d ndt|dkr]|d ndd}|d }| j||||	|||||||||d}|s}|| S t|j|j|j|j|j|j|j|jdS )	ax  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  r\   r  r  r$  r  r  r   r   r]   r  r   r  r  r\   r  r  r  r  r  r  r#  r$  r  r  r  )r  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)r   r#  r  r  r  warningswarn#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningr  r   r   r   r  r   r  r  r   r  r  )r   r  r\   r  r  r  r0  r  r1  r  r  r2  r#  r$  r  r  r  r   decoder_outputsr*   r*   r:   r     sd   Q	zLongT5Model.forward)NNNNNNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r   r  r  r!  r$  r&  r.  r   r   r3   
LongTensorFloatTensor
BoolTensorrq   rH   r  r   r   r   r   r*   r*   r   r:   r    s~    	
r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc                (       sv  e Zd ZdgZg dZdef fddZdd Zdd	 Zd
d Z	dd Z
dd Zdd Zdd Ze																	d-deej deej deej deej deej deej deej deeeej   deeeej   deej deej d eej d!ee d"ee d#ee d$ee d%eej d&eeej ef f$d'd(Zd ejfd)d*Zd+d, Z  ZS ).r  r  )r  r  zlm_head.weightr   c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _tj|j|jdd| _|   d S )NFTr   )r   r   r   	model_dimr   r   r  r  r  r  r   r#  r  r  r  r  r  r  r   r  r  r  r   r*   r:   r   z  s   

z'LongT5ForConditionalGeneration.__init__c                 C   r  r   r  r  r*   r*   r:   r    r  z3LongT5ForConditionalGeneration.get_input_embeddingsc                 C   r  r   r  r  r*   r*   r:   r    r  z3LongT5ForConditionalGeneration.set_input_embeddingsc                 C   r  r   r  r  r*   r*   r:   r!    r"  z+LongT5ForConditionalGeneration._tie_weightsc                 C   r  r   r  r  r*   r*   r:   set_output_embeddings  r  z4LongT5ForConditionalGeneration.set_output_embeddingsc                 C   r  r   rF  r  r*   r*   r:   get_output_embeddings  r  z4LongT5ForConditionalGeneration.get_output_embeddingsc                 C   r  r   r#  r  r*   r*   r:   r$    r  z*LongT5ForConditionalGeneration.get_encoderc                 C   r  r   r%  r  r*   r*   r:   r&    r  z*LongT5ForConditionalGeneration.get_decoderNr  r\   r  r  r  r0  r  r1  r  r  r2  labelsr#  r$  r  r  r  r%   c                 C   s  |dur|n| j j}|dur|n| j j}|dur,|du r,| j j| j jkr,ttt |}|du r=| j	|||
||||d}n$|rat
|tsat|d t|dkrR|d ndt|dkr]|d ndd}|d }|durv|du rv|du rv| |}| j||||	|||||||||d}|d }| j jr|| jd  }| |}d}|durtd	d
}||j}||d|d|d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nr3  r   r   r]   r4  r5  r  r  )ignore_indexr)   )	losslogitsr  r6  r7  r  r8  r  r9  )r   r#  r  r  r  r:  r;  6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGr=  r  r   r   r   r  r  r  rE  r  r   rX   r=   r  r  r   r  r   r  r  r  )r   r  r\   r  r  r  r0  r  r1  r  r  r2  rI  r#  r$  r  r  r  r   r>  sequence_output	lm_logitsrK  loss_fctoutputr*   r*   r:   r     s~   U	


z&LongT5ForConditionalGeneration.forwardc                 C   s
   |  |S r   )r  )r   rI  r*   r*   r:   %prepare_decoder_input_ids_from_labelsS  r  zDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsc              	   C   s   |d u rt d |S d}|D ]1}d}|D ]}||d||jf }q|d j|d jks1J t|t|ks;J ||f }q|S )NzHYou might want to consider setting `use_cache=True` to speed up decodingr*   r   )r   warningindex_selectrX   r=   r0   r   )r   r  beam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_stater*   r*   r:   _reorder_cacheV  s   
z-LongT5ForConditionalGeneration._reorder_cache)NNNNNNNNNNNNNNNNN)r   r   r   r?  r@  r   r   r  r  r!  rG  rH  r$  r&  r   r   r3   rA  rB  rC  rq   rH   r  r   r   r   rR  rZ  r   r*   r*   r   r:   r  o  s    	
 )r  c                       s   e Zd ZdgZdgZdef fddZdd Zdd	 Zd
d Z	dd Z
dd Ze							ddeej deej deej deej dee dee dee deeej ef fddZ  ZS )r  r  r  r   c                    sN   t  | t|j|j| _t|}d|_	d|_
t|| j| _|   d S )NF)r   r   r   r   r  r   r  r  r  r#  r  r  r  r  )r   r   r  r   r*   r:   r   t  s   
zLongT5EncoderModel.__init__c                 C   r  r   r  r  r*   r*   r:   r    r  z'LongT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S r   )r  r  r  r  r*   r*   r:   r    s   z'LongT5EncoderModel.set_input_embeddingsc                 C   s"   | j jr| | jj| j d S d S r   )r   r  r   r  r  r  r  r*   r*   r:   r!    s   zLongT5EncoderModel._tie_weightsc                 C   r  r   r#  r  r*   r*   r:   r$    r  zLongT5EncoderModel.get_encoderc                 C   r'  r(  r)  r,  r*   r*   r:   r.    r/  zLongT5EncoderModel._prune_headsNr  r\   r  r  r$  r  r  r%   c           	   	   C   s0   |dur|n| j j}| j|||||||d}|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr3  )r   r  r  )	r   r  r\   r  r  r$  r  r  r1  r*   r*   r:   r     s   #
zLongT5EncoderModel.forward)NNNNNNN)r   r   r   r@  r?  r   r   r  r  r!  r$  r.  r   r   r3   rA  rB  r  r   rH   r   r   r   r*   r*   r   r:   r  o  sD    	r  )r  r  r  r  )r   )Zrt  r  r   r:  typingr   r   r   r3   r   torch.nnr   activationsr   cache_utilsr	   r
   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   r   configuration_longt5r   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr    
get_loggerr   r   rq   r{   r;   rB   rN   rU   r[   r=   rb   rH   r   r   r   Moduler   apex.normalizationr   infoImportError	ExceptionrS  r   r   r   r   r6  rT  rc  rj  ru  rw  rz  r  r  __HEAD_MASK_WARNING_MSGr  r  r  __all__r*   r*   r*   r:   <module>   s   $	
$$	 	
1	

 e A  
$"&da  e C |X