o
    ei+P                    @   s  d Z ddlZddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& e$'e(Z)dZdej*de+de+de+dej*f
ddZ,dej*de+de+dej*fddZ-dZdej*de+de+de+dej*f
ddZ.de+dej*fdd Z/d!ej*de+dej*fd"d#Z0d$ej*de+d%ej1dej*fd&d'Z2d$ej*d(e+de3ej*ej*f fd)d*Z4d$ej*d(e+dej*fd+d,Z5d-ej*d.ej*d/e+dej*fd0d1Z6G d2d3 d3ej7Z8zdd4l9m:Z: e:Z8e);d5 W n e<y   Y n e=y,   e)>d6 Y nw G d7d8 d8ej7Z?G d9d: d:ej7Z@G d;d< d<ej7ZAG d=d> d>ej7ZBG d?d@ d@ej7ZCG dAdB dBej7ZDG dCdD dDej7ZEG dEdF dFej7ZFG dGdH dHej7ZGG dIdJ dJej7ZHG dKdL dLeZIe"G dMdN dNeZJG dOdP dPeJZKe"G dQdR dReJZLe"dSdTG dUdV dVeJeZMe"G dWdX dXeJZNg dYZOdS )[zPyTorch LongT5 model.    N)Any)nn)CrossEntropyLoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torchdynamo_compilinglogging   )LongT5Configx	block_lendim	pad_valuereturnc                 C   s   | j |  | }t| j s"t| j }||  |7  < tj|| jdS dg| j }d|f||< t|ddd d}tj	j
| |d|d} | S )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr    ndimsumr   
functionalr&   )r   r   r   r   pad_len	new_shaper&   r#   r#   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multiple5   s   

r4   c                 C   s~   | j | | dkrt| ||dd} | j | | }| j d| ||f | j |d d  }d|v r:tj|| j| jdS | |S )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    r   )r   Nr   r    device)r)   r4   r,   emptyr    r6   reshape)r   r   r   
num_blocksoutput_shaper#   r#   r3   _split_into_blocksE   s   (
r;   	block_dimsequence_dimc           	      C   s   | j | }dg| j }d||< t|ddd d}tjj| |d|d} g }tdD ]}td	dg| j }t||| ||< t|}|	| |  q)t
j||d
S )zConcatenate three consecutive blocks for each input block for local attentiont.

    For more information, see: https://huggingface.co/papers/2112.07916.
    r!   )r   r   Nr"   r#   r$   r%   r   r   r   )r)   r.   r/   r   r0   r&   rangeslicetupleappendr,   cat)	r   r<   r=   r   r9   r&   blocks_listiindicesr#   r#   r3   _concatenate_3_blocksT   s   
rG   c                 C   s:   t jd|  t jd}|| |   }|d|d }|S )z:Makes 3-blocked relative position ids for local attention.r   r   r   r   )r,   arangeint32	unsqueeze)r   position_idscenter_position_idsrelative_position_idsr#   r#   r3   "_make_3block_relative_position_idsm   s   rN   local_attention_maskc                 C   sF   t |}t||k }|ddddddf }|| j}t| |S )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)rN   r,   abstor6   logical_and)rO   r   rM   locality_maskr#   r#   r3   _mask_local_attention_maskv   s
   rT   attention_maskr6   c                 C   sV   t | |dd}t|ddd}|d}|d}t||}t||}|d|S )z;Prepare attention mask to be applied for a local attention.r   r>      r<   r=   r"   )r;   rG   rJ   r,   rR   rT   rQ   )rU   r   r6   _blocked_attention_mask_3blocked_attention_maskrO   r#   r#   r3   _get_local_attention_mask   s   


r[   global_block_sizec                    s^  | j dd \}dtjdtjf fdd}tj| | jd  }tj|dd	| }t| d
kdd| j}t	|| d | j}tj
d|j|jd}t||k||}||  | d  }||}  }|dkr|tj|ddj|ddd}	ntj|d|j|jd}	tjt||ddd }
|
| j}
t|
|	kdd}
|tj|
tjfS )a  Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformr implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    NrV   	block_idsr   c                    sd   t    d k}|| j}t || dk}|dd| jd }t 	| |k | |} | S )Nr   r   r"   )
r,   rH   rQ   r6   rR   r/   rJ   typer    where)r]   
block_endstrue_block_endsfull_blocksr\   seq_lenr#   r3   handle_orphan_tokens   s   z:_make_global_fixed_block_ids.<locals>.handle_orphan_tokensr6   r   )axis              ?g     @r"   r5   r   r>   )r)   r,   Tensor	ones_liker6   cumsumr_   r^   r    floortensormaxvaluesrepeat	transposer-   onesrQ   int)rU   r\   
batch_sizere   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsr#   rc   r3   _make_global_fixed_block_ids   s,   
"r}   c                 C   s@   t | |\}}|jd }tj||jd}||d  }|tjS )zBCreate the relative position tensor for local -> global attention.r"   rf   .N)r}   r)   r,   rH   r6   r^   int64)rU   r\   r]   r|   global_seq_lenglobal_positionsside_relative_positionr#   r#   r3    _make_side_relative_position_ids   s
   
r   hidden_statesr]   r   c                 C   sf   | |dktj||j|jd}tj|tj	|d ddddddf }t
d| || jS )zFCompute individual block aggregates by summing over individual blocks.r   r5   r   Nr"   z...nd,...ng->...gd)r_   r,   rn   r    r6   r   r0   one_hotr^   r   einsum)r   r]   r   one_hot_block_idsr#   r#   r3   _create_global_aggregates   s
   0r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )LongT5LayerNormư>c                    s&   t    tt|| _|| _dS )zg
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parameterr,   rs   weightvariance_epsilon)selfhidden_sizeeps	__class__r#   r3   r      s   

zLongT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )NrV   r"   T)keepdim)rQ   r,   float32powmeanrsqrtr   r   r    float16bfloat16)r   r   variancer#   r#   r3   forward   s
   
zLongT5LayerNorm.forward)r   )__name__
__module____qualname__r   r   __classcell__r#   r#   r   r3   r      s    r   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                       *   e Zd Zdef fddZdd Z  ZS )LongT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr   r   r   r#   r3   r      s
   
zLongT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)r   r   r   
isinstancer   r   r,   rj   r    int8rQ   )r   r   r#   r#   r3   r     s   



zLongT5DenseActDense.forwardr   r   r   r   r   r   r   r#   r#   r   r3   r      s    r   c                       r   )LongT5DenseGatedActDenser   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r   r   r   r   r   r#   r3   r     s   
z!LongT5DenseGatedActDense.__init__c                 C   s:   |  | |}| |}|| }| |}| |}|S r   )r   r   r   r   r   )r   r   hidden_geluhidden_linearr#   r#   r3   r     s   


z LongT5DenseGatedActDense.forwardr   r#   r#   r   r3   r     s    r   c                       r   )LongT5LayerFFr   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   r   r#   r3   r   '  s   

zLongT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S r   )r   r   r   )r   r   forwarded_statesr#   r#   r3   r   1  s   

zLongT5LayerFF.forwardr   r#   r#   r   r3   r   &  s    
r   c                       sb   e Zd Z		ddededB f fddZedd
dZdddZ								dddZ	  Z
S )LongT5AttentionFNr   	layer_idxc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr   r   r   r   r   r#   r3   r   :  s,   

zLongT5Attention.__init__T       c                 C      d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   rV   r   rQ   r,   longrP   min
zeros_likelogfloatmath	full_liker_   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larger#   r#   r3   _relative_position_bucket\  s*   z)LongT5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|| j | j	| j
d}|  |}	|	g dd}	|	S )%Compute binned relative position biasNr5   r   r   r   rV   r   r   r   )r   r   r6   r,   rH   r   rQ   r   r   r   r   permuterJ   )
r   query_length
key_lengthr6   cache_positioncontext_positionmemory_positionr   relative_position_bucketrp   r#   r#   r3   compute_bias  s    
 
zLongT5Attention.compute_biasc
                 C   s  |j dd \}
}|du}| |}||
d| j| jdd}d}t|tr8|j	| j
}|r4|j}n|j}n|}|r>|n|}|rW|durW|rW|j| j
 j}|j| j
 j}nJ| |}| |}||
d| j| jdd}||
d| j| jdd}|dur|s|	nd}	|||| j
d|	i\}}|rt|trd|j| j
< t||dd}|du r|j d	 }|dur|n|	d d }| jstjd| j||f|j|jd
}| jr| jrd|_n| j|||j|	d}|dddd| dddf }|dur|ddddddd|j d	 f }|| }|}||7 }tjj|  dd!|}tjj"|| j"| jd}t||}|dd# }||
d| j$}| %|}||f}|rY||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        NrV   r"   r   Fr   Tr   rX   r6   r    )r6   r   r>   ptraining)&r)   r   viewr   r   rr   r   r
   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysrp   r   r   updater,   matmulr   r-   r6   r    r   r   requires_gradr   r   r0   softmaxr   type_asr   
contiguousr   r   )r   r   rw   key_value_statesposition_biaspast_key_valuesr   	use_cacheoutput_attentionsr   ru   
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresr   real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputsr#   r#   r3   r     sp   






"
&

zLongT5Attention.forwardFNTr   r   )NN)NNNNNFFN)r   r   r   r   rt   r   staticmethodr   r   r   r   r#   r#   r   r3   r   9  s(    "
/r   c                       sX   e Zd Zddededdf fddZedddZdefddZ				dddZ
  ZS )LongT5LocalAttentionFr   r   r   Nc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrmt| j| j
| _d| _d S )Nr   Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r3   r     s&   

zLongT5LocalAttention.__init__Tr   r   c                 C   r   r   r   r   r#   r#   r3   r     *   z.LongT5LocalAttention._relative_position_bucketblock_lengthc                 C      | j jjjdkr| j jjnd}tjd| tj|d}|||  }|dddf |dddf  }| j|| j | j	| j
d}|  |}|g ddd}|S r   metaNr   r5   r   r   r   r   r   r6   r^   r,   rH   r   r   r   r   r   r   rJ   r   r%  target_devicer   r   r   r   rp   r#   r#   r3   r   O      
 
z!LongT5LocalAttention.compute_biasc                    s  |j d d \ } fdd} fdd}||}||}	||}
t|jdd}t|	jdd}	t|
jdd}
t|	ddd}	t|
ddd}
td	||	}|d u rj	s~tj
ddjjd
j f|j|jd}jr}jr}d|_nj}|d urt|dkdd}||dd }||7 }tjj| dd|}tjj|jjd}||
j}|td||
}|d d d |d d f }|}||f}|r||f }|S )NrV   c                       |   djjS 
projectionr"   r   r   r   statesru   r   r#   r3   r)   p     z+LongT5LocalAttention.forward.<locals>.shapec                       |    djS r8   r"   r
  r   r   r1  r3  r#   r3   unshapet  r4  z-LongT5LocalAttention.forward.<locals>.unshaper   r>   rW   ...qhd,...khd->...hqkr   r   Tr   rh       _r"   r   ...hqk,...khd->...qhd)r)   r   r   r   r;   r   rG   r,   r   r   r-   r   r6   r    r   r   r  r   r_   rr   r   r0   r  r   r	  r   r^   r   )r   r   rw   r  r  r  r)   r8  r  r  r  r  r  r  r  r#   r3  r3   r   g  sN   

zLongT5LocalAttention.forwardFr  NNF)r   r   r   r   boolr   r   r   rt   r   r   r   r#   r#   r   r3   r!    s    /r!  c                       st   e Zd Zddededdf fddZedddZdefddZ	de
jde
jde
jfddZ			dddZ  ZS )LongT5TransientGlobalAttentionFr   r   r   Nc                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrqt| j| j
| _| jr}t| j| j
| _t|j|jd| _d S )Nr   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   r"  r   r\   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasr   r   global_input_layer_normr#  r   r#   r3   r     s,   
z'LongT5TransientGlobalAttention.__init__Tr   r   c                 C   r   r   r   r   r#   r#   r3   r     r$  z8LongT5TransientGlobalAttention._relative_position_bucketr%  c                 C   r&  r'  r)  r*  r#   r#   r3   r     r,  z+LongT5TransientGlobalAttention.compute_biasrw   r|   c                 C   s   t |d |d d d d d f d d d df }t |dkdd}t|| j}| j|| j | j| jd}| 	|}|
g d}|| }|S )Nr~   .r   rh   r:  r   )r   r   r   rV   )r,   eqr_   r   r\   r   r   r   r   r@  r   )r   rw   r|   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biasr#   r#   r3   compute_side_bias  s   0
z0LongT5TransientGlobalAttention.compute_side_biasc                    s   |j d d \ } fdd} fdd}t|d ur|n	t|j d d j\}}	|	j d }
t|||
}|}||}||}|	|}||}|	|}t
|jdd}t
|jdd}t
|jdd}t|ddd	}t|ddd	}dg|jd  }|j d |d< |d|}|d|}tj||gdd}tj||gdd}td
||}|d urt|j|j}t|dkdd}nd }|d u rEjstjddjjdj f|j|jd}jrjrd|_nj}|d ur||dd }||j}|d u r t |}||	}t
|jdddd}||j |j}tj||gdd}||7 }t!j"j#|$ dd%|}t!j"j&|j&jd}||j}|td||}|d d d |d d f }'|}||f}|r||f }|S )NrV   c                    r-  r.  r0  r1  r3  r#   r3   r)   5  r4  z5LongT5TransientGlobalAttention.forward.<locals>.shapec                    r5  r6  r7  r1  r3  r#   r3   r8  9  r4  z7LongT5TransientGlobalAttention.forward.<locals>.unshaper"   r   r>   rW   r9  r   rh   r:  r   r   TrX   r   r;  )(r)   r}   r,   rs   r\   r   rA  r   r   r   r;   r   rG   r.   rJ   rq   rC   r   r[   r6   r_   r   r-   r   r    r   r   r  r   rr   r^   rG  rQ   r   r0   r  r   r	  r   r   )r   r   rw   r  r  r  r)   r8  r]   r|   _global_seq_lenglobal_inputsr  r  r  side_key_statesside_value_statesrepsr  rO   side_position_biasr  r  r  r#   r3  r3   r   ,  s|   






z&LongT5TransientGlobalAttention.forwardr<  r  r=  )r   r   r   r   r>  r   r   r   rt   r   r,   rj   rG  r   r   r#   r#   r   r3   r?    s    /r?  c                       s>   e Zd ZddedB f fddZ						d	ddZ  ZS )
LongT5LayerSelfAttentionFNr   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r   r   r#   r3   r     s   
z!LongT5LayerSelfAttention.__init__c              	   C   sJ   |  |}| j|||||||d}	|| |	d  }|f|	dd   }
|
S )N)rw   r  r  r  r  r   r   r   )r   rP  r   )r   r   rU   r  r  r  r  r   normed_hidden_statesattention_outputr  r#   r#   r3   r     s   

	z LongT5LayerSelfAttention.forwardr  )NNNFFNr   r   r   rt   r   r   r   r#   r#   r   r3   rN    s    rN  c                       B   e Zd ZdZd
dedB f fddZ			ddefdd	Z  ZS )LongT5LayerLocalSelfAttentionz$Local self attention used in encoderFNr   c                    <   t    t||d| _t|j|jd| _t	|j
| _d S N)r   r   )r   r   r!  LocalSelfAttentionr   r   r   r   r   r   r   r   r   r   r#   r3   r     s   
z&LongT5LayerLocalSelfAttention.__init__kwargsc           	      K   D   |  |}| j||||d}|| |d  }|f|dd   }|S N)rw   r  r  r   r   )r   rX  r   	r   r   rU   r  r  rY  rQ  rR  r  r#   r#   r3   r        
z%LongT5LayerLocalSelfAttention.forwardr  r=  	r   r   r   __doc__rt   r   r   r   r   r#   r#   r   r3   rU    s    	rU  c                       rT  )'LongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderFNr   c                    rV  rW  )r   r   r?  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r   r   r#   r3   r     s   
z0LongT5LayerTransientGlobalSelfAttention.__init__rY  c           	      K   rZ  r[  )r   ra  r   r\  r#   r#   r3   r     r]  z/LongT5LayerTransientGlobalSelfAttention.forwardr  r=  r^  r#   r#   r   r3   r`    s    r`  c                       s@   e Zd ZddedB f fddZ							d	ddZ  ZS )
LongT5LayerCrossAttentionNr   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFrO  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   r#   r3   r     s   
z"LongT5LayerCrossAttention.__init__Fc
                 C   sN   |  |}
| j|
||||||||	d	}|| |d  }|f|dd   }|S )N)rw   r  r  r  r  r   r  r   r   r   )r   rc  r   )r   r   r  rU   r  r  r  r   r  r   rQ  rR  layer_outputr  r#   r#   r3   r     s   
z!LongT5LayerCrossAttention.forwardr   )NNNFNFNrS  r#   r#   r   r3   rb     s    
rb  c                       sF   e Zd Zd	dedB f fddZ										d
ddZ  ZS )LongT5BlockFNr   c                    s   t    |j| _|jrt}n|jdkrt}n|jdkrt}n	td|j dt	 | _
| j
||||d | jrE| j
t||d | j
t| d S )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .rO  )r   )r   r   r   rN  encoder_attention_typerU  r`  
ValueErrorr   
ModuleListlayerrB   rb  r   )r   r   r   r   attention_layerr   r#   r3   r   %  s(   



zLongT5Block.__init__Tc                 C   sT  | j d ||||||	|d}|d }|dd  }|jtjkr7t| r7t|jjd }tj|| |d}| j	o=|d u}|r| j d ||||||d d ||	|d	}|d }|jtjkrwt| rwt|jjd }tj|| |d}||dd   }| j d |}|jtjkrt| rt|jjd }tj|| |d}|f| S )Nr   )rU   r  r  r  r  r   r   i  )r   ro   r"   )r  rU   r  r  r   r  r  r   )
rk  r    r,   r   isinfanyfinforo   clampr   )r   r   rU   r  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr  r  r  return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr#   r#   r3   r   <  sL   	
zLongT5Block.forwardr  )
NNNNNNFFTNrS  r#   r#   r   r3   re  $  s    re  c                   @   sL   e Zd ZU eed< dZdZdgZdZe	dd Z
e dd	 Zd
d ZdS )LongT5PreTrainedModelr   transformerTre  Fc                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r,   rn   r   r   )r   r}  
input_maskdummy_inputsr#   r#   r3   r    s   

z"LongT5PreTrainedModel.dummy_inputsc                 C   s  | j j}t|trt|j|d  dS t|ttt	frCtj
|jjd|d d t|dr?| j jsAtj
|jjd|d d dS dS dS t|trtj
|jjd|| j jd  d t|jdrk|jjdurkt|jj tj
|jjd|| j jd  d t|jdr|jjdurt|jj dS dS dS t|trtj
|jjd|| j jd  d t|jdr|jjdurt|jj tj
|jjd|| j jd  d t|jdr|jjdurt|jj tj
|jjd|| j jd  d t|jdr|jjdur	t|jj dS dS dS t|tttfr| j j}| j j}| j j}tj
|jjd||| d  d tj
|jjd||d  d tj
|j jd||d  d tj
|j!jd||| d  d |j"rtj
|j#jd||d  d t|trtj
|j$jd||d  d dS dS dS dS )zInitialize the weightsri   rh   )r   stdlm_head      r   N)%r   initializer_factorr   r   init	constant_r   LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelnormal_sharedhasattrtie_word_embeddingsr  r   r   r   r   zeros_r   r   r   r   r   r   r!  r?  r   r   r   r   r   r   r   r   r@  )r   modulefactorr   r   r   r#   r#   r3   _init_weights  sX   

       
z#LongT5PreTrainedModel._init_weightsc                 C   sx   | j j}| j j}|d u rtd||j}|dd df  |ddd f< ||d< |d u r2td||dk| |S )Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information..r"   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idri  	new_zerosr)   clonemasked_fill_)r   r}  r  r  shifted_input_idsr#   r#   r3   _shift_right  s    z"LongT5PreTrainedModel._shift_rightN)r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_can_compile_fullgraphpropertyr  r,   no_gradr  r  r#   r#   r#   r3   rz    s   
 


*rz  c                       sD   e Zd Z fddZdd Z											dddZ  ZS )	LongT5Stackc                    s   t    t j j| _ j| _ j| _| jd | _	t
 fddt jD | _t j jd| _t j| _d| _|   d S )Nr   c                    s"   g | ]}t  t|d k|dqS )r   rO  )re  r>  ).0rE   r   r#   r3   
<listcomp>  s    z(LongT5Stack.__init__.<locals>.<listcomp>r   F)r   r   r   r   
vocab_sizer   embed_tokensr   r"  r   rj  r?   
num_layersblockr   r   final_layer_normr   r   r   r   	post_initr   r   r  r3   r     s   
zLongT5Stack.__init__c                 C   s
   || _ d S r   )r  r   new_embeddingsr#   r#   r3   set_input_embeddings     
z LongT5Stack.set_input_embeddingsNc           "      K   s  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|
d ur$|
n| j j}
|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|rtd
 d}|d u r| jd usJ d| |}|\}}| jr|r|d u r| j jrtt| j dt| j d}nt| j d}n| jsd }|d ur| nd}|d u rtj||| |jd}|d u rt s|| }tj|||jd}| jrt| j ||||d}n| j jdkr	t|| j|j}n|}| jr2|d ur2| \}}}||f}|d u r,tj||jd}| |}nd }|	r9dnd }|r@dnd }|rK| jrKdnd }d }d }| |}t| jD ]K\}} |	rg||f }| ||||||||||
|d}!|!d }|!d }| jr|d ur|!|rdnd }|r||!d f }| jr||!d f }q[| |}| |}|	r||f }|
st dd |||||fD S t!|||||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer"   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsr  r   rf   )r   r  rU   r   r  rf  r#   )r  r  r  rt  r   r   r   rV      c                 s   s    | ]	}|d ur|V  qd S r   r#   )r  r   r#   r#   r3   	<genexpr>  s    z&LongT5Stack.forward.<locals>.<genexpr>)last_hidden_stater  r   
attentionscross_attentions)"r   r  r  output_hidden_statesuse_return_dictr   ri  sizer   r   r   r   r   r  is_encoder_decoderr
   r	   get_seq_lengthr,   rH   r6   r   rs   r   rh  r[   r   invert_attention_maskr   	enumerater  r  rA   r   )"r   r}  rU   rq  rr  r  r  r  r  r  rt  r   rY  err_msg_prefixinput_shaperu   r  past_key_values_lengthmask_seq_lengthr  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr  rs  r   rE   layer_modulelayer_outputsr#   r#   r3   r     s   






zLongT5Stack.forward)NNNNNNNNNNN)r   r   r   r   r  r   r   r#   r#   r   r3   r    s    r  c                       s   e Zd ZdgZdddZdef fddZdd Zd	d
 Ze														dde
jdB de
jdB de
jdB de
jdB deee
j  dB dedB de
jdB de
jdB dedB dedB dedB dedB de
jdB dee
j eB fddZ  ZS )r  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightshared.weight)encoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    sl   t  | t|j|j| _t|}d|_	d|_
t|| _t|}d|_	|j|_t|| _|   d S )NFT)r   r   r   r   r  r   r  copydeepcopyr   r  r  encodernum_decoder_layersr  decoderr  r   r   encoder_configdecoder_configr   r#   r3   r     s   



zLongT5Model.__init__c                 C      | j S r   r  r   r#   r#   r3   get_input_embeddings     z LongT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S r   r  r  r  r  r  r#   r#   r3   r       z LongT5Model.set_input_embeddingsNr}  rU   r|  r~  encoder_outputsr  r  decoder_inputs_embedsr  r  r  rt  r   r   c                 K   s   |	dur|	n| j j}	|dur|n| j j}|du r$| j||||
||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }| j|||||||	|
|||d}|sb|| S t|j	|j
|j|j|j|j	|j|jdS )	a	  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr}  rU   r  r  r  rt  r   r   rV   r  r   r  r}  rU   r  r  rq  rr  r  r  r  rt  r   )r  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_staterq  encoder_attentions)r   r  r  r  r   r   lenr  r   r  r  r   r  r  )r   r}  rU   r|  r~  r  r  r  r  r  r  r  rt  r   rY  r   decoder_outputsr#   r#   r3   r     sV   CzLongT5Model.forward)NNNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r   r  r  r   r,   
LongTensorFloatTensor
BoolTensorrA   r   rj   r>  r   r   r   r#   r#   r   r3   r    sh    	
r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc                !       s  e Zd ZdgZddddZdef fddZdd Zd	d
 Ze															dde
jdB de
jdB de
jdB de
jdB deee
j  dB dedB de
jdB de
jdB de
jdB dedB dedB dedB dedB de
jdB dee
j eB fddZde
jfddZ  ZS ) r  r  r  )r  r  zlm_head.weightr   c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_t|| _t	|}d|_
|j|_t|| _tj|j|jdd| _|   d S )NFTr   )r   r   r   	model_dimr   r   r  r  r  r  r   r  r  r  r  r  r  r   r  r  r  r   r#   r3   r   C  s   



z'LongT5ForConditionalGeneration.__init__c                 C   r  r   r  r  r#   r#   r3   r  X  r  z3LongT5ForConditionalGeneration.get_input_embeddingsc                 C   r  r   r  r  r#   r#   r3   r  [  r  z3LongT5ForConditionalGeneration.set_input_embeddingsNr}  rU   r|  r~  r  r  r  r  labelsr  r  r  rt  r   r   c                 K   s  |
dur|
n| j j}
|dur|n| j j}|du r$| j||||||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }|	dur]|du r]|du r]| |	}| j|||||||
||||d}|d }| j j	r||| j
d  }| |}d}|	durtd	d
}|	|j}	||d|d|	d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a7  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nr  r   r   rV   r  r  r  r  )ignore_indexr"   )	losslogitsr  r  r  r  r  rq  r  )r   r  r  r  r   r   r  r  r  r  r  r  r   rQ   r6   r   r  r   r  r   r  r  r  )r   r}  rU   r|  r~  r  r  r  r  r  r  r  r  rt  r   rY  r   r  sequence_output	lm_logitsr  loss_fctoutputr#   r#   r3   r   `  sp   G


z&LongT5ForConditionalGeneration.forwardc                 C   s
   |  |S r   )r  )r   r  r#   r#   r3   %prepare_decoder_input_ids_from_labels  r  zDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNN)r   r   r   r  r  r   r   r  r  r   r,   r  r  r  rA   rj   r   r>  r   r   r  r   r#   r#   r   r3   r  4  st    	
 r  c                       s   e Zd ZddiZdgZdef fddZdd Zd	d
 Ze							dde
jdB de
jdB de
jdB dedB dedB dedB dee
j eB fddZ  ZS )r  r  r  r  r   c                    sD   t  | t|j|j| _t|}d|_	t
|| _|   d S )NF)r   r   r   r   r  r   r  r  r  r  r  r  r  )r   r   r  r   r#   r3   r     s   

zLongT5EncoderModel.__init__c                 C   r  r   r  r  r#   r#   r3   r    r  z'LongT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S r   )r  r  r  r  r#   r#   r3   r    s   z'LongT5EncoderModel.set_input_embeddingsNr}  rU   r  r  r  rt  r   c           	      K   s.   |dur|n| j j}| j||||||d}|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  )r   r  r  )	r   r}  rU   r  r  r  rt  rY  r  r#   r#   r3   r     s   #	zLongT5EncoderModel.forward)NNNNNN)r   r   r   r  r  r   r   r  r  r   r,   r  r  r>  rA   r   r   r   r#   r#   r   r3   r    s:    	r  )r  r  r  rz  )r   )Pr_  r  r   typingr   r,   r   torch.nnr   r  r   r  activationsr   cache_utilsr   r	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_longt5r   
get_loggerr   r   rj   rt   r4   r;   rG   rN   rT   r6   r[   rA   r}   r   r   Moduler   apex.normalizationr   infoImportError	Exceptionwarningr   r   r   r   r!  r?  rN  rU  r`  rb  re  rz  r  r  r  r  __all__r#   r#   r#   r3   <module>   s   
$$	 	
1	
 M - q" $[U F  ;I