o
    eiJ                    @   s  d Z ddlZddlZddlZddlmZ ddlmZmZmZ ddl	m
Z ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* e'+e,Z-G dd dej.Z/zddl0m1Z1 e1Z/e-2d W n e3y   Y n e4y   e-5d Y nw G dd dej.Z6G dd dej.Z7G dd dej.Z8G dd dej.Z9G dd dej.Z:G d d! d!ej.Z;G d"d# d#eZ<G d$d% d%ej.Z=e&G d&d' d'e"Z>G d(d) d)e>Z?e&G d*d+ d+e>Z@e&d,d-G d.d/ d/e>eZAe&G d0d1 d1e>ZBe&d2d-G d3d4 d4e>ZCe&G d5d6 d6e>ZDe&G d7d8 d8e>ZEg d9ZFdS ):zPyTorch T5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringloggingtorch_compilable_check   )T5Configc                       s&   e Zd Zd fdd	Zdd Z  ZS )T5LayerNormư>c                    s&   t    tt|| _|| _dS )zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ `/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.pyr"   /   s   

zT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)keepdim)tor$   float32powmeanrsqrtr'   r&   dtypefloat16bfloat16)r(   hidden_statesvariancer-   r-   r.   forward7   s
   
zT5LayerNorm.forward)r    )__name__
__module____qualname__r"   r<   __classcell__r-   r-   r+   r.   r   .   s    r   )FusedRMSNormzODiscovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNormzBdiscovered apex but it failed to load, falling back to T5LayerNormc                       *   e Zd Zdef fddZdd Z  ZS )T5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r!   r"   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr(   rD   r+   r-   r.   r"   U   s
   
zT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)rK   rQ   rO   
isinstancerL   r&   r$   Tensorr7   int8r2   r(   r:   r-   r-   r.   r<   \   s   



zT5DenseActDense.forwardr=   r>   r?   r   r"   r<   r@   r-   r-   r+   r.   rC   T   s    rC   c                       rB   )T5DenseGatedActDenserD   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rE   )r!   r"   r   rH   rI   rJ   wi_0wi_1rL   rM   rN   rO   r   rP   rQ   rR   r+   r-   r.   r"   k   s   
zT5DenseGatedActDense.__init__c                 C   sz   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rS   )rQ   rZ   r[   rO   rT   rL   r&   r$   rU   r7   rV   r2   )r(   r:   hidden_geluhidden_linearr-   r-   r.   r<   s   s   


zT5DenseGatedActDense.forwardrX   r-   r-   r+   r.   rY   j   s    rY   c                       rB   )	T5LayerFFrD   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr*   )r!   r"   is_gated_actrY   DenseReluDenserC   r   rI   layer_norm_epsilon
layer_normr   rM   rN   rO   rR   r+   r-   r.   r"      s   

zT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rS   )rc   ra   rO   )r(   r:   forwarded_statesr-   r-   r.   r<      s   

zT5LayerFF.forwardrX   r-   r-   r+   r.   r^      s    
r^   c                       sb   e Zd Z		ddededB f fddZedd
dZdddZ								dddZ	  Z
S )T5AttentionFNrD   	layer_idxc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrF   )r!   r"   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerI   d_kvkey_value_proj_dim	num_headsn_headsrN   rO   	inner_dimrf   loggerwarning_oncer,   r=   r   rH   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr(   rD   rh   rf   r+   r-   r.   r"      s,   

zT5Attention.__init__T       c                 C   s   d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r/   r   )r2   r$   longabsmin
zeros_likelogfloatmath	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larger-   r-   r.   _relative_position_bucket   s*   z%T5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|| j | j	| j
d}|  |}	|	g dd}	|	S )z%Compute binned relative position biasN)r7   device)r   r   r   )r/   r   r   r   )rw   r&   r   r$   aranger|   r2   r   rg   ri   rj   permute	unsqueeze)
r(   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluesr-   r-   r.   compute_bias   s    
 
zT5Attention.compute_biasc
                 C   s  |j dd \}
}|du}| |}||
d| j| jdd}d}t|tr8|j	| j
}|r4|j}n|j}n|}|r>|n|}|rW|durW|rW|j| j
 j}|j| j
 j}nJ| |}| |}||
d| j| jdd}||
d| j| jdd}|dur|s|	nd}	|||| j
d|	i\}}|rt|trd|j| j
< t||dd}|du r|j d	 }|dur|n|	d d }| jstjd| j||f|j|jd
}| jr| jrd|_n| j|||j|	d}|dddd| dddf }|dur|ddddddd|j d	 f }|| }|}||7 }tjj|  dd!|}tjj"|| j"| jd}t||}|dd# }||
d| j$}| %|}||f}|rY||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr/   r0   r   Fr   Tr   )r   r7   )r   r   dim)ptraining)&shaperr   viewrn   rl   	transposerT   r   
is_updatedgetrf   cross_attention_cacheself_attention_cachelayerskeysr   rs   rt   updater$   matmulrh   zerosr   r7   rx   r   requires_gradr   r   
functionalsoftmaxr   type_asrO   
contiguousro   ru   )r(   r:   maskkey_value_statesposition_biaspast_key_valuesr   	use_cacheoutput_attentionsr   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresr   real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputsr-   r-   r.   r<      sp   






"
&

zT5Attention.forwardFN)Trz   r{   )NN)NNNNNFFN)r=   r>   r?   r   intr"   staticmethodr   r   r<   r@   r-   r-   r+   r.   re      s(    "
/re   c                       s>   e Zd ZddedB f fddZ						d	ddZ  ZS )
T5LayerSelfAttentionFNrf   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nrh   rf   r_   )r!   r"   re   SelfAttentionr   rI   rb   rc   r   rM   rN   rO   ry   r+   r-   r.   r"   f  s   
zT5LayerSelfAttention.__init__c              	   C   sJ   |  |}| j|||||||d}	|| |	d  }|f|	dd   }
|
S )N)r   r   r   r   r   r   r   r   )rc   r   rO   )r(   r:   attention_maskr   r   r   r   r   normed_hidden_statesattention_outputr   r-   r-   r.   r<   n  s   

	zT5LayerSelfAttention.forwardr   )NNNFFNr=   r>   r?   r   r"   r<   r@   r-   r-   r+   r.   r   e  s    r   c                       s@   e Zd ZddedB f fddZ							d	ddZ  ZS )
T5LayerCrossAttentionNrf   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFr   r_   )r!   r"   re   EncDecAttentionr   rI   rb   rc   r   rM   rN   rO   )r(   rD   rf   r+   r-   r.   r"     s   
zT5LayerCrossAttention.__init__Fc
                 C   sN   |  |}
| j|
||||||||	d	}|| |d  }|f|dd   }|S )N)r   r   r   r   r   r   r   r   r   r   )rc   r   rO   )r(   r:   r   r   r   r   r   r   r   r   r   r   layer_outputr   r-   r-   r.   r<     s   
zT5LayerCrossAttention.forwardrS   )NNNFNFNr   r-   r-   r+   r.   r     s    
r   c                       sF   e Zd Zd	dedB f fddZ										d
ddZ  ZS )T5BlockFNrf   c                    s`   t    |j| _t | _| jt|||d | jr&| jt||d | jt	| d S )Nr   )rf   )
r!   r"   rg   r   
ModuleListlayerappendr   r   r^   ry   r+   r-   r.   r"     s   

zT5Block.__init__Tc              
   C   s  | j d ||||||	|d}|d }|dd  }|jtjkr?tt| t|jjd t|jj}tj	|| |d}| j
oE|d u}|r| j d ||||||d d ||	d}|d }|jtjkrtt| t|jjd t|jj}tj	|| |d}||dd   }| j d |}|jtjkrtt| t|jjd t|jj}tj	|| |d}|f}|| S )Nr   )r   r   r   r   r   r   r   i  )r~   maxr0   )r   r   r   r   r   r   r   )r   r7   r$   r8   r   isinfanyfinfor   clamprg   )r(   r:   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   r   r   return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   r-   r-   r.   r<     sd   	

zT5Block.forwardr   )
NNNNNNFFTNr   r-   r-   r+   r.   r     s    r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	T5ClassificationHeadz-Head for sentence-level classification tasks.rD   c                    sB   t    t|j|j| _tj|jd| _t|j|j	| _
d S )N)r   )r!   r"   r   rH   rI   denserM   classifier_dropoutrO   
num_labelsout_projrR   r+   r-   r.   r"     s   
zT5ClassificationHead.__init__r:   returnc                 C   s6   |  |}| |}t|}|  |}| |}|S rS   )rO   r   r$   tanhr   rW   r-   r-   r.   r<     s   




zT5ClassificationHead.forward)
r=   r>   r?   __doc__r   r"   r$   rU   r<   r@   r-   r-   r+   r.   r     s    r   c                   @   sR   e Zd ZU eed< dZdZdZdgZdgZ	e
dd Ze dd	 Zd
d ZdS )T5PreTrainedModelrD   transformerTr   rL   c                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r$   tensorr   r   )r(   r   
input_maskdummy_inputsr-   r-   r.   r   $  s   

zT5PreTrainedModel.dummy_inputsc                 C   s
  | j j}t|trt|j|d  d	S t|ttt	t
fr^tj|jjd|d d t|dr>| j js>tj|jjd|d d t|dr\tj|jjd|| j jd  d t|jj d	S d	S t|trt|dr}tj|jjd|d d t|jj d	S d	S t|trtj|jjd|| j jd  d t|jdr|jjd	urt|jj tj|jjd|| j jd  d t|jdr|jjd	urt|jj d	S d	S d	S t|tr$tj|jjd|| j jd  d t|jdr|jjd	urt|jj tj|jjd|| j jd  d t|jdr |jjd	ur"t|jj d	S d	S d	S t|trtj|jjd|| j jd  d t|jdrO|jjd	urOt|jj tj|jjd|| j jd  d t|jdrt|jjd	urtt|jj tj|jjd|| j jd  d t|jdr|jjd	urt|jj d	S d	S d	S t|t r| j j}| j j!}| j j"}tj|j#jd||| d  d tj|j$jd||d  d tj|j%jd||d  d tj|j&jd||| d  d |j'rtj|j(jd||d  d d	S d	S d	S )
zInitialize the weightsg      ?g        )r5   stdlm_head
qa_outputs      
classifierrG   N))rD   initializer_factorrT   r   init	constant_r&   T5ModelT5ForConditionalGenerationT5EncoderModelT5ForQuestionAnsweringnormal_sharedhasattrtie_word_embeddingsr   r   rI   zeros_rG   T5ForTokenClassificationr   r   r   r   rC   rK   rL   rJ   rY   rZ   r[   re   rk   rm   rr   rs   rt   ru   rh   rw   )r(   modulefactorrI   rl   rn   r-   r-   r.   _init_weights/  st   


 


          zT5PreTrainedModel._init_weightsc                 C   sx   | j j}| j j}|d u rtd||j}|dd df  |ddd f< ||d< |d u r2td||dk| |S )Nzself.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. See T5 docs for more information..r0   r   ).r   z1self.model.config.pad_token_id has to be defined.)rD   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r(   r   r
  r  shifted_input_idsr-   r-   r.   _shift_rightf  s    zT5PreTrainedModel._shift_rightN)r=   r>   r?   r   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr   r$   no_gradr  r  r-   r-   r-   r.   r     s   
 


6r   c                       sD   e Zd Z fddZdd Z											dddZ  ZS )	T5Stackc                    sx   t    t j j| _ j| _t fddt	 j
D | _t j jd| _t j| _|   d| _d S )Nc                    s"   g | ]}t  t|d k|dqS )r   r   )r   bool).0irD   r-   r.   
<listcomp>  s   " z$T5Stack.__init__.<locals>.<listcomp>r_   F)r!   r"   r   rv   
vocab_sizerI   embed_tokensrg   r   range
num_layersblockr   rb   final_layer_normrM   rN   rO   	post_initrx   rR   r+   r  r.   r"   }  s   
zT5Stack.__init__c                 C   s
   || _ d S rS   )r!  r(   new_embeddingsr-   r-   r.   set_input_embeddings     
zT5Stack.set_input_embeddingsNc                 K   sh  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|
d ur$|
n| j j}
|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|rtd
 d}|d u r| jd u rtd| |}|\}}|du r| jstd|  d| jr|r|d u r| j jrtt| j dt| j d}nt| j d}n| jsd }|d ur| nd}|d u rtj||| |jd}| j jrt| j |||t|tr|jn|d}nt| j ||d}d }| jr|d urt| j |||d}|	rdnd }|r%dnd }|r0| jr0dnd }d }d }| |}| jD ]I}|	rH||f }|||||||||||
|d}|d }|d }| jrq|d urq||rndnd }|r||d f }| jr||d f }q>| |}| |}|	r||f }|
stdd |||||fD S t|||||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer0   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderr  r   )r   )rD   r-  r   r   r   )rD   r-  r   )rD   r-  r   r   r-   )r   r   r   r   r   r   r   r/      c                 s   s    | ]	}|d ur|V  qd S rS   r-   )r  rt   r-   r-   r.   	<genexpr>#  s    z"T5Stack.forward.<locals>.<genexpr>)last_hidden_stater   r:   
attentionscross_attentions)rD   r   r   output_hidden_statesuse_return_dictrg   r  sizer   rx   r   rp   rq   r!  is_encoder_decoderr   r
   get_seq_lengthr$   r   r   r   rT   r   r   rO   r$  r%  tupler   )r(   r   r   r   r   r-  r   r   r   r3  r   r   kwargserr_msg_prefixinput_shaper   r   past_key_values_lengthencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr   r   r:   layer_modulelayer_outputsr-   r-   r.   r<     s   








zT5Stack.forward)NNNNNNNNNNN)r=   r>   r?   r"   r)  r<   r@   r-   r-   r+   r.   r  |  s    r  c                       s   e Zd ZdgZdddZdef fddZdd Zd	d
 Ze														dde
jdB de
jdB de
jdB de
jdB deee
j  dB dedB de
jdB de
jdB dedB dedB dedB dedB de
jdB dee
j eB fddZ  ZS )r   Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightshared.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightrD   c                    sl   t  | t|j|j| _t|}d|_	d|_
t|| _t|}d|_	|j|_t|| _|   d S NFT)r!   r"   r   rv   r   rI   r  copydeepcopyrg   r   r  encodernum_decoder_layersr#  decoderr&  r(   rD   encoder_configdecoder_configr+   r-   r.   r"   A  s   



zT5Model.__init__c                 C      | j S rS   r  r(   r-   r-   r.   get_input_embeddingsR     zT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S rS   r  rK  r)  rM  r'  r-   r-   r.   r)  U     zT5Model.set_input_embeddingsNr   r   r   r   encoder_outputsr   r-  decoder_inputs_embedsr   r   r3  r   r   r   c                 K   s   |	dur|	n| j j}	|dur|n| j j}|du r$| j||||
||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }| j|||||||	|
|||d}|sb|| S t|j	|j
|j|j|j|j	|j|jdS )	aV
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, T5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
        >>> model = T5Model.from_pretrained("google-t5/t5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model.
        >>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr   r   r-  r   r3  r   r   r   r/   r0  r:   r1  r   r   r-  r   r   r   r   r   r3  r   r   )r0  r   decoder_hidden_statesdecoder_attentionsr2  encoder_last_hidden_stater   encoder_attentions)rD   r   r4  rK  rT   r   lenrM  r   r0  r   r:   r1  r2  )r(   r   r   r   r   rY  r   r-  rZ  r   r   r3  r   r   r9  r:   decoder_outputsr-   r-   r.   r<   Z  sV   CzT5Model.forwardNNNNNNNNNNNNN)r=   r>   r?   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r"   rT  r)  r   r$   
LongTensorFloatTensor
BoolTensorr8  r	   rU   r  r   r<   r@   r-   r-   r+   r.   r   7  sh    	
r   z:
    T5 Model with a `language modeling` head on top.
    )custom_introc                !       s  e Zd ZdgZddddZdef fddZdd Zd	d
 Ze															dde
jdB de
jdB de
jdB de
jdB deee
j  dB dedB de
jdB de
jdB de
jdB dedB dedB dedB dedB de
jdB dee
j eB fddZde
jfddZ  ZS ) r   rC  rD  )zlm_head.weightrF  rG  rD   c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_t|| _t	|}d|_
|j|_t|| _tj|j|jdd| _|   d S )NFTrF   )r!   r"   rI   	model_dimr   rv   r   r  rI  rJ  rg   r   r  rK  rL  r#  rM  rH   r   r&  rN  r+   r-   r.   r"     s   



z#T5ForConditionalGeneration.__init__c                 C   rQ  rS   rR  rS  r-   r-   r.   rT    rU  z/T5ForConditionalGeneration.get_input_embeddingsc                 C   rV  rS   rW  r'  r-   r-   r.   r)    rX  z/T5ForConditionalGeneration.set_input_embeddingsNr   r   r   r   rY  r   r-  rZ  labelsr   r   r3  r   r   r   c                 K   s  |
dur|
n| j j}
|dur|n| j j}|du r$| j||||||d}n$|rHt|tsHt|d t|dkr9|d ndt|dkrD|d ndd}|d }|	dur]|du r]|du r]| |	}| j|||||||
||||d}|d }| j j	r||| j
d  }| |}d}|	durtd	d
}|	|j}	||d|d|	d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, T5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
        >>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer(
        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        >>> # studies have shown that owning a dog is good for you.
        ```Nr[  r   r   r/   r\  r]  r   r	  ignore_indexr0   	losslogitsr   r^  r_  r2  r`  r   ra  )rD   r   r4  rK  rT   r   rb  r  rM  scale_decoder_outputsrk  r   r   r2   r   r   r5  r   r   r:   r1  r2  r0  )r(   r   r   r   r   rY  r   r-  rZ  rl  r   r   r3  r   r   r9  r:   rc  sequence_output	lm_logitsrp  loss_fctoutputr-   r-   r.   r<     sp   J


z"T5ForConditionalGeneration.forwardc                 C   s
   |  |S rS   )r  )r(   rl  r-   r-   r.   %prepare_decoder_input_ids_from_labels  r*  z@T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNN)r=   r>   r?   re  rf  r   r"   rT  r)  r   r$   rg  rh  ri  r8  rU   r	   r  r   r<   rw  r@   r-   r-   r+   r.   r     st    	
 r   c                       s   e Zd ZddiZdgZdef fddZdd Zd	d
 Ze							dde
jdB de
jdB de
jdB dedB dedB dedB dee
j eB fddZ  ZS )r   rF  rD  rM  rD   c                    sD   t  | t|j|j| _|}d|_d|_t	|| _
|   d S )NF)r!   r"   r   rv   r   rI   r  r   r6  r  rK  r&  )r(   rD   rO  r+   r-   r.   r"     s   
zT5EncoderModel.__init__c                 C   rQ  rS   rR  rS  r-   r-   r.   rT    rU  z#T5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S rS   )r  rK  r)  r'  r-   r-   r.   r)    s   z#T5EncoderModel.set_input_embeddingsNr   r   r-  r   r3  r   r   c           	      K   s.   |dur|n| j j}| j||||||d}|S )aI  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, T5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
        >>> model = T5EncoderModel.from_pretrained("google-t5/t5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr[  )rD   r4  rK  )	r(   r   r   r-  r   r3  r   r9  rY  r-   r-   r.   r<     s   "	zT5EncoderModel.forward)NNNNNN)r=   r>   r?   rf  re  r   r"   rT  r)  r   r$   rg  rh  r  r8  r   r<   r@   r-   r-   r+   r.   r     s8    	r   z
    T5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                       s   e Zd ZdgZdef fddZe												ddejdB dej	dB dejdB d	ejdB d
e
ej dB dejdB dejdB dejdB dedB dedB dedB dedB deeB fddZ  ZS )T5ForSequenceClassificationrC  rD   c                    s,   t  | t|| _t|| _|   d S rS   )r!   r"   r   r   r   classification_headr&  rR   r+   r-   r.   r"     s   

z$T5ForSequenceClassification.__init__Nr   r   r   r   rY  r-  rZ  rl  r   r   r3  r   r   c                 K   s`  |dur|n| j j}|durd}	|du r!|dur!td| jj |du r6|du r6|du r1td| |}| j||||||||	|
||d}|d }|| j j	
|j}tt|d dkd |j\}}}||ddf |d	|ddd	ddf }| |}d}|dur|
|j}| j jdu r| j jdkrd
| j _n| j jdkr|jtjks|jtjkrd| j _nd| j _| j jd
krt }| j jdkr|| | }n-|||}n'| j jdkrt }||d	| j j|d	}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|j|j |j!|j"|j#|j$d	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)
r   r   r   rY  r-  rZ  r   r   r3  r   r   r   z7All examples must have the same number of <eos> tokens.r0   
regressionsingle_label_classificationmulti_label_classificationro  )%rD   r4  NotImplementedErrorr,   r=   r  r  r   eqeos_token_idr2   r   r   r$   unique_consecutivesumnumelr   r   ry  problem_typer   r7   r|   r   r   squeezer   r   r   r   r^  r_  r2  r`  r   ra  )r(   r   r   r   r   rY  r-  rZ  rl  r   r   r3  r   r9  r   rs  eos_maskr   _r)   sentence_representationrq  rp  ru  rv  r-   r-   r.   r<     s   0
,


$

z#T5ForSequenceClassification.forward)NNNNNNNNNNNN)r=   r>   r?   re  r   r"   r   r$   rg  rU   listrh  r  r8  r   r<   r@   r-   r-   r+   r.   rx    sV    	
rx  c                       s   e Zd Zdef fddZe							ddejdB dejdB dejdB dejdB d	edB d
edB dedB de	ej e
B fddZ  ZS )r  rD   c                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S rS   )r!   r"   r   r   r   r   rM   r   rO   rH   r)   r   r&  rR   r+   r-   r.   r"   s  s   
z!T5ForTokenClassification.__init__Nr   r   r-  rl  r   r3  r   r   c                 K   s   |dur|n| j j}| j||||||d}	|	d }
| |
}
| |
}d}|dur9t }||d| j|d}|sN||	dd f}|durL|f| S |S t|||	j	|	j
dS )a<  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   r-  r   r3  r   r   r0   r/   )rp  rq  r:   r1  )rD   r4  r   rO   r   r   r   r   r   r:   r1  )r(   r   r   r-  rl  r   r3  r   r9  r   r:   rq  rp  ru  rv  r-   r-   r.   r<   ~  s2   	

z T5ForTokenClassification.forward)NNNNNNN)r=   r>   r?   r   r"   r   r$   rU   r  r8  r   r<   r@   r-   r-   r+   r.   r  q  s6    
r  c                       s   e Zd ZdgZdddZdef fddZdd Zd	d
 Ze														dde
jdB de
jdB de
jdB de
jdB deee
j  dB de
jdB de
jdB de
jdB de
jdB dedB dedB dedB dedB dee
j eB fddZ  ZS )r   rC  rD  rE  rD   c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_t|| _t	|}d|_
|j|_t|| _|j| _t|j|j| _|   d S rH  )r!   r"   rI   rk  r   rv   r   r  rI  rJ  rg   r   r  rK  rL  r#  rM  r   rH   r)   r   r&  rN  r+   r-   r.   r"     s   



zT5ForQuestionAnswering.__init__c                 C   rQ  rS   rR  rS  r-   r-   r.   rT    rU  z+T5ForQuestionAnswering.get_input_embeddingsc                 C   rV  rS   rW  r'  r-   r-   r.   r)    rX  z+T5ForQuestionAnswering.set_input_embeddingsNr   r   r   r   rY  start_positionsend_positionsr-  rZ  r   r   r3  r   r   c                 K   sd  |dur|n| j j}|
dur|
n| j j}
|dur|durd}
|du r3|	du r3|du r.td| |}|
dur9|
n| j j}
|durC|n| j j}|du rW| j||||||d}n$|r{t|ts{t|d t|dkrl|d ndt|dkrw|d ndd}|d }| j	|||	d|||
|||d	
}|d }| 
|}|jdd
d\}}|d
 }|d
 }d}|dur|durt| dkr|d
|j}t| dkr|d
|j}|d}|d|}|d|}t|d}|||}|||}|| d }|s||f|dd  | }|dur|f| S |S t||||j|j|j|j|j|j|jd
S )az  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        NFrz  r[  r   r   r/   r\  )
r   r   r-  r   r   r   r   r   r3  r   r0   r   rm  )
rp  start_logits
end_logitsr   r^  r_  r2  r`  r   ra  )rD   r4  r   r  r  rK  rT   r   rb  rM  r   splitr  r   r5  r2   r   r   r   r   r   r:   r1  r2  r0  )r(   r   r   r   r   rY  r  r  r-  rZ  r   r   r3  r   r9  r:   rc  rs  rq  r  r  
total_lossignored_indexru  
start_lossend_lossrv  r-   r-   r.   r<     s   .





zT5ForQuestionAnswering.forwardrd  )r=   r>   r?   re  rf  r   r"   rT  r)  r   r$   rg  rh  ri  r8  rU   r  r   r<   r@   r-   r-   r+   r.   r     sf    	
r   )r   r   r   r   r   rx  r  )Gr   rI  r   r$   r   torch.nnr   r   r   r,  r   r   activationsr   cache_utilsr	   r
   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_t5r   
get_loggerr=   rp   Moduler   apex.normalizationrA   infoImportError	ExceptionwarningrC   rY   r^   re   r   r   r   r   r   r  r   r   r   rx  r  r   __all__r-   r-   r-   r.   <module>   sz   $	
 M"$]a <  >G F 3