o
    ߥiT                     @   s   d dl Z d dlZd dlmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ ddlmZmZ ddlmZ e ZdZej ej!ej"dG dd deZ#dS )    N)OptionalTupleUnion)nn)CrossEntropyLoss)assert_device_mapget_device_map)Models)MODELS)AttentionBackboneModelOutputSeq2SeqLMOutputTokenGeneratorOutput)Tasks)
get_logger   )T5PreTrainedModelT5Stack)T5Configa_  
The input argument `head_mask` was split into two arguments `head_mask` and
`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
but this feature is deprecated and will be removed in future versions. If you do
not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
torch.ones(num_layers, num_heads)`.
)	group_keymodule_namec                %       s  e Zd Zg dZdgZd2def fddZd2ddZd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Z																d3deej deej deej deej deej deej deej deeeej   deeeej   d eej d!eej d"eej d#ee d$ee d%ee d&ee d'eeej ef f"d(d)Z							d4d*d+Zd"ejfd,d-Z fd.d/Zd0d1 Z  ZS )5T5ForConditionalGeneration)zencoder\.embed_tokens\.weightzdecoder\.embed_tokens\.weightzlm_head\.weightzMdecoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weightNconfigc                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _tj|j|jdd| _|   d| _|dkrZ|   d S d S )NFT)biasauto)super__init__d_model	model_dimr   	Embedding
vocab_sizesharedcopydeepcopy
is_decoder	use_cacheis_encoder_decoderr   encodernum_decoder_layers
num_layersdecoderLinearlm_head	post_initmodel_parallelparallelize)selfr   
device_mapkwargsencoder_configdecoder_config	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/T5/text2text_generation.pyr   <   s&   

z#T5ForConditionalGeneration.__init__c                 C   sv   |d u rt t| jjttj n|| _t	| jt| jj | j
| j | j
| j | j| jj| _d| _d S )NT)r   lenr&   blockrangetorchcudadevice_countr0   r   r.   r)   r+   tofirst_devicer-   )r/   r0   r6   r6   r7   r.   X   s   
z&T5ForConditionalGeneration.parallelizec                 C   sX   | j   | j  | j d| _ | jd| _| jd| _d| _d | _tj	  d S )NcpuF)
r&   deparallelizer)   r>   r+   r-   r0   r;   r<   empty_cacher/   r6   r6   r7   rA   c   s   

z(T5ForConditionalGeneration.deparallelizec                 C      | j S N)r    rC   r6   r6   r7   get_input_embeddingsm      z/T5ForConditionalGeneration.get_input_embeddingsc                 C   s"   || _ | j| | j| d S rE   )r    r&   set_input_embeddingsr)   r/   new_embeddingsr6   r6   r7   rH   p   s   z/T5ForConditionalGeneration.set_input_embeddingsc                 C   s
   || _ d S rE   r+   rI   r6   r6   r7   set_output_embeddingsu      
z0T5ForConditionalGeneration.set_output_embeddingsc                 C   rD   rE   rK   rC   r6   r6   r7   get_output_embeddingsx   rG   z0T5ForConditionalGeneration.get_output_embeddingsc                 C   rD   rE   )r&   rC   r6   r6   r7   get_encoder{   rG   z&T5ForConditionalGeneration.get_encoderc                 C   rD   rE   )r)   rC   r6   r6   r7   get_decoder~   rG   z&T5ForConditionalGeneration.get_decoder	input_idsattention_maskdecoder_input_idsdecoder_attention_mask	head_maskdecoder_head_maskcross_attn_head_maskencoder_outputspast_key_valuesinputs_embedsdecoder_inputs_embedslabelsr$   output_attentionsoutput_hidden_statesreturn_dictreturnc                 K   s|  |dur|n| j j}|dur|n| j j}|dur,|du r,| j j| j jkr,ttt |}|du r=| j	|||
||||d}n$|rat
|tsat|d t|dkrR|d ndt|dkr]|d ndd}|d }| jrptj| jj |dur|du r|du r| |}| jrtj| jj || jj}|dur|| jj}|dur|| jj}|dur|| jj}| j||||	||||||||d}|d }| jrtj| j	j | j| j	j| _|| jjj}| j jr|| jd  }| |}d}|durtd	d
}||d|d|d}|s*|f|dd  | }|dur(|f| S |S t|||j|j|j |j!|j"|j|j d	S )aH  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. T5 is a model
                with relative position embeddings so you should be able to pad the
                inputs on both the right and the left.

                Indices can be obtained using [`T5Tokenizer`]. See
                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
                for detail.

                [What are input IDs?](../glossary#input-ids)

                To know more on how to prepare `input_ids` for pretraining take a
                look a [T5 Training](./t5#training).
            attention_mask (`torch.FloatTensor` of shape `(batch_size,sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask
                values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
                Indices of decoder input sequence tokens in the vocabulary.

                Indices can be obtained using [`T5Tokenizer`]. See
                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
                for details.

                [What are decoder input IDs?](../glossary#decoder-input-ids)

                T5 uses the `pad_token_id` as the starting token for
                `decoder_input_ids` generation. If `past_key_values` is used,
                optionally only the last `decoder_input_ids` have to be input (see
                `past_key_values`).

                To know more on how to prepare `decoder_input_ids` for pretraining
                take a look at [T5 Training](./t5#training).
            decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
                Default behavior: generate a tensor that ignores pad tokens in
                `decoder_input_ids`. Causal mask will also be used by default.
            head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the self-attention modules in the
                encoder. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or
                `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the self-attention modules in the
                decoder. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                    Mask to nullify selected heads of the cross-attention modules in
                    the decoder. Mask values selected in `[0, 1]`:

                    - 1 indicates the head is **not masked**,
                    - 0 indicates the head is **masked**.

            encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
                Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
                `optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
                sequence_length, hidden_size)` is a sequence of hidden states at the
                output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            past_key_values (`tuple(tuple(torch.FloatTensor))` of length
                `config.n_layers` with each tuple having 4 tensors of shape
                `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):

                Contains precomputed key and value hidden states of the attention
                blocks. Can be used to speed up decoding.

                If `past_key_values` are used, the user can optionally input only
                the last `decoder_input_ids` (those that don't have their past key
                value states given to this model) of shape `(batch_size, 1)` instead
                of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to
                directly pass an embedded representation. This is useful if you want
                more control over how to convert `input_ids` indices into associated
                vectors than the model's internal embedding lookup matrix.
            decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`,
                *optional*):
                Optionally, instead of passing `decoder_input_ids` you can choose to
                directly pass an embedded representation. If `past_key_values` is
                used, optionally only the last `decoder_inputs_embeds` have to be
                input (see `past_key_values`). This is useful if you want more
                control over how to convert `decoder_input_ids` indices into
                associated vectors than the model's internal embedding lookup
                matrix.

                If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
                `decoder_inputs_embeds` takes the value of `inputs_embeds`.

            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned
                and can be used to speed up decoding (see `past_key_values`).

            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention
                layers. See `attentions` under returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See
                `hidden_states` under returned tensors for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain
                tuple.
            labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the sequence classification/regression loss.
                Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All
                labels set to `-100` are ignored (masked), the loss is only computed
                for labels in `[0, ..., config.vocab_size]`

        Returns:

        Examples:

        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration

        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer(
        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
        >>> ).input_ids  # Batch size 1
        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        >>> # studies have shown that owning a dog is good for you.
        N)rQ   rR   rZ   rU   r]   r^   r_   r   r      )last_hidden_statehidden_states
attentions)rQ   rR   rZ   rY   encoder_hidden_statesencoder_attention_maskrU   rW   r$   r]   r^   r_   g      i)ignore_index)	losslogitsrY   decoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_statere   encoder_attentions)#r   r$   use_return_dictr(   r'   warningswarn2_T5ForConditionalGeneration__HEAD_MASK_WARNING_MSGFutureWarningr&   
isinstancer   r8   r-   r;   r<   
set_devicer)   r?   _shift_rightr>   r+   weightdevicetie_word_embeddingsr   r   viewsizer   rY   rc   rd   rm   rb   )r/   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r$   r]   r^   r_   r1   rc   decoder_outputssequence_output	lm_logitsri   loss_fctoutputr6   r6   r7   forward   s    !	



z"T5ForConditionalGeneration.forwardc	           
   	   K   s2   |d ur|d d dd f }||||||||dS )Nrh   )rS   rY   rX   rR   rU   rV   rW   r$   r6   )
r/   rQ   pastrR   rU   rV   rW   r$   rX   r1   r6   r6   r7   prepare_inputs_for_generation  s   z8T5ForConditionalGeneration.prepare_inputs_for_generationc                 C   s
   |  |S rE   )rw   )r/   r\   r6   r6   r7   %prepare_decoder_input_ids_from_labels  rM   z@T5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                    s4   t  j|i |}tt|tjr|dS |d dS )Nr   )	sequences)r   generater   ru   r;   Tensor)r/   argsr1   r   r4   r6   r7   r     s   z#T5ForConditionalGeneration.generatec              	   C   s   |d u rt d |S d}|D ]1}d}|D ]}||d||jf }q|d j|d jks1J t|t|ks;J ||f }q|S )NzHYou might want to consider setting `use_cache=True` to speed up decodingr6   r   )loggerwarningindex_selectr>   ry   shaper8   )r/   r   beam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_stater6   r6   r7   _reorder_cache  s0   
z)T5ForConditionalGeneration._reorder_cacherE   )NNNNNNNNNNNNNNNN)NNNNNNN)__name__
__module____qualname___keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpectedr   r   r.   rA   rF   rH   rL   rN   rO   rP   r   r;   
LongTensorFloatTensor
BoolTensorr   r   boolr   r   r   r   r   r   r   __classcell__r6   r6   r4   r7   r   .   s    

	

  

r   )$r!   rq   typingr   r   r   r;   r   torch.nnr   'transformers.utils.model_parallel_utilsr   r   modelscope.metainfor	   modelscope.models.builderr
   modelscope.outputsr   r   r   modelscope.utils.constantr   modelscope.utils.loggerr   backboner   r   configurationr   r   __HEAD_MASK_WARNING_MSGregister_moduletext2text_generationT5r   r6   r6   r6   r7   <module>   s*   	