o
    eiQI                     @   sl  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZ ddlm Z  G dd deZ!eG dd deZ"G dd deZ#G dd deZ$eG dd de"Z%eddG dd de"eZ&G d d! d!eZ'G d"d# d#eZ(G d$d% d%eZ)g d&Z*dS )'zPyTorch PLBART model.    N)nn)CrossEntropyLoss   )initialization)Cache)GenerationMixin)BaseModelOutputSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)auto_docstring   )BartClassificationHeadBartDecoderBartEncoderBartForCausalLMBartScaledWordEmbedding)'BigBirdPegasusForSequenceClassification)shift_tokens_right   )PLBartConfigc                   @      e Zd ZdS )PLBartScaledWordEmbeddingN__name__
__module____qualname__ r   r   g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/plbart/modular_plbart.pyr   ,       r   c                       sB   e Zd ZU eed< dZdZddgZdZdZ	dZ
 fddZ  ZS )PLBartPreTrainedModelconfigmodelTPLBartDecoderLayerPLBartEncoderLayerc                    s*   t  | t|trt|j d S d S N)super_init_weights
isinstancePLBartForConditionalGenerationinitzeros_final_logits_bias)selfmodule	__class__r   r   r'   :   s   
z#PLBartPreTrainedModel._init_weights)r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnr'   __classcell__r   r   r/   r   r    0   s   
 r    c                   @   r   )PLBartEncoderNr   r   r   r   r   r9   @   r   r9   c                   @   r   )PLBartDecoderNr   r   r   r   r   r:   D   r   r:   c                       s   e Zd ZdddZdef fddZdd Zdd	 Ze	
	
	
	
	
	
	
	
	
	
	
	
	
dde	j
d
B de	j
d
B de	j
d
B de	jd
B dee	j d
B ded
B de	jd
B de	jd
B ded
B ded
B ded
B ded
B de	j
d
B dee	j eB fddZ  ZS )PLBartModelzshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr!   c                    sd   t  | |j|j}}|jrt|jnd}t||j||d| _	t
|| _t|| _|   d S )Ng      ?)embed_scale)r&   __init__pad_token_id
vocab_sizescale_embeddingmathsqrtd_modelr   sharedr9   encoderr:   decoder	post_init)r-   r!   padding_idxr?   r<   r/   r   r   r=   O   s   

zPLBartModel.__init__c                 C   s   | j S r%   )rD   )r-   r   r   r   get_input_embeddings[   s   z PLBartModel.get_input_embeddingsc                 C   s   || _ | j | j_| j | j_d S r%   )rD   rE   embed_tokensrF   )r-   valuer   r   r   set_input_embeddings^   s   
z PLBartModel.set_input_embeddingsN	input_idsattention_maskdecoder_input_idsdecoder_attention_maskencoder_outputspast_key_valuesinputs_embedsdecoder_inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionreturnc                 K   s.  |
dur|
n| j j}
|dur|n| j j}|	dur|	n| j j}	|dur$|n| j j}|du r7|du r7t|| j j}|du rG| j||||
||d}n$|rkt|t	skt	|d t
|dkr\|d ndt
|dkrg|d ndd}| j|||d ||||	|
|||d}|s|| S t|j|j|j|j|j|j|j|jdS )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        N)rM   rN   rS   rV   rW   rX   r   r   r   )last_hidden_statehidden_states
attentions)rM   rN   encoder_hidden_statesencoder_attention_maskrR   rS   rU   rV   rW   rX   rY   )r[   rR   decoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_stater^   encoder_attentions)r!   rV   rW   rU   use_return_dictr   r>   rE   r(   r   lenrF   r
   r[   rR   r\   r]   rb   )r-   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   kwargsdecoder_outputsr   r   r   forwardc   s^   (	zPLBartModel.forward)NNNNNNNNNNNNN)r   r   r   _tied_weights_keysr   r=   rI   rL   r   torch
LongTensorTensorlistFloatTensorr   booltupler
   ri   r8   r   r   r/   r   r;   H   sd    	
r;   zv
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    )custom_introc                !       s>  e Zd ZdZdgZddiZdef fddZ		d$d
ededB de	de
jf fddZd
eddfddZe														d%dejdB dejdB dejdB dejdB deej dB dedB dejdB dejdB dejdB de	dB de	dB de	dB de	dB dejdB deej eB fd d!Zdejfd"d#Z  ZS )&r)   r"   r,   zlm_head.weightzmodel.shared.weightr!   c                    sX   t  | t|| _| dtd| jjjf t	j
|j| jjjdd| _|   d S )Nr,   r   F)bias)r&   r=   r;   r"   register_bufferrk   zerosrD   num_embeddingsr   LinearrC   lm_headrG   )r-   r!   r/   r   r   r=      s
   
z'PLBartForConditionalGeneration.__init__NTnew_num_tokenspad_to_multiple_ofmean_resizingrZ   c                    s&   t  |||}| |jjd  |S )Nr   )r&   resize_token_embeddings_resize_final_logits_biasweightshape)r-   ry   rz   r{   new_embeddingsr/   r   r   r|      s   z6PLBartForConditionalGeneration.resize_token_embeddingsc                 C   sj   | j jd }||kr| j d d d |f }ntjd|| f| j jd}tj| j |gdd}| d| d S )Nr   )device)dimr,   )r,   r   rk   ru   r   catrt   )r-   ry   old_num_tokensnew_bias
extra_biasr   r   r   r}      s   z8PLBartForConditionalGeneration._resize_final_logits_biasrM   rN   rO   rP   rQ   rR   rS   rT   labelsrU   rV   rW   rX   rY   c                 K   s   |dur|n| j j}|	dur|du r|du rt|	| j j}| j|||||||||
||||d}| |d }|| j|j }d}|	durVt	 }||
d| j j|	
d}|sl|f|dd  }|durj|f| S |S t|||j|j|j|j|j|j|jd	S )a
  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example Mask-filling:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

        >>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

        >>> # en_XX is the language symbol id <LID> for English
        >>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

        >>> logits = model(input_ids).logits
        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
        >>> probs = logits[0, masked_index].softmax(dim=0)
        >>> values, predictions = probs.topk(5)

        >>> tokenizer.decode(predictions).split()
        ['first', 'same', 'highest', 'result', 'number']
        ```
        N)rN   rO   rQ   rP   rR   rS   rT   rU   rV   rW   rX   rY   r   r   r   )	losslogitsrR   r`   ra   rb   rc   r^   rd   )r!   re   r   r>   r"   rx   r,   tor   r   viewr?   r	   rR   r`   ra   rb   rc   r^   rd   )r-   rM   rN   rO   rP   rQ   rR   rS   rT   r   rU   rV   rW   rX   rY   rg   outputs	lm_logitsmasked_lm_lossloss_fctoutputr   r   r   ri      sN   Bz&PLBartForConditionalGeneration.forwardc                 C   s   t || jjS r%   )r   r!   r>   )r-   r   r   r   r   %prepare_decoder_input_ids_from_labelsY  s   zDPLBartForConditionalGeneration.prepare_decoder_input_ids_from_labels)NT)NNNNNNNNNNNNNN)r   r   r   r2   _keys_to_ignore_on_load_missingrj   r   r=   intrp   r   	Embeddingr|   r}   r   rk   rl   rm   rn   ro   r   rq   r	   ri   r   r8   r   r   r/   r   r)      s    			
nr)   c                   @   r   )PLBartClassificationHeadNr   r   r   r   r   r   ]  r   r   c                       s   e Zd Z fddZ  ZS )PLBartForSequenceClassificationc                        t  jdi |  dS )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r&   ri   super_kwargsr/   r   r   ri   b  s   z'PLBartForSequenceClassification.forward)r   r   r   ri   r8   r   r   r/   r   r   a  s    r   c                       s    e Zd Ze fddZ  ZS )PLBartForCausalLMc                     r   )aF  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
        >>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```Nr   r   r   r/   r   r   ri     s   zPLBartForCausalLM.forward)r   r   r   r   ri   r8   r   r   r/   r   r     s    r   )r   r)   r   r;   r    )+__doc__rA   rk   r   torch.nnr    r   r*   cache_utilsr   
generationr   modeling_outputsr   r	   r
   modeling_utilsr   utilsr   bart.modeling_bartr   r   r   r   r   (bigbird_pegasus.modeling_bigbird_pegasusr   mbart.modeling_mbartr   configuration_plbartr   r   r    r9   r:   r;   r)   r   r   r   __all__r   r   r   r   <module>   s>   } 