o
    ei{                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddlmZ	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' e(e)Z*G dd de"Z+G dd de%Z,G dd de!Z-G dd de#Z.eG dd deZ/G dd de$Z0eddG dd  d e/eZ1eG d!d" d"e/Z2G d#d$ d$ej3Z4ed%dG d&d' d'e/Z5eG d(d) d)e/Z6eG d*d+ d+e/Z7G d,d- d-ej3Z8eG d.d/ d/e/Z9g d0Z:dS )1zPyTorch RoBERTa model.    N)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)gelu)GenerationMixin),BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )BertCrossAttentionBertEmbeddings	BertLayer	BertModelBertSelfAttention   )RobertaConfigc                       sx   e Zd Z fddZ					ddejdB dejdB dejdB dejdB d	ef
d
dZe	dd Z
e	dddZ  ZS )RobertaEmbeddingsc                    s8   t  | | `| `|j| _tj|j|j| jd| _d S )N)padding_idx)	super__init__pad_token_idposition_embeddingsr   nn	Embeddingmax_position_embeddingshidden_sizeselfconfig	__class__ i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/roberta/modular_roberta.pyr!   -   s   zRobertaEmbeddings.__init__Nr   	input_idstoken_type_idsposition_idsinputs_embedspast_key_values_lengthc                 C   s  |d u r|d ur|  || j|}n| || j}|d ur!| }n| d d }|\}}|d u rZt| drO| j|jd d}	tj	|	d|d}	|	||}ntj
|tj| jjd}|d u rc| |}| |}
||
 }| |}|| }| |}| |}|S )Nr0   r   r   )dimindexdtypedevice)"create_position_ids_from_input_idsr   &create_position_ids_from_inputs_embedssizehasattrr0   expandshapetorchgatherzeroslongr1   r9   word_embeddingstoken_type_embeddingsr#   	LayerNormdropout)r)   r/   r0   r1   r2   r3   input_shape
batch_size
seq_lengthbuffered_token_type_idsrE   
embeddingsr#   r-   r-   r.   forward8   s2   






zRobertaEmbeddings.forwardc                 C   sJ   |   dd }|d }tj|d || d tj| jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr4   r   r7   r   )r<   r@   arangerC   r9   	unsqueezer>   )r2   r   rH   sequence_lengthr1   r-   r-   r.   r;   h   s   
z8RobertaEmbeddings.create_position_ids_from_inputs_embedsc                 C   s6   |  | }tj|dd|| | }| | S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   r5   )neintr@   cumsumtype_asrC   )r/   r   r3   maskincremental_indicesr-   r-   r.   r:   z   s   z4RobertaEmbeddings.create_position_ids_from_input_ids)NNNNr   )r   )__name__
__module____qualname__r!   r@   
LongTensorFloatTensorrS   rM   staticmethodr;   r:   __classcell__r-   r-   r+   r.   r   ,   s,    
0
r   c                   @      e Zd ZdS )RobertaSelfAttentionNrX   rY   rZ   r-   r-   r-   r.   r`          r`   c                   @   r_   )RobertaCrossAttentionNra   r-   r-   r-   r.   rc      rb   rc   c                   @   r_   )RobertaLayerNra   r-   r-   r-   r.   rd      rb   rd   c                       sL   e Zd ZeZdZdZdZdZdZ	dZ
eeedZe  fddZ  ZS )RobertaPreTrainedModelrobertaT)hidden_states
attentionscross_attentionsc                    sf   t  | t|trt|j dS t|tr1t|j	t
|j	jd d t|j dS dS )zInitialize the weightsr4   )r   r4   N)r    _init_weights
isinstanceRobertaLMHeadinitzeros_biasr   copy_r1   r@   rN   r?   r>   r0   )r)   moduler+   r-   r.   rj      s   

"z$RobertaPreTrainedModel._init_weights)rX   rY   rZ   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendrd   r`   rc   _can_record_outputsr@   no_gradrj   r^   r-   r-   r+   r.   re      s    re   c                       s   e Zd Zd fdd	Z  ZS )RobertaModelTc                    s   t  | | d S N)r    r!   )r)   r*   add_pooling_layerr+   r-   r.   r!      s   zRobertaModel.__init__)T)rX   rY   rZ   r!   r^   r-   r-   r+   r.   r{      s    r{   zS
    RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                        s   e Zd ZdddZ fddZdd Zdd	 Zee	
	
	
	
	
	
	
	
	
	
	
	dde	j
d
B de	jd
B de	j
d
B de	j
d
B de	jd
B de	jd
B de	jd
B de	j
d
B deee	j  d
B ded
B de	jd
B dee	jB dee dee	j eB fddZ  ZS )RobertaForCausalLM)roberta.embeddings.word_embeddings.weightlm_head.biaszlm_head.decoder.weightzlm_head.decoder.biasc                    s@   t  | |jstd t|dd| _t|| _| 	  d S )NzOIf you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`Fr}   
r    r!   
is_decoderloggerwarningr{   rf   rl   lm_head	post_initr(   r+   r-   r.   r!      s   

zRobertaForCausalLM.__init__c                 C      | j jS r|   r   decoderr)   r-   r-   r.   get_output_embeddings      z(RobertaForCausalLM.get_output_embeddingsc                 C      || j _d S r|   r   r)   new_embeddingsr-   r-   r.   set_output_embeddings      z(RobertaForCausalLM.set_output_embeddingsNr   r/   attention_maskr0   r1   r2   encoder_hidden_statesencoder_attention_masklabelspast_key_values	use_cachecache_positionlogits_to_keepkwargsreturnc                 K   s   |durd}
| j |f|||||||	|
|dd
|}|j}t|tr(t| dn|}| |dd|ddf }d}|durL| jd||| jjd|}t	|||j
|j|j|jdS )am  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
        >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
        >>> config.is_decoder = True
        >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NFT)
r   r0   r1   r2   r   r   r   r   r   return_dict)logitsr   
vocab_size)lossr   r   rg   rh   ri   r-   )rf   last_hidden_staterk   rS   slicer   loss_functionr*   r   r
   r   rg   rh   ri   )r)   r/   r   r0   r1   r2   r   r   r   r   r   r   r   r   outputsrg   slice_indicesr   r   r-   r-   r.   rM      s@   1zRobertaForCausalLM.forward)NNNNNNNNNNNr   )rX   rY   rZ   _tied_weights_keysr!   r   r   r   r   r@   r[   r\   tupleboolTensorrS   r   r   r
   rM   r^   r-   r-   r+   r.   r      sd    	
r   c                       s   e Zd ZdddZ fddZdd Zdd	 Zee	
	
	
	
	
	
	
	
dde	j
d
B de	jd
B de	j
d
B de	j
d
B de	jd
B de	jd
B de	jd
B de	j
d
B dee dee	j eB fddZ  ZS )RobertaForMaskedLMr   r   r   c                    s@   t  | |jrtd t|dd| _t|| _| 	  d S )NznIf you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr   r   r(   r+   r-   r.   r!   0  s   
zRobertaForMaskedLM.__init__c                 C   r   r|   r   r   r-   r-   r.   r   ?  r   z(RobertaForMaskedLM.get_output_embeddingsc                 C   r   r|   r   r   r-   r-   r.   r   B  r   z(RobertaForMaskedLM.set_output_embeddingsNr/   r   r0   r1   r2   r   r   r   r   r   c	              
   K   s   | j |f||||||dd|	}
|
d }| |}d}|dur7||j}t }||d| jj|d}t|||
j	|
j
dS )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        T)r   r0   r1   r2   r   r   r   r   Nr4   r   r   rg   rh   )rf   r   tor9   r   viewr*   r   r   rg   rh   )r)   r/   r   r0   r1   r2   r   r   r   r   r   sequence_outputprediction_scoresmasked_lm_lossloss_fctr-   r-   r.   rM   E  s4   	
zRobertaForMaskedLM.forward)NNNNNNNN)rX   rY   rZ   r   r!   r   r   r   r   r@   r[   r\   r   r   r   r   r   rM   r^   r-   r-   r+   r.   r   )  sL    	
r   c                       (   e Zd ZdZ fddZdd Z  ZS )rl   z*Roberta Head for masked language modeling.c                    sZ   t    t|j|j| _tj|j|jd| _t|j|j	| _
tt|j	| _d S )N)eps)r    r!   r$   Linearr'   denserF   layer_norm_eps
layer_normr   r   	Parameterr@   rB   ro   r(   r+   r-   r.   r!     s
   
zRobertaLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r|   )r   r   r   r   r)   featuresr   xr-   r-   r.   rM     s
   


zRobertaLMHead.forwardrX   rY   rZ   __doc__r!   rM   r^   r-   r-   r+   r.   rl     s    rl   z
    RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                          e Zd Z fddZee						ddejdB dejdB dejdB dejdB dejdB d	ejdB d
e	e
 deej eB fddZ  ZS ) RobertaForSequenceClassificationc                    s>   t  | |j| _|| _t|dd| _t|| _|   d S NFr   )	r    r!   
num_labelsr*   r{   rf   RobertaClassificationHead
classifierr   r(   r+   r-   r.   r!     s   
z)RobertaForSequenceClassification.__init__Nr/   r   r0   r1   r2   r   r   r   c                 K   s6  | j |f||||dd|}|d }	| |	}
d}|dur||
j}| jjdu rN| jdkr4d| j_n| jdkrJ|jtj	ksE|jtj
krJd| j_nd| j_| jjdkrlt }| jdkrf||
 | }n+||
|}n%| jjdkrt }||
d	| j|d	}n| jjdkrt }||
|}t||
|j|jd
S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Tr   r0   r1   r2   r   r   Nr   
regressionsingle_label_classificationmulti_label_classificationr4   r   )rf   r   r   r9   r*   problem_typer   r8   r@   rC   rS   r   squeezer   r   r   r   rg   rh   r)   r/   r   r0   r1   r2   r   r   r   r   r   r   r   r-   r-   r.   rM     sN   	


"


z(RobertaForSequenceClassification.forwardNNNNNN)rX   rY   rZ   r!   r   r   r@   r[   r\   r   r   r   r   r   rM   r^   r-   r-   r+   r.   r     s6    	r   c                       s   e Zd Z fddZee						ddejdB dejdB dejdB dejdB dejdB d	ejdB d
e	e
 deej eB fddZ  ZS )RobertaForMultipleChoicec                    s@   t  | t|| _t|j| _t|j	d| _
|   d S )Nr   )r    r!   r{   rf   r$   Dropouthidden_dropout_probrG   r   r'   r   r   r(   r+   r-   r.   r!     s
   
z!RobertaForMultipleChoice.__init__Nr/   r0   r   r   r1   r2   r   r   c                 K   s<  |dur	|j d n|j d }|dur|d|dnd}	|dur*|d|dnd}
|dur9|d|dnd}|durH|d|dnd}|dur[|d|d|dnd}| j|	f|
|||dd|}|d }| |}| |}|d|}d}|dur||j}t }|||}t	|||j
|jdS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r4   T)r1   r0   r   r2   r   r   )r?   r   r<   rf   rG   r   r   r9   r   r   rg   rh   )r)   r/   r0   r   r   r1   r2   r   num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   pooled_outputr   reshaped_logitsr   r   r-   r-   r.   rM     sF   +	


z RobertaForMultipleChoice.forwardr   )rX   rY   rZ   r!   r   r   r@   r[   r\   r   r   r   r   r   rM   r^   r-   r-   r+   r.   r     s6    
	r   c                       r   )RobertaForTokenClassificationc                    sb   t  | |j| _t|dd| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S r   )r    r!   r   r{   rf   classifier_dropoutr   r$   r   rG   r   r'   r   r   r)   r*   r   r+   r-   r.   r!   R  s   z&RobertaForTokenClassification.__init__Nr/   r   r0   r1   r2   r   r   r   c                 K   s   | j |f||||dd|}|d }	| |	}	| |	}
d}|dur9||
j}t }||
d| j|d}t||
|j	|j
dS )a-  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Tr   r   Nr4   r   )rf   rG   r   r   r9   r   r   r   r   rg   rh   r   r-   r-   r.   rM   `  s2   


z%RobertaForTokenClassification.forwardr   )rX   rY   rZ   r!   r   r   r@   r[   r\   r   r   r   r   r   rM   r^   r-   r-   r+   r.   r   P  s6    	r   c                       r   )r   z-Head for sentence-level classification tasks.c                    sT   t    t|j|j| _|jd ur|jn|j}t|| _	t|j|j
| _d S r|   )r    r!   r$   r   r'   r   r   r   r   rG   r   out_projr   r+   r-   r.   r!     s   
z"RobertaClassificationHead.__init__c                 K   sL   |d d dd d f }|  |}| |}t|}|  |}| |}|S )Nr   )rG   r   r@   tanhr   r   r-   r-   r.   rM     s   




z!RobertaClassificationHead.forwardr   r-   r-   r+   r.   r     s    	r   c                       s   e Zd Z fddZee							ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB de	e
 deej eB fddZ  ZS )RobertaForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r   )
r    r!   r   r{   rf   r$   r   r'   
qa_outputsr   r(   r+   r-   r.   r!     s
   z$RobertaForQuestionAnswering.__init__Nr/   r   r0   r1   r2   start_positionsend_positionsr   r   c                 K   s  | j |f||||dd|}	|	d }
| |
}|jddd\}}|d }|d }d}|dury|duryt| dkrF|d}t| dkrS|d}|d}|d|}|d|}t|d}|||}|||}|| d	 }t	||||	j
|	jd
S )a[  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        Tr   r   r   r4   rQ   N)ignore_indexr   )r   start_logits
end_logitsrg   rh   )rf   r   splitr   
contiguouslenr<   clampr   r   rg   rh   )r)   r/   r   r0   r1   r2   r   r   r   r   r   r   r   r   
total_lossignored_indexr   
start_lossend_lossr-   r-   r.   rM     sH   







z#RobertaForQuestionAnswering.forward)NNNNNNN)rX   rY   rZ   r!   r   r   r@   r[   r\   r   r   r   r   r   rM   r^   r-   r-   r+   r.   r     s<    
	
r   )r   r   r   r   r   r   r{   re   );r   r@   torch.nnr$   r   r   r    r   rm   activationsr   
generationr   modeling_outputsr	   r
   r   r   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   bert.modeling_bertr   r   r   r   r   configuration_robertar   
get_loggerrX   r   r   r`   rc   rd   re   r{   r   r   Modulerl   r   r   r   r   r   __all__r-   r-   r-   r.   <module>   sR   $	
_nUT`FN