o
    ei                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ eeZeG dd deZ G dd dej!Z"G dd dej!Z#G dd dej!Z$G dd dej!Z%G dd dej!Z&G dd dej!Z'G dd dej!Z(G dd dej!Z)eG dd  d e Z*G d!d" d"e Z+G d#d$ d$ej!Z,ed%d&G d'd( d(e Z-eG d)d* d*e Z.eG d+d, d,e Z/G d-d. d.ej!Z0eG d/d0 d0e Z1d1d2 Z2g d3Z3dS )4zPyTorch MPNet model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNgelu)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )MPNetConfigc                       s2   e Zd ZU eed< dZe  fddZ  Z	S )MPNetPreTrainedModelconfigmpnetc                    sZ   t  | t|trt|j dS t|tr+t|j	t
|j	jd d dS dS )zInitialize the weightsr   r   N)super_init_weights
isinstanceMPNetLMHeadinitzeros_biasMPNetEmbeddingscopy_position_idstorcharangeshapeexpand)selfmodule	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mpnet/modeling_mpnet.pyr   /   s   

&z"MPNetPreTrainedModel._init_weights)
__name__
__module____qualname__r   __annotations__base_model_prefixr%   no_gradr   __classcell__r-   r-   r+   r.   r   *   s
   
 r   c                       s.   e Zd Z fddZdddZdd Z  ZS )	r"   c                    s   t    d| _tj|j|j| jd| _tj|j|j| jd| _	tj
|j|jd| _
t|j| _| jdt|jddd d S )Nr   )padding_idxepsr$   r   F)
persistent)r   __init__r6   r   	Embedding
vocab_sizehidden_sizeword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr%   r&   r(   r)   r   r+   r-   r.   r:   :   s   

zMPNetEmbeddings.__init__Nc           	      K   s   |d u r|d urt || j}n| |}|d ur| }n| d d }|d }|d u r8| jd d d |f }|d u rA| |}| |}|| }| |}| |}|S )Nr   r   )	"create_position_ids_from_input_idsr6   &create_position_ids_from_inputs_embedssizer$   r>   r@   rA   rE   )	r)   	input_idsr$   inputs_embedskwargsinput_shape
seq_lengthr@   
embeddingsr-   r-   r.   forwardH   s"   





zMPNetEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr   r   )dtypedevicer   )rJ   r%   r&   r6   longrS   	unsqueezer(   )r)   rL   rN   sequence_lengthr$   r-   r-   r.   rI   b   s   	z6MPNetEmbeddings.create_position_ids_from_inputs_embeds)NNN)r/   r0   r1   r:   rQ   rI   r5   r-   r-   r+   r.   r"   9   s    
r"   c                       ,   e Zd Z fddZ			dddZ  ZS )MPNetSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	
|j|j| _t	|j| _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r   r:   r=   num_attention_headshasattr
ValueErrorintattention_head_sizeall_head_sizer   LinearqkvorC   attention_probs_dropout_probrE   rG   r+   r-   r.   r:   u   s   

zMPNetSelfAttention.__init__NFc                 K   s6  |j \}}}| ||d| j| jdd}	| ||d| j| jdd}
| ||d| j| jdd}t	|	|
dd}|t
| j }|d urS||7 }|d ur[|| }tjj|dd}| |}t	||}|dddd }| d d | jf }|j| }| |}|r||f}|S |f}|S )Nr   r      dimr   r   )r'   rb   viewr[   r_   	transposerc   rd   r%   matmulmathsqrtr   
functionalsoftmaxrE   permute
contiguousrJ   r`   re   )r)   hidden_statesattention_maskposition_biasoutput_attentionsrM   
batch_sizerO   _rb   rc   rd   attention_scoresattention_probscnew_c_shapere   outputsr-   r-   r.   rQ      s<   


zMPNetSelfAttention.forwardNNFr/   r0   r1   r:   rQ   r5   r-   r-   r+   r.   rX   t   s    rX   c                       rW   )MPNetAttentionc                    s:   t    t|| _tj|j|jd| _t|j	| _
d S Nr7   )r   r:   rX   attnr   rA   r=   rB   rC   rD   rE   rG   r+   r-   r.   r:      s   

zMPNetAttention.__init__NFc           	      K   s@   | j ||||d}| | |d | }|f|dd   }|S )N)rw   r   r   )r   rA   rE   )	r)   rt   ru   rv   rw   rM   self_outputsattention_outputr~   r-   r-   r.   rQ      s   zMPNetAttention.forwardr   r   r-   r-   r+   r.   r          	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )MPNetIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S N)r   r:   r   ra   r=   intermediate_sizedenser   
hidden_actstrr   intermediate_act_fnrG   r+   r-   r.   r:      s
   
zMPNetIntermediate.__init__rt   returnc                 C   s   |  |}| |}|S r   )r   r   )r)   rt   r-   r-   r.   rQ      s   

zMPNetIntermediate.forwardr/   r0   r1   r:   r%   TensorrQ   r5   r-   r-   r+   r.   r      s    r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )MPNetOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r   r:   r   ra   r   r=   r   rA   rB   rC   rD   rE   rG   r+   r-   r.   r:      s   
zMPNetOutput.__init__rt   input_tensorr   c                 C   s&   |  |}| |}| || }|S r   )r   rE   rA   )r)   rt   r   r-   r-   r.   rQ      s   

zMPNetOutput.forwardr   r-   r-   r+   r.   r      s    $r   c                       rW   )
MPNetLayerc                    s,   t    t|| _t|| _t|| _d S r   )r   r:   r   	attentionr   intermediater   outputrG   r+   r-   r.   r:      s   


zMPNetLayer.__init__NFc                 K   sJ   | j ||||d}|d }|dd  }| |}	| |	|}
|
f| }|S )N)rv   rw   r   r   )r   r   r   )r)   rt   ru   rv   rw   rM   self_attention_outputsr   r~   intermediate_outputlayer_outputr-   r-   r.   rQ      s   

zMPNetLayer.forwardr   r   r-   r-   r+   r.   r      r   r   c                       sd   e Zd Z fddZ				ddejdejdB deded	ef
d
dZdddZe	dddZ
  ZS )MPNetEncoderc                    sN   t     | _ j| _t fddt jD | _	t
 j| j| _d S )Nc                    s   g | ]}t  qS r-   )r   ).0ry   r   r-   r.   
<listcomp>  s    z)MPNetEncoder.__init__.<locals>.<listcomp>)r   r:   r   r[   n_headsr   
ModuleListrangenum_hidden_layerslayerr;   relative_attention_num_bucketsrelative_attention_biasrG   r+   r   r.   r:     s
   
 zMPNetEncoder.__init__NFrt   ru   rw   output_hidden_statesreturn_dictc                 K   s   |  |}|r	dnd }|rdnd }	t| jD ]$\}
}|r!||f }||||fd|i|}|d }|r:|	|d f }	q|rB||f }|sPtdd |||	fD S t|||	dS )Nr-   rw   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r-   )r   rd   r-   r-   r.   	<genexpr>=  s    z'MPNetEncoder.forward.<locals>.<genexpr>)last_hidden_statert   
attentions)compute_position_bias	enumerater   tupler
   )r)   rt   ru   rw   r   r   rM   rv   all_hidden_statesall_attentionsilayer_modulelayer_outputsr-   r-   r.   rQ     s8   
	

zMPNetEncoder.forward    c                 C   s   | d| d| d}}}|d ur,|d d d d d f }|d d d d d f }ntj|tjdd d d f }tj|tjdd d d f }|| }	| j|	|d}
|
|j}
| |
}|g d	d}|
|d||f }|S )Nr   r   )rR   )num_buckets)rg   r   r   r   )rJ   r%   r&   rT   relative_position_buckettorS   r   rr   rU   r(   rs   )r)   xr$   r   bszqlenklencontext_positionmemory_positionrelative_position	rp_bucketvaluesr-   r-   r.   r   D  s   "
z"MPNetEncoder.compute_position_bias   c                 C   s   d}|  }|d }||dk  tj| 7 }t|}|d }||k }|t| | t||  ||   tj }t|t||d }|t	|||7 }|S )Nr   rg   r   )
r   r%   rT   abslogfloatrn   min	full_likewhere)r   r   max_distanceretn	max_exactis_smallval_if_larger-   r-   r.   r   V  s   
&z%MPNetEncoder.relative_position_bucket)NFFF)Nr   )r   r   )r/   r0   r1   r:   r%   r   boolrQ   r   staticmethodr   r5   r-   r-   r+   r.   r     s(    


(r   c                       r   )MPNetPoolerc                    s*   t    t|j|j| _t | _d S r   )r   r:   r   ra   r=   r   Tanh
activationrG   r+   r-   r.   r:   m  s   
zMPNetPooler.__init__rt   r   c                 C   s(   |d d df }|  |}| |}|S Nr   )r   r   )r)   rt   first_token_tensorpooled_outputr-   r-   r.   rQ   r  s   

zMPNetPooler.forwardr   r-   r-   r+   r.   r   l  s    r   c                       s   e Zd Zd fdd	Zdd Zdd Ze							dd	ejdB d
ej	dB dejdB dej	dB de
dB de
dB de
dB deej eB fddZ  ZS )
MPNetModelTc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r   r:   r   r"   rP   r   encoderr   pooler	post_init)r)   r   add_pooling_layerr+   r-   r.   r:   }  s   

zMPNetModel.__init__c                 C      | j jS r   rP   r>   r)   r-   r-   r.   get_input_embeddings     zMPNetModel.get_input_embeddingsc                 C   s   || j _d S r   r   )r)   valuer-   r-   r.   set_input_embeddings  s   zMPNetModel.set_input_embeddingsNrK   ru   r$   rL   rw   r   r   r   c                 K   s:  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }	n|d urF| d d }	ntd|d urQ|jn|j}
|d u r_tj	|	|
d}| 
||	}| j|||d}| j|||||d}|d }| jd ur| |nd }|s||f|dd   S t|||j|jd	S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)rS   )rK   r$   rL   )ru   rw   r   r   r   r   )r   pooler_outputrt   r   )r   rw   r   use_return_dictr]   %warn_if_padding_and_no_attention_maskrJ   rS   r%   onesget_extended_attention_maskrP   r   r   r   rt   r   )r)   rK   ru   r$   rL   rw   r   r   rM   rN   rS   extended_attention_maskembedding_outputencoder_outputssequence_outputr   r-   r-   r.   rQ     sD   
zMPNetModel.forward)T)NNNNNNN)r/   r0   r1   r:   r   r   r   r%   
LongTensorFloatTensorr   r   r   r   rQ   r5   r-   r-   r+   r.   r   {  s:    
r   c                       s   e Zd ZdddZ fddZdd Zdd	 Ze	
	
	
	
	
	
	
	
ddej	d
B dej
d
B dej	d
B dej
d
B dej	d
B ded
B ded
B ded
B deej eB fddZ  ZS )MPNetForMaskedLMz'mpnet.embeddings.word_embeddings.weightzlm_head.bias)zlm_head.decoder.weightzlm_head.decoder.biasc                    s0   t  | t|dd| _t|| _|   d S NF)r   )r   r:   r   r   r   lm_headr   rG   r+   r-   r.   r:     s   
zMPNetForMaskedLM.__init__c                 C   r   r   )r   decoderr   r-   r-   r.   get_output_embeddings  r   z&MPNetForMaskedLM.get_output_embeddingsc                 C   s   || j _|j| j _d S r   )r   r   r!   )r)   new_embeddingsr-   r-   r.   set_output_embeddings  s   z&MPNetForMaskedLM.set_output_embeddingsNrK   ru   r$   rL   labelsrw   r   r   r   c	              	   K   s   |dur|n| j j}| j|||||||d}
|
d }| |}d}|dur6t }||d| j j|d}|sL|f|
dd  }|durJ|f| S |S t|||
j|
j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nru   r$   rL   rw   r   r   r   r   rg   losslogitsrt   r   )
r   r   r   r   r   rk   r<   r   rt   r   )r)   rK   ru   r$   rL   r   rw   r   r   rM   r~   r   prediction_scoresmasked_lm_lossloss_fctr   r-   r-   r.   rQ     s2   

zMPNetForMaskedLM.forwardNNNNNNNN)r/   r0   r1   _tied_weights_keysr:   r   r   r   r%   r   r   r   r   r   r   rQ   r5   r-   r-   r+   r.   r     sF    		r   c                       (   e Zd ZdZ fddZdd Z  ZS )r   z5MPNet Head for masked and permuted language modeling.c                    s^   t    t|j|j| _tj|j|jd| _tj|j|j	dd| _
tt|j	| _d S )Nr7   T)r!   )r   r:   r   ra   r=   r   rA   rB   
layer_normr<   r   	Parameterr%   zerosr!   rG   r+   r-   r.   r:     s
   
zMPNetLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r   )r   r	   r   r   r)   featuresrM   r   r-   r-   r.   rQ     s
   


zMPNetLMHead.forwardr/   r0   r1   __doc__r:   rQ   r5   r-   r-   r+   r.   r     s    r   z
    MPNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                          e Zd Z fddZe								ddejdB dejdB dejdB dejdB dejdB d	edB d
edB dedB de	ej
 eB fddZ  ZS )MPNetForSequenceClassificationc                    s8   t  | |j| _t|dd| _t|| _|   d S r   )r   r:   
num_labelsr   r   MPNetClassificationHead
classifierr   rG   r+   r-   r.   r:   0  s
   
z'MPNetForSequenceClassification.__init__NrK   ru   r$   rL   r   rw   r   r   r   c	              	   K   sd  |dur|n| j j}| j|||||||d}
|
d }| |}d}|dur| j jdu rO| jdkr5d| j _n| jdkrK|jtjksF|jtj	krKd| j _nd| j _| j jdkrmt
 }| jdkrg|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|s|f|
d	d  }|dur|f| S |S t|||
j|
jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   
regressionsingle_label_classificationmulti_label_classificationr   rg   r   )r   r   r   r
  problem_typer  rR   r%   rT   r^   r   squeezer   rk   r   r   rt   r   r)   rK   ru   r$   rL   r   rw   r   r   rM   r~   r   r   r   r   r   r-   r-   r.   rQ   :  sP   	


"


z&MPNetForSequenceClassification.forwardr   )r/   r0   r1   r:   r   r%   r   r   r   r   r   r   rQ   r5   r-   r-   r+   r.   r  )  s<    
	r  c                       r  )MPNetForMultipleChoicec                    s@   t  | t|| _t|j| _t|j	d| _
|   d S )Nr   )r   r:   r   r   r   rC   rD   rE   ra   r=   r
  r   rG   r+   r-   r.   r:     s
   
zMPNetForMultipleChoice.__init__NrK   ru   r$   rL   r   rw   r   r   r   c	              	   K   sL  |dur|n| j j}|dur|jd n|jd }
|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durV|d|d|dnd}| j|||||||d}|d }| |}| |}|d|
}d}|durt }|||}|s|f|dd  }|dur|f| S |S t	|||j
|jdS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   rh   )r$   ru   rL   rw   r   r   rg   r   )r   r   r'   rk   rJ   r   rE   r
  r   r   rt   r   )r)   rK   ru   r$   rL   r   rw   r   r   rM   num_choicesflat_input_idsflat_position_idsflat_attention_maskflat_inputs_embedsr~   r   r   reshaped_logitsr   r   r   r-   r-   r.   rQ     sF   $	


zMPNetForMultipleChoice.forwardr   )r/   r0   r1   r:   r   r%   r   r   r   r   r   r   rQ   r5   r-   r-   r+   r.   r  ~  s<    
	r  c                       r  )MPNetForTokenClassificationc                    sN   t  | |j| _t|dd| _t|j| _t	|j
|j| _|   d S r   )r   r:   r  r   r   r   rC   rD   rE   ra   r=   r
  r   rG   r+   r-   r.   r:     s   z$MPNetForTokenClassification.__init__NrK   ru   r$   rL   r   rw   r   r   r   c	              	   K   s   |dur|n| j j}| j|||||||d}
|
d }| |}| |}d}|dur:t }||d| j|d}|sP|f|
dd  }|durN|f| S |S t|||
j	|
j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr   r   r   rg   r   )r   r   r   rE   r
  r   rk   r  r   rt   r   r  r-   r-   r.   rQ     s4   


z#MPNetForTokenClassification.forwardr   )r/   r0   r1   r:   r   r%   r   r   r   r   r   r   rQ   r5   r-   r-   r+   r.   r    s<    	r  c                       r   )r	  z-Head for sentence-level classification tasks.c                    s@   t    t|j|j| _t|j| _t|j|j	| _
d S r   )r   r:   r   ra   r=   r   rC   rD   rE   r  out_projrG   r+   r-   r.   r:     s   
z MPNetClassificationHead.__init__c                 K   sL   |d d dd d f }|  |}| |}t|}|  |}| |}|S r   )rE   r   r%   tanhr  r  r-   r-   r.   rQ   $  s   




zMPNetClassificationHead.forwardr  r-   r-   r+   r.   r	    s    r	  c                       s   e Zd Z fddZe									ddejdB dejdB dejdB dejdB dejdB d	ejdB d
edB dedB dedB de	ej
 eB fddZ  ZS )MPNetForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r   )
r   r:   r  r   r   r   ra   r=   
qa_outputsr   rG   r+   r-   r.   r:   0  s
   z"MPNetForQuestionAnswering.__init__NrK   ru   r$   rL   start_positionsend_positionsrw   r   r   r   c
              	   K   sD  |	d ur|	n| j j}	| j|||||||	d}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrM|d}t| dkrZ|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|	s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr   r   r   r   ri   )ignore_indexrg   )r   start_logits
end_logitsrt   r   )r   r   r   r  splitr  rs   lenrJ   clampr   r   rt   r   )r)   rK   ru   r$   rL   r  r  rw   r   r   rM   r~   r   r   r   r!  
total_lossignored_indexr   
start_lossend_lossr   r-   r-   r.   rQ   :  sL   







z!MPNetForQuestionAnswering.forward)	NNNNNNNNN)r/   r0   r1   r:   r   r%   r   r   r   r   r   r   rQ   r5   r-   r-   r+   r.   r  .  sB    
	
r  c                 C   s2   |  | }tj|dd|| }| | S )z
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`. :param torch.Tensor x: :return torch.Tensor:
    r   ri   )ner^   r%   cumsumtype_asrT   )rK   r6   maskincremental_indicesr-   r-   r.   rH   y  s   rH   )r   r  r  r  r  r   r   r   )4r  rn   r%   r   torch.nnr   r   r    r   r   activationsr   r	   modeling_outputsr
   r   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_mpnetr   
get_loggerr/   loggerr   Moduler"   rX   r   r   r   r   r   r   r   r   r   r  r  r  r	  r  rH   __all__r-   r-   r-   r.   <module>   sN   $	
;IXNIO[@J