o
    	۷i
                     @   s  d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZ e e!Z"eG dd deZ#G dd dej$Z%G dd dej$Z&G dd dej$Z'G dd dej$Z(G dd dej$Z)G dd dej$Z*G dd dej$Z+G dd dej$Z,eG d d! d!e#Z-G d"d# d#e#Z.G d$d% d%ej$Z/ed&d'G d(d) d)e#Z0eG d*d+ d+e#Z1eG d,d- d-e#Z2G d.d/ d/ej$Z3eG d0d1 d1e#Z4d2d3 Z5g d4Z6dS )5zPyTorch MPNet model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )MPNetConfigc                   @   s"   e Zd ZU eed< dZdd ZdS )MPNetPreTrainedModelconfigmpnetc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS t |tre|jj	  dS dS )zInitialize the weightsg        )meanstdNg      ?)
isinstancer   Linearweightdatanormal_r   initializer_rangebiaszero_	Embeddingpadding_idx	LayerNormfill_MPNetLMHead)selfmodule r-   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/mpnet/modeling_mpnet.py_init_weights1   s    


z"MPNetPreTrainedModel._init_weightsN)__name__
__module____qualname__r   __annotations__base_model_prefixr/   r-   r-   r-   r.   r   ,   s   
 r   c                       s.   e Zd Z fddZdddZdd Z  ZS )	MPNetEmbeddingsc                    s   t    d| _tj|j|j| jd| _tj|j|j| jd| _	tj
|j|jd| _
t|j| _| jdt|jddd d S )Nr   )r'   epsposition_ids)r   F)
persistent)super__init__r'   r   r&   
vocab_sizehidden_sizeword_embeddingsmax_position_embeddingsposition_embeddingsr(   layer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandr+   r   	__class__r-   r.   r<   E   s   

zMPNetEmbeddings.__init__Nc           	      K   s   |d u r|d urt || j}n| |}|d ur| }n| d d }|d }|d u r8| jd d d |f }|d u rA| |}| |}|| }| |}| |}|S )Nr9   r   )	"create_position_ids_from_input_idsr'   &create_position_ids_from_inputs_embedssizer8   r?   rA   r(   rE   )	r+   	input_idsr8   inputs_embedskwargsinput_shape
seq_lengthrA   
embeddingsr-   r-   r.   forwardS   s"   





zMPNetEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr9   r   )dtypedevicer   )rO   rG   rH   r'   longrX   	unsqueezerI   )r+   rQ   rS   sequence_lengthr8   r-   r-   r.   rN   m   s   	z6MPNetEmbeddings.create_position_ids_from_inputs_embeds)NNN)r0   r1   r2   r<   rV   rN   __classcell__r-   r-   rK   r.   r5   D   s    
r5   c                       .   e Zd Z fddZ				dddZ  ZS )MPNetSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	
|j|j| _t	|j| _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r;   r<   r>   num_attention_headshasattr
ValueErrorintattention_head_sizeall_head_sizer   r   qkvorC   attention_probs_dropout_probrE   rJ   rK   r-   r.   r<      s   

zMPNetSelfAttention.__init__NFc                 K   sF  |j \}}}	| ||d| j| jdd}
| ||d| j| jdd}| ||d| j| jdd}t	|
|dd}|t
| j }|d urS||7 }|d ur[|| }tjj|dd}| |}|d urp|| }t	||}|dddd }| d d | jf }|j| }| |}|r||f}|S |f}|S )Nr9   r      dimr   r   )shaperg   viewra   re   	transposerh   ri   rG   matmulmathsqrtr   
functionalsoftmaxrE   permute
contiguousrO   rf   rj   )r+   hidden_statesattention_mask	head_maskposition_biasoutput_attentionsrR   
batch_sizerT   _rg   rh   ri   attention_scoresattention_probscnew_c_shaperj   outputsr-   r-   r.   rV      s@   	


zMPNetSelfAttention.forwardNNNFr0   r1   r2   r<   rV   r\   r-   r-   rK   r.   r^      s    r^   c                       s6   e Zd Z fddZdd Z				d	ddZ  ZS )
MPNetAttentionc                    sB   t    t|| _tj|j|jd| _t|j	| _
t | _d S Nr6   )r;   r<   r^   attnr   r(   r>   rB   rC   rD   rE   setpruned_headsrJ   rK   r-   r.   r<      s
   

zMPNetAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| jj
|dd| j_
| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rn   )lenr   r   ra   re   r   r   rg   rh   ri   rj   rf   union)r+   headsindexr-   r-   r.   prune_heads   s   zMPNetAttention.prune_headsNFc           
      K   sB   | j |||||d}| | |d | }|f|dd   }	|	S )N)r~   r   r   )r   r(   rE   )
r+   rz   r{   r|   r}   r~   rR   self_outputsattention_outputr   r-   r-   r.   rV      s   	zMPNetAttention.forwardr   )r0   r1   r2   r<   r   rV   r\   r-   r-   rK   r.   r      s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )MPNetIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S N)r;   r<   r   r   r>   intermediate_sizedenser   
hidden_actstrr	   intermediate_act_fnrJ   rK   r-   r.   r<      s
   
zMPNetIntermediate.__init__rz   returnc                 C   s   |  |}| |}|S r   )r   r   )r+   rz   r-   r-   r.   rV     s   

zMPNetIntermediate.forwardr0   r1   r2   r<   rG   TensorrV   r\   r-   r-   rK   r.   r      s    r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )MPNetOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r;   r<   r   r   r   r>   r   r(   rB   rC   rD   rE   rJ   rK   r-   r.   r<     s   
zMPNetOutput.__init__rz   input_tensorr   c                 C   s&   |  |}| |}| || }|S r   )r   rE   r(   )r+   rz   r   r-   r-   r.   rV     s   

zMPNetOutput.forwardr   r-   r-   rK   r.   r     s    $r   c                       r]   )
MPNetLayerc                    s,   t    t|| _t|| _t|| _d S r   )r;   r<   r   	attentionr   intermediater   outputrJ   rK   r-   r.   r<     s   


zMPNetLayer.__init__NFc                 K   sL   | j |||||d}|d }|dd  }	| |}
| |
|}|f|	 }	|	S )N)r}   r~   r   r   )r   r   r   )r+   rz   r{   r|   r}   r~   rR   self_attention_outputsr   r   intermediate_outputlayer_outputr-   r-   r.   rV      s   	

zMPNetLayer.forwardr   r   r-   r-   rK   r.   r     s    	r   c                       sp   e Zd Z fddZ					ddejdeej deej ded	ed
efddZdddZ	e
dddZ  ZS )MPNetEncoderc                    sN   t     | _ j| _t fddt jD | _	t
 j| j| _d S )Nc                    s   g | ]}t  qS r-   )r   ).0r   r   r-   r.   
<listcomp>>  s    z)MPNetEncoder.__init__.<locals>.<listcomp>)r;   r<   r   ra   n_headsr   
ModuleListrangenum_hidden_layerslayerr&   relative_attention_num_bucketsrelative_attention_biasrJ   rK   r   r.   r<   :  s
   
 zMPNetEncoder.__init__NFrz   r{   r|   r~   output_hidden_statesreturn_dictc                 K   s   |  |}|r	dnd }	|rdnd }
t| jD ]'\}}|r!|	|f }	||||| |fd|i|}|d }|r=|
|d f }
q|rE|	|f }	|sStdd ||	|
fD S t||	|
dS )Nr-   r~   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r-   )r   ri   r-   r-   r.   	<genexpr>d  s    z'MPNetEncoder.forward.<locals>.<genexpr>)last_hidden_staterz   
attentions)compute_position_bias	enumerater   tupler   )r+   rz   r{   r|   r~   r   r   rR   r}   all_hidden_statesall_attentionsilayer_modulelayer_outputsr-   r-   r.   rV   A  s:   



zMPNetEncoder.forward    c                 C   s   | d| d| d}}}|d ur,|d d d d d f }|d d d d d f }ntj|tjdd d d f }tj|tjdd d d f }|| }	| j|	|d}
|
|j}
| |
}|g d	d}|
|d||f }|S )Nr   r   )rW   )num_buckets)rl   r   r   r9   )rO   rG   rH   rY   relative_position_buckettorX   r   rx   rZ   rI   ry   )r+   xr8   r   bszqlenklencontext_positionmemory_positionrelative_position	rp_bucketvaluesr-   r-   r.   r   k  s   "
z"MPNetEncoder.compute_position_bias   c                 C   s   d}|  }|d }||dk  tj| 7 }t|}|d }||k }|t| | t||  ||   tj }t|t||d }|t	|||7 }|S )Nr   rl   r   )
r   rG   rY   abslogfloatrt   min	full_likewhere)r   r   max_distanceretn	max_exactis_smallval_if_larger-   r-   r.   r   }  s   
&z%MPNetEncoder.relative_position_bucket)NNFFF)Nr   )r   r   )r0   r1   r2   r<   rG   r   r   boolrV   r   staticmethodr   r\   r-   r-   rK   r.   r   9  s.    


*r   c                       r   )MPNetPoolerc                    s*   t    t|j|j| _t | _d S r   )r;   r<   r   r   r>   r   Tanh
activationrJ   rK   r-   r.   r<     s   
zMPNetPooler.__init__rz   r   c                 C   s(   |d d df }|  |}| |}|S Nr   )r   r   )r+   rz   first_token_tensorpooled_outputr-   r-   r.   rV     s   

zMPNetPooler.forwardr   r-   r-   rK   r.   r     s    r   c                       s   e Zd Zd fdd	Zdd Zdd Zdd	 Ze	
	
	
	
	
	
	
	
ddee	j
 dee	j dee	j
 dee	j dee	j dee dee dee deee	j ef fddZ  ZS )
MPNetModelTc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r;   r<   r   r5   rU   r   encoderr   pooler	post_init)r+   r   add_pooling_layerrK   r-   r.   r<     s   

zMPNetModel.__init__c                 C      | j jS r   rU   r?   r+   r-   r-   r.   get_input_embeddings     zMPNetModel.get_input_embeddingsc                 C   s   || j _d S r   r   )r+   valuer-   r-   r.   set_input_embeddings  s   zMPNetModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r+   heads_to_pruner   r   r-   r-   r.   _prune_heads  s   zMPNetModel._prune_headsNrP   r{   r8   r|   rQ   r~   r   r   r   c	                 K   sL  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|d urQ|jn|j}|d u r_tj	|
|d}| 
||
}| || j j}| j|||d}| j||||||d}|d }| jd ur| |nd }|s||f|dd   S t|||j|jd	S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer9   z5You have to specify either input_ids or inputs_embeds)rX   )rP   r8   rQ   )r{   r|   r~   r   r   r   r   )r   pooler_outputrz   r   )r   r~   r   use_return_dictrc   %warn_if_padding_and_no_attention_maskrO   rX   rG   onesget_extended_attention_maskget_head_maskr   rU   r   r   r   rz   r   )r+   rP   r{   r8   r|   rQ   r~   r   r   rR   rS   rX   extended_attention_maskembedding_outputencoder_outputssequence_outputr   r-   r-   r.   rV     sH   
zMPNetModel.forward)T)NNNNNNNN)r0   r1   r2   r<   r   r   r   r   r   rG   
LongTensorFloatTensorr   r   r   r   r   rV   r\   r-   r-   rK   r.   r     sB    	r   c                       s   e Zd ZdgZ fddZdd Zdd Ze									dd	ee	j
 d
ee	j dee	j
 dee	j dee	j dee	j
 dee dee dee deee	j ef fddZ  ZS )MPNetForMaskedLMzlm_head.decoderc                    s0   t  | t|dd| _t|| _|   d S NF)r   )r;   r<   r   r   r*   lm_headr   rJ   rK   r-   r.   r<     s   
zMPNetForMaskedLM.__init__c                 C   r   r   )r   decoderr   r-   r-   r.   get_output_embeddings  r   z&MPNetForMaskedLM.get_output_embeddingsc                 C   s   || j _|j| j _d S r   )r   r   r$   )r+   new_embeddingsr-   r-   r.   set_output_embeddings  s   z&MPNetForMaskedLM.set_output_embeddingsNrP   r{   r8   r|   rQ   labelsr~   r   r   r   c
              
   C   s   |	dur|	n| j j}	| j||||||||	d}
|
d }| |}d}|dur7t }||d| j j|d}|	sM|f|
dd  }|durK|f| S |S t|||
j|
j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr{   r8   r|   rQ   r~   r   r   r   r9   rl   losslogitsrz   r   )
r   r   r   r   r   rq   r=   r   rz   r   )r+   rP   r{   r8   r|   rQ   r  r~   r   r   r   r   prediction_scoresmasked_lm_lossloss_fctr   r-   r-   r.   rV     s4   
zMPNetForMaskedLM.forward	NNNNNNNNN)r0   r1   r2   _tied_weights_keysr<   r  r  r   r   rG   r   r   r   r   r   r   r   rV   r\   r-   r-   rK   r.   r     sH    		
r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r*   z5MPNet Head for masked and permuted language modeling.c                    sh   t    t|j|j| _tj|j|jd| _tj|j|j	dd| _
tt|j	| _| j| j
_d S )Nr6   F)r$   )r;   r<   r   r   r>   r   r(   rB   
layer_normr=   r   	ParameterrG   zerosr$   rJ   rK   r-   r.   r<   F  s   
zMPNetLMHead.__init__c                 C   s   | j | j_ d S r   )r$   r   r   r-   r-   r.   _tie_weightsQ  s   zMPNetLMHead._tie_weightsc                 K   s*   |  |}t|}| |}| |}|S r   )r   r
   r  r   r+   featuresrR   r   r-   r-   r.   rV   T  s
   


zMPNetLMHead.forward)r0   r1   r2   __doc__r<   r  rV   r\   r-   r-   rK   r.   r*   C  s
    r*   z
    MPNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                          e Zd Z fddZe									ddeej deej deej deej deej d	eej d
ee	 dee	 dee	 de
eej ef fddZ  ZS )MPNetForSequenceClassificationc                    s8   t  | |j| _t|dd| _t|| _|   d S r   )r;   r<   
num_labelsr   r   MPNetClassificationHead
classifierr   rJ   rK   r-   r.   r<   f  s
   
z'MPNetForSequenceClassification.__init__NrP   r{   r8   r|   rQ   r  r~   r   r   r   c
              
   C   sf  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}d}|dur| j jdu rP| jdkr6d| j _n| jdkrL|jtjksG|jtj	krLd| j _nd| j _| j jdkrnt
 }| jdkrh|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|	s|f|
d	d  }|dur|f| S |S t|||
j|
jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr9   rl   r  )r   r   r   r  problem_typer  rW   rG   rY   rd   r   squeezer   rq   r   r   rz   r   r+   rP   r{   r8   r|   rQ   r  r~   r   r   r   r   r  r  r  r   r-   r-   r.   rV   p  sR   



"


z&MPNetForSequenceClassification.forwardr  )r0   r1   r2   r<   r   r   rG   r   r   r   r   r   r   r   rV   r\   r-   r-   rK   r.   r  _  sB    
	
r  c                       r  )MPNetForMultipleChoicec                    s@   t  | t|| _t|j| _t|j	d| _
|   d S )Nr   )r;   r<   r   r   r   rC   rD   rE   r   r>   r  r   rJ   rK   r-   r.   r<     s
   
zMPNetForMultipleChoice.__init__NrP   r{   r8   r|   rQ   r  r~   r   r   r   c
              
   C   sN  |	dur|	n| j j}	|dur|jd n|jd }
|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durV|d|d|dnd}| j||||||||	d}|d }| |}| |}|d|
}d}|durt }|||}|	s|f|dd  }|dur|f| S |S t	|||j
|jdS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r9   rm   )r8   r{   r|   rQ   r~   r   r   rl   r  )r   r   rp   rq   rO   r   rE   r  r   r   rz   r   )r+   rP   r{   r8   r|   rQ   r  r~   r   r   num_choicesflat_input_idsflat_position_idsflat_attention_maskflat_inputs_embedsr   r   r  reshaped_logitsr  r  r   r-   r-   r.   rV     sH   $



zMPNetForMultipleChoice.forwardr  )r0   r1   r2   r<   r   r   rG   r   r   r   r   r   r   r   rV   r\   r-   r-   rK   r.   r!    sB    
	
r!  c                       r  )MPNetForTokenClassificationc                    sN   t  | |j| _t|dd| _t|j| _t	|j
|j| _|   d S r   )r;   r<   r  r   r   r   rC   rD   rE   r   r>   r  r   rJ   rK   r-   r.   r<     s   z$MPNetForTokenClassification.__init__NrP   r{   r8   r|   rQ   r  r~   r   r   r   c
              
   C   s   |	dur|	n| j j}	| j||||||||	d}
|
d }| |}| |}d}|dur;t }||d| j|d}|	sQ|f|
dd  }|durO|f| S |S t|||
j	|
j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r9   rl   r  )r   r   r   rE   r  r   rq   r  r   rz   r   r   r-   r-   r.   rV     s6   

z#MPNetForTokenClassification.forwardr  )r0   r1   r2   r<   r   r   rG   r   r   r   r   r   r   r   rV   r\   r-   r-   rK   r.   r(    sB    	
r(  c                       s(   e Zd ZdZ fddZdd Z  ZS )r  z-Head for sentence-level classification tasks.c                    s@   t    t|j|j| _t|j| _t|j|j	| _
d S r   )r;   r<   r   r   r>   r   rC   rD   rE   r  out_projrJ   rK   r-   r.   r<   W  s   
z MPNetClassificationHead.__init__c                 K   sL   |d d dd d f }|  |}| |}t|}|  |}| |}|S r   )rE   r   rG   tanhr)  r  r-   r-   r.   rV   ]  s   




zMPNetClassificationHead.forward)r0   r1   r2   r  r<   rV   r\   r-   r-   rK   r.   r  T  s    r  c                       s   e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eej ef fddZ  ZS )MPNetForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r   )
r;   r<   r  r   r   r   r   r>   
qa_outputsr   rJ   rK   r-   r.   r<   i  s
   z"MPNetForQuestionAnswering.__init__NrP   r{   r8   r|   rQ   start_positionsend_positionsr~   r   r   r   c              
   C   sF  |
d ur|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrN|d}t| dkr[|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|
s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr  r   r   r9   rn   )ignore_indexrl   )r  start_logits
end_logitsrz   r   )r   r   r   r,  splitr  ry   r   rO   clampr   r   rz   r   )r+   rP   r{   r8   r|   rQ   r-  r.  r~   r   r   r   r   r  r0  r1  
total_lossignored_indexr  
start_lossend_lossr   r-   r-   r.   rV   s  sN   






z!MPNetForQuestionAnswering.forward)
NNNNNNNNNN)r0   r1   r2   r<   r   r   rG   r   r   r   r   r   r   r   rV   r\   r-   r-   rK   r.   r+  g  sH    
	
r+  c                 C   s2   |  | }tj|dd|| }| | S )z
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`. :param torch.Tensor x: :return torch.Tensor:
    r   rn   )nerd   rG   cumsumtype_asrY   )rP   r'   maskincremental_indicesr-   r-   r.   rM     s   rM   )r   r!  r+  r  r(  r   r   r   )7r  rt   typingr   r   rG   r   torch.nnr   r   r   activationsr	   r
   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   configuration_mpnetr   
get_loggerr0   loggerr   Moduler5   r^   r   r   r   r   r   r   r   r   r*   r  r!  r(  r  r+  rM   __all__r-   r-   r-   r.   <module>   sP   $	
;M/ ZYGP\AK