o
    ߥi~                     @   sT  d Z ddlmZ ddlmZmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! e" ZG dd de	j#Z$G dd dej%j&Z'G dd de(Z)dd Z*G dd dej%j&Z+G dd de	j#Z,G dd de	j#Z-G d d! d!e	j#Z.G d"d# d#e	j#Z/G d$d% d%e	j#Z0G d&d' d'e	j#Z1G d(d) d)e	j#Z2G d*d+ d+e	j#Z3d,d- Z4	.	.d@d/d0Z5ej6j7d1d2 Z8ej6j7d3d4 Z9ej6j7d5d6 Z:G d7d8 d8e	j#Z;G d9d: d:e	j#Z<G d;d< d<eeZ=ej>ej?ej@d=G d>d? d?e=ZAdS )Az PyTorch DeBERTa-v2 model.    )Sequence)OptionalTupleUnionN)nn)	LayerNorm)ACT2FN)PreTrainedModel)softmax_backward_data)Models)Model
TorchModel)MODELS)AttentionBackboneModelOutput)logger)Tasks   )DebertaV2Configc                       s0   e Zd Z fddZdd Zedd Z  ZS )ContextPoolerc                    s2   t    t|j|j| _t|j| _|| _	d S N)
super__init__r   Linearpooler_hidden_sizedenseStableDropoutpooler_dropoutdropoutconfigselfr   	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/deberta_v2/backbone.pyr   *   s   

zContextPooler.__init__c                 C   s8   |d d df }|  |}| |}t| jj |}|S Nr   )r   r   r   r   pooler_hidden_act)r    hidden_statescontext_tokenpooled_outputr#   r#   r$   forward1   s
   

zContextPooler.forwardc                 C      | j jS r   )r   hidden_sizer    r#   r#   r$   
output_dim;   s   zContextPooler.output_dim)__name__
__module____qualname__r   r*   propertyr.   __classcell__r#   r#   r!   r$   r   (   s
    
r   c                   @   s4   e Zd ZdZedd Zedd Zedd ZdS )	XSoftmaxa  
    Masked Softmax which is optimized for saving memory

    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
        mask (`torch.IntTensor`):
            The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax

    Example:

    >>> import torch
    >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax

    >>> # Make a tensor
    >>> x = torch.randn([4, 20, 100])

    >>> # Create a mask
    >>> mask = (x > 0).int()

    >>> # Specify the dimension to apply softmax
    >>> dim = -1

    >>> y = XSoftmax.apply(x, mask, dim)
    c                 C   sX   || _ |tj }||tt|jj}t	|| j }|
|d | | |S r%   )dimtotorchboolmasked_filltensorfinfodtypeminsoftmaxmasked_fill_save_for_backward)r    inputmaskr5   rmaskoutputr#   r#   r$   r*   \   s   
zXSoftmax.forwardc                 C   s$   | j \}t| ||| j|}|d d fS r   )saved_tensorsr
   r5   )r    grad_outputrD   	inputGradr#   r#   r$   backwardh   s
   
zXSoftmax.backwardc           
      C   s   dd l m  m} ddlm}m} | jd||jd d}| jd| d| jdtj	dtj
d	d
||jd d}|| ||| jdt	t|  jd
}	|| |	|}	|| |	|| jdtj	dtjd	d
S )Nr   )r9   r>   CastLong)to_iSubConstantr   r<   )value_tByte)torch.onnx.symbolic_helperonnxsymbolic_helpertorch.onnx.symbolic_opset9r9   r>   opcast_pytorch_to_onnxr7   r:   int64r;   typer<   r=   uint8)
gr    rB   r5   sym_helpr9   r>   mask_cast_valuer_maskrD   r#   r#   r$   symbolico   s2   zXSoftmax.symbolicN)r/   r0   r1   __doc__staticmethodr*   rH   r^   r#   r#   r#   r$   r4   A   s    

r4   c                   @   s   e Zd Zdd ZdS )DropoutContextc                 C   s   d| _ d | _d| _d| _d S )Nr   r   T)r   rB   scale
reuse_maskr-   r#   r#   r$   r      s   
zDropoutContext.__init__N)r/   r0   r1   r   r#   r#   r#   r$   ra      s    ra   c                 C   s   t |ts
|}d }n|j}||j9 }|jr|jnd }|dkr2|d u r2dt| d|  	tj
}t |tr?|jd u r?||_||fS )Nr   r   )
isinstancera   r   rb   rc   rB   r7   
empty_like
bernoulli_r6   r8   )rA   local_contextr   rB   r#   r#   r$   get_mask   s   



rh   c                	   @   sZ   e Zd ZdZedd Zedd Zedejj	dejj
deeef d	ejj
fd
dZdS )XDropoutzlOptimized dropout function to save computation and memory by using mask operation instead of multiplication.c                 C   sD   t ||\}}dd|  | _|dkr | | ||d| j S |S )Ng      ?r   r   )rh   rb   r@   r9   )ctxrA   	local_ctxrB   r   r#   r#   r$   r*      s   
zXDropout.forwardc                 C   s0   | j dkr| j\}||d| j  d fS |d fS )Nr   r   )rb   rE   r9   )rj   rF   rB   r#   r#   r$   rH      s   
zXDropout.backwardrZ   rA   rk   returnc                 C   s4   ddl m} |}t|tr|j}d}|| |||S )Nr   )symbolic_opset12T)
torch.onnxrm   rd   ra   r   )rZ   rA   rk   rm   	dropout_ptrainr#   r#   r$   r^      s   
zXDropout.symbolicN)r/   r0   r1   r_   r`   r*   rH   r7   _CGraphValuer   floatra   r^   r#   r#   r#   r$   ri      s    
	

ri   c                       sB   e Zd ZdZ fddZdd Zdd Zdd
dZdd Z  Z	S )r   z
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    c                    s    t    || _d| _d | _d S r%   )r   r   	drop_probcountcontext_stack)r    ru   r!   r#   r$   r      s   

zStableDropout.__init__c                 C   s$   | j r| jdkrt||  S |S )zr
        Call the module

        Args:
            x (`torch.tensor`): The input tensor to apply dropout
        r   )trainingru   ri   applyget_context)r    xr#   r#   r$   r*      s   zStableDropout.forwardc                 C   s   d| _ d | _d S r%   )rv   rw   r-   r#   r#   r$   clear_context   s   
zStableDropout.clear_contextTr   c                 C   s2   | j d u rg | _ d| _| j D ]}||_||_qd S r%   )rw   rv   rc   rb   )r    rc   rb   cr#   r#   r$   init_context   s   

zStableDropout.init_contextc                 C   sT   | j d ur'| jt| j kr| j t  | j | j }| j|_|  jd7  _|S | jS )Nr   )rw   rv   lenappendra   ru   r   )r    rj   r#   r#   r$   rz      s   
zStableDropout.get_context)Tr   )
r/   r0   r1   r_   r   r*   r|   r~   rz   r3   r#   r#   r!   r$   r      s    
r   c                       $   e Zd Z fddZdd Z  ZS )DebertaV2SelfOutputc                    s<   t    t|j|j| _t|j|j| _t|j	| _
d S r   )r   r   r   r   r,   r   r   layer_norm_epsr   hidden_dropout_probr   r   r!   r#   r$   r     s   
zDebertaV2SelfOutput.__init__c                 C   &   |  |}| |}| || }|S r   r   r   r   r    r'   input_tensorr#   r#   r$   r*        

zDebertaV2SelfOutput.forwardr/   r0   r1   r   r*   r3   r#   r#   r!   r$   r     s    r   c                       s.   e Zd Z fddZ				dddZ  ZS )DebertaV2Attentionc                    s(   t    t|| _t|| _|| _d S r   )r   r   DisentangledSelfAttentionr    r   rD   r   r   r!   r#   r$   r     s   



zDebertaV2Attention.__init__FNc           
      C   sJ   | j ||||||d}|r|\}}|d u r|}| ||}	|r#|	|fS |	S )N)query_statesrelative_posrel_embeddings)r    rD   )
r    r'   attention_maskoutput_attentionsr   r   r   self_output
att_matrixattention_outputr#   r#   r$   r*     s    	zDebertaV2Attention.forwardFNNNr   r#   r#   r!   r$   r         
r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )DebertaV2Intermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r   r   r,   intermediate_sizer   rd   
hidden_actstrr   intermediate_act_fnr   r!   r#   r$   r   ;  s
   
zDebertaV2Intermediate.__init__r'   rl   c                 C   s   |  |}| |}|S r   )r   r   )r    r'   r#   r#   r$   r*   C  s   

zDebertaV2Intermediate.forward)r/   r0   r1   r   r7   Tensorr*   r3   r#   r#   r!   r$   r   9  s    r   c                       r   )DebertaV2Outputc                    sB   t    t|j|j| _t|j|j| _t	|j
| _|| _d S r   )r   r   r   r   r   r,   r   r   r   r   r   r   r   r   r!   r#   r$   r   L  s
   

zDebertaV2Output.__init__c                 C   r   r   r   r   r#   r#   r$   r*   S  r   zDebertaV2Output.forwardr   r#   r#   r!   r$   r   J  s    r   c                       s.   e Zd Z fddZ				dddZ  ZS )DebertaV2Layerc                    s,   t    t|| _t|| _t|| _d S r   )r   r   r   	attentionr   intermediater   rD   r   r!   r#   r$   r   ]  s   


zDebertaV2Layer.__init__NFc                 C   sH   | j ||||||d}|r|\}}| |}	| |	|}
|r"|
|fS |
S )Nr   r   r   r   )r   r   rD   )r    r'   r   r   r   r   r   r   r   intermediate_outputlayer_outputr#   r#   r$   r*   c  s   	
zDebertaV2Layer.forward)NNNFr   r#   r#   r!   r$   r   [  r   r   c                       r   )	ConvLayerc                    sx   t    t|dd}t|dd}t|dd| _tj|j|j||d d |d| _t|j|j	| _t
|j| _|| _d S )	Nconv_kernel_size   conv_groupsr   conv_acttanh   )paddinggroups)r   r   getattrr   r   Conv1dr,   convr   r   r   r   r   r   )r    r   kernel_sizer   r!   r#   r$   r     s   


zConvLayer.__init__c           	      C   s   |  |ddd ddd }d|  }||d| d t| j	 | 
|}|| }| ||}|d u rE|}|S | | kr`| dkr[|dd}|d}||j}|| }|S )Nr   r   r      )r   permute
contiguousr8   r?   	unsqueezeexpandsizer   r   r   r   r6   r5   squeezer<   )	r    r'   residual_states
input_maskoutrC   layer_norm_inputrD   output_statesr#   r#   r$   r*     s(   

zConvLayer.forwardr   r#   r#   r!   r$   r   ~  s    r   c                       sN   e Zd ZdZ fddZdd Zdd Zdd	d
Z					dddZ  Z	S )DebertaV2Encoderz8Modified BertEncoder with relative position bias supportc                    s  t    t fddt jD | _t dd| _| jrMt dd| _	| j	dk r/ j
| _	t dd| _| j	d	 }| jd
krE| jd	 }t| j| _dd t dd dD | _d| jv rmt j jdd| _t dd
d
kryt nd | _d| _d S )Nc                    s   g | ]}t  qS r#   )r   .0_r   r#   r$   
<listcomp>  s    z-DebertaV2Encoder.__init__.<locals>.<listcomp>relative_attentionFmax_relative_positionsr   r   position_bucketsr   r   c                 S   s   g | ]}|  qS r#   )strip)r   r{   r#   r#   r$   r     s    norm_rel_ebdnone|
layer_normT)elementwise_affiner   )r   r   r   
ModuleListrangenum_hidden_layerslayerr   r   r   max_position_embeddingsr   	Embeddingr,   r   lowersplitr   r   r   r   r   gradient_checkpointing)r    r   pos_ebd_sizer!   r   r$   r     sF   






zDebertaV2Encoder.__init__c                 C   s2   | j r| jjnd }|d urd| jv r| |}|S )Nr   )r   r   weightr   r   )r    r   r#   r#   r$   get_rel_embedding  s   
z"DebertaV2Encoder.get_rel_embeddingc                 C   sV   |  dkr|dd}||dd }| }|S |  dkr)|d}|S )Nr   r   r   r   )r5   r   r   byte)r    r   extended_attention_maskr#   r#   r$   get_attention_mask  s   
z#DebertaV2Encoder.get_attention_maskNc                 C   sH   | j r"|d u r"|d ur|dn|d}t||d| j| jd}|S )Nr   bucket_sizemax_position)r   r   build_relative_positionr   r   )r    r'   r   r   qr#   r#   r$   get_rel_pos  s   zDebertaV2Encoder.get_rel_posTFc              	      s  |  dkr	|}n	|ddk }| |}| |||}|r"dnd }	 r(dnd }
t|tr4|d }n|}|  }|}t| j	D ]l\}}|rL|	|f }	| j
rg| jrg fdd}tjj|||||||}n
|||||| d} rw|\}}|dkr| jd ur| |||}|d ur|}t|tr|d t| j	k r||d  nd }n|} r|
|f }
qA|r|	|f }	|std	d
 ||	|
fD S t||	|
dS )Nr   r   r   r#   c                    s    fdd}|S )Nc                     s    g | R  S r   r#   )inputs)moduler   r#   r$   custom_forward  s   zODebertaV2Encoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr#   )r   r   r   )r   r$   create_custom_forward  s   z7DebertaV2Encoder.forward.<locals>.create_custom_forward)r   r   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r#   )r   vr#   r#   r$   	<genexpr><  s    z+DebertaV2Encoder.forward.<locals>.<genexpr>last_hidden_stater'   
attentions)r5   sumr   r   r   rd   r   r   	enumerater   r   rx   r7   utils
checkpointr   r   tupler   )r    r'   r   output_hidden_statesr   r   r   return_dictr   all_hidden_statesall_attentionsnext_kvr   r   ilayer_moduler   att_mr#   r   r$   r*     s   




		


zDebertaV2Encoder.forward)NN)TFNNT)
r/   r0   r1   r_   r   r   r   r   r*   r3   r#   r#   r!   r$   r     s    %
r   c              	   C   s   t | }|d }t | |k | | k@ t |d | t | }t t || t t |d |  |d  | }t ||k| ||| }|S )Nr   r   )r7   signwherer:   type_asabsceillog)r   r   r   r   midabs_poslog_pos
bucket_posr#   r#   r$   make_log_bucket_positionE  s.   

r  r   c                 C   s   t d| }t d|}|dddf |dddf  }|dkr*|dkr*t|||}|t j}|d| ddf }|d}|S )af  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key
        bucket_size (int): the size of position bucket
        max_position (int): the maximum allowed absolute position

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    r   N)r7   aranger  r6   longr   )
query_sizekey_sizer   r   q_idsk_idsrel_pos_idsr#   r#   r$   r   W  s    
r   c                 C   s*   |  |d|d|d|dgS )Nr   r   r   r   r   r   )c2p_posquery_layerr   r#   r#   r$   c2p_dynamic_expandx     r  c                 C   s*   |  |d|d|d|dgS )Nr   r   r   r  )r  r  	key_layerr#   r#   r$   p2c_dynamic_expand  r  r  c                 C   s*   |  | d d | d|df S )Nr   r   r  )	pos_indexp2c_attr  r#   r#   r$   pos_dynamic_expand  s   r  c                       sB   e Zd ZdZ fddZdd Z				ddd	Zd
d Z  ZS )r   a  
    Disentangled self-attention module

    Parameters:
        config (`DebertaV2Config`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaV2Config`]

    c                    s  t    |j|j dkrtd|j d|j d|j| _|j|j }t|d|| _| j| j | _tj	|j| jdd| _
tj	|j| jdd| _tj	|j| jdd| _t|dd	| _|jd urb|jng | _t|d
d	| _| jrt|dd| _t|dd| _| jdk r|j| _| j| _| jdkr| j| _t|j| _| jsd| jv rtj	|j| jdd| _d| jv rt	|j| j| _t|j| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()attention_head_sizeTbiasshare_att_keyFr   r   r   r   r   c2pp2c)r   r   r,   num_attention_heads
ValueErrorr   r  all_head_sizer   r   
query_projkey_proj
value_projr  pos_att_typer   r   r   r   r   r   r   pos_dropoutpos_key_projpos_query_projattention_probs_dropout_probr   )r    r   _attention_head_sizer!   r#   r$   r     s\   









z"DisentangledSelfAttention.__init__c                 C   sL   |  d d |df }||}|dddd d| d| dS )Nr   r   r   r   r   )r   viewr   r   )r    r{   attention_headsnew_x_shaper#   r#   r$   transpose_for_scores  s
   
z.DisentangledSelfAttention.transpose_for_scoresFNc              	   C   s  |du r|}|  | || j}|  | || j}|  | || j}	d}
d}d| jv r1|d7 }d| jv r:|d7 }ttj|	dtj
d| }t||ddtj||jd }| jrm| |}| |||||}
|
duru||
 }|}|d| j|	d|	d}t||d}| |}t|d|	d|	d|	}|d| j|	d|	ddd	dd
 }|	 dd d }||}|r||fS |S )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.ByteTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, optional):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, optional):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr   r  r  r   rN   r   r   r   r   r   )r,  r   r  r!  r"  r#  r7   sqrtr:   r   rt   bmm	transposer<   r   r$  disentangled_attention_biasr)  r4   ry   r   r   r   )r    r'   r   r   r   r   r   r  r  value_layerrel_attscale_factorrb   attention_scoresattention_probscontext_layernew_context_layer_shaper#   r#   r$   r*     s   &







z!DisentangledSelfAttention.forwardc              
   C   s$  |d u r| d}t|| d| j| jd}| dkr%|dd}n| dkr1|d}n| dkr@td|  | j}| 	|j
}|d|d d d f d}| jr| | || j| d| j dd}| | || j| d| j dd}	n4d	| jv r| | || j| d| j dd}	d
| jv r| | || j| d| j dd}d}
d	| jv rttj|	 dtjd| }t||	dd}t|| d|d d }tj|d|d| d| d| dgd}|
|tj||jd 7 }
d
| jv rttj| dtjd| }| d| dkrLt| d| d| j| jd	|j
}|d}n|}t| | d|d d }t||dd}tj|d|d| d| d| dgddd}|
|tj||jd 7 }
|
S )Nr   r   r   r   r   r   r   z2Relative position ids must be of dim 2 or 3 or 4. r  r  r   rN   )r5   index)r   r   r   r   r5   r   r  r   r  r6   devicer  r,  r   r  repeatr!  r#  r%  r&  r7   r.  r:   rt   r/  r0  clampgatherr   r   r<   )r    r  r  r   r   r4  r   att_spanpos_query_layerpos_key_layerscorerb   c2p_attr  r_posp2c_posr  r#   r#   r$   r1  .  s   



	
	z5DisentangledSelfAttention.disentangled_attention_biasr   )	r/   r0   r1   r_   r   r,  r*   r1  r3   r#   r#   r!   r$   r     s    
,

\r   c                       s4   e Zd ZdZ fddZ					dddZ  ZS )DebertaV2EmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    t|dd}t|d|j| _tj|j| j|d| _t|dd| _	| j	s,d | _
n	t|j| j| _
|jdkrCt|j| j| _| j|jkrTtj| j|jdd| _t|j|j| _t|j| _|| _| d	t|jd
 d S )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr  position_ids)r   r   )r   r   r   r,   rG  r   r   
vocab_sizeword_embeddingsrI  position_embeddingsr   type_vocab_sizetoken_type_embeddingsr   
embed_projr   r   r   r   r   r   register_bufferr7   r  r   )r    r   rF  r!   r#   r$   r     s>   



zDebertaV2Embeddings.__init__Nc                 C   sN  |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| jd urI| | }nt|}|}	| j	rW|	|7 }	| j
jdkrf| |}
|	|
7 }	| j| j
jkrr| |	}	| |	}	|d ur| |	 kr| dkr|dd}|d}||	j}|	| }	| |	}	|	S )Nr   r   r<   r:  r   r   r   )r   rJ  r7   zerosr  r:  rL  rM  
zeros_likerI  r   rN  rO  rG  r,   rP  r   r5   r   r   r6   r<   r   )r    	input_idstoken_type_idsrJ  rB   inputs_embedsinput_shape
seq_lengthrM  
embeddingsrO  r#   r#   r$   r*     sB   








zDebertaV2Embeddings.forward)NNNNN)r/   r0   r1   r_   r   r*   r3   r#   r#   r!   r$   rE    s    !rE  c                       sZ   e Zd ZdZeZdZdgZdgZdZ	 fddZ
dd	 ZdddZe fddZ  ZS )DebertaV2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    debertarJ  rM  Tc                    s*   t  j|jfi | t t| | d S r   )r   r   name_or_pathr   r    r   kwargsr!   r#   r$   r     s   z!DebertaV2PreTrainedModel.__init__c                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rA|jjjd| jjd |jdurC|jj|j 	  dS dS dS )zInitialize the weights.g        )meanstdN)rd   r   r   r   datanormal_r   initializer_ranger  zero_r   rH  )r    r   r#   r#   r$   _init_weights  s   

z&DebertaV2PreTrainedModel._init_weightsFc                 C   s   t |tr
||_d S d S r   )rd   r   r   )r    r   valuer#   r#   r$   _set_gradient_checkpointing  s   

z4DebertaV2PreTrainedModel._set_gradient_checkpointingc                    sD   | dd }|d u rtdi |}| |}|S tt| j|d}|S )N	model_dir)pretrained_model_name_or_pathr#   )popr   r   r   from_pretrained)clsr_  ri  ponet_configmodelr!   r#   r$   _instantiate  s   z%DebertaV2PreTrainedModel._instantiate)F)r/   r0   r1   r_   r   config_classbase_model_prefix_keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr   rf  rh  classmethodrp  r3   r#   r#   r!   r$   r[    s    
r[  )module_namec                       s   e Zd ZdZ fddZdd Zdd Zdd	 Z	
	
	
	
	
	
	
	
ddee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee dee deeef fddZ  ZS )DebertaV2Modela  The bare DeBERTa_v2 Model transformer outputting raw hidden-states without any specific head on top.

    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config (`DebertaV2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration.
    c                    s8   t  | t|| _t|| _d| _|| _|   d S r%   )	r   r   rE  rZ  r   encoderz_stepsr   	post_initr^  r!   r#   r$   r   -  s   

zDebertaV2Model.__init__c                 C   r+   r   rZ  rL  r-   r#   r#   r$   get_input_embeddings7  s   z#DebertaV2Model.get_input_embeddingsc                 C   s   || j _d S r   r|  )r    new_embeddingsr#   r#   r$   set_input_embeddings:  s   z#DebertaV2Model.set_input_embeddingsc                 C   s   t d)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r    heads_to_pruner#   r#   r$   _prune_heads=  s   zDebertaV2Model._prune_headsNrU  r   rV  rJ  rW  r   r   r   rl   c	              	      s  |dur|n j j}|dur|n j j}|dur|n j j}|dur*|dur*td|dur3| }	n|dur@| dd }	ntd|durK|jn|j}
|du rYtj|	|
d}|du rftj	|	tj
|
d} j|||||d} j||d||d	}|d
 } jd
kr|d } fddt jD }|d } j } j|} j|}|d
d D ]}|||d|||d}|| q|d }|s|f||rd
ndd  S t||r|jnd|jdS )u5
  
        Args:
            input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`):
                Indices of input sequence tokens in the vocabulary.

            attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

            token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
                Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
                1]`:

                - 0 corresponds to a *sentence A* token,
                - 1 corresponds to a *sentence B* token.

            position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0,config.max_position_embeddings - 1]`.

            inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert *input_ids* indices into associated
                vectors than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a dataclass instead of a plain tuple.

        Returns:
            Returns `modelscope.outputs.AttentionBackboneModelOutput`

        Examples:
            >>> from modelscope.models import Model
            >>> from modelscope.preprocessors import Preprocessor
            >>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite', task='backbone')
            >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
            >>> print(model(**preprocessor('这是个测试')))
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r:  rR  )rU  rV  rJ  rB   rW  T)r   r   r   r   r   c                    s   g | ]} j jd  qS r-  )ry  r   r   r-   r#   r$   r     s    z*DebertaV2Model.forward.<locals>.<listcomp>Fr   r   r   )r   r   r   use_return_dictr  r   r:  r7   onesrS  r  rZ  ry  rz  r   r   r   r   r   r   r'   r   )r    rU  r   rV  rJ  rW  r   r   r   rX  r:  embedding_outputencoder_outputsencoded_layersr'   layersr   r   rel_posr   sequence_outputr#   r-   r$   r*   E  s   9


zDebertaV2Model.forward)NNNNNNNN)r/   r0   r1   r_   r   r}  r  r  r   r7   r   r8   r   r   r   r*   r3   r#   r#   r!   r$   rx    sB    

	

rx  )r   r   )Br_   collections.abcr   typingr   r   r   r7   torch.utils.checkpointr   torch.nnr   transformers.activationsr   transformers.modeling_utilsr	   transformers.pytorch_utilsr
   modelscope.metainfor   modelscope.modelsr   r   modelscope.models.builderr   modelscope.outputsr   modelscope.utilsr   loggingmodelscope.utils.constantr   configurationr   
get_loggerModuler   autogradFunctionr4   objectra   rh   ri   r   r   r   r   r   r   r   r   r  r   jitscriptr  r  r  r   rE  r[  register_modulebackbone
deberta_v2rx  r#   r#   r#   r$   <module>   s`   H
)2&#) 
!
	
	
 |Y0