"""PyTorch DeBERTa model."""

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ... import initialization as init
from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    MaskedLMOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_deberta import DebertaConfig


logger = logging.get_logger(__name__)


class DebertaLayerNorm(nn.Module):
    """LayerNorm module (epsilon inside the square root)."""

    def __init__(self, size, eps=1e-12):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(size))
        self.bias = nn.Parameter(torch.zeros(size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_type = hidden_states.dtype
        hidden_states = hidden_states.float()
        mean = hidden_states.mean(-1, keepdim=True)
        variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
        hidden_states = (hidden_states - mean) / torch.sqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states.to(input_type)
        y = self.weight * hidden_states + self.bias
        return y


class DebertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


@torch.jit.script
def build_relative_position(query_layer, key_layer):
    """
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    """
    query_size = query_layer.size(-2)
    key_size = key_layer.size(-2)

    q_ids = torch.arange(query_size, dtype=torch.long, device=query_layer.device)
    k_ids = torch.arange(key_size, dtype=torch.long, device=key_layer.device)
    rel_pos_ids = q_ids[:, None] - k_ids.view(1, -1).repeat(query_size, 1)
    rel_pos_ids = rel_pos_ids[:query_size, :]
    rel_pos_ids = rel_pos_ids.unsqueeze(0)
    return rel_pos_ids


@torch.jit.script
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
    return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])


@torch.jit.script
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
    return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])


@torch.jit.script
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
    return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))


@torch.jit.script
def scaled_size_sqrt(query_layer: torch.Tensor, scale_factor: int):
    return torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor)


@torch.jit.script
def build_rpos(query_layer: torch.Tensor, key_layer: torch.Tensor, relative_pos: torch.Tensor):
    if query_layer.size(-2) != key_layer.size(-2):
        return build_relative_position(query_layer, key_layer)
    else:
        return relative_pos


@torch.jit.script
def compute_attention_span(query_layer: torch.Tensor, key_layer: torch.Tensor, max_relative_positions: int):
    return torch.tensor(min(max(query_layer.size(-2), key_layer.size(-2)), max_relative_positions))


@torch.jit.script
def uneven_size_corrected(p2c_att, query_layer: torch.Tensor, key_layer: torch.Tensor, relative_pos):
    if query_layer.size(-2) != key_layer.size(-2):
        pos_index = relative_pos[:, :, :, 0].unsqueeze(-1)
        return torch.gather(p2c_att, dim=2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer))
    else:
        return p2c_att


class DisentangledSelfAttention(nn.Module):
    """
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    """

    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.in_proj = nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False)
        self.q_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
        self.v_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
        self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else []

        self.relative_attention = getattr(config, "relative_attention", False)
        self.talking_head = getattr(config, "talking_head", False)

        if self.talking_head:
            self.head_logits_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
            self.head_weights_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
        else:
            self.head_logits_proj = None
            self.head_weights_proj = None

        if self.relative_attention:
            self.max_relative_positions = getattr(config, "max_relative_positions", -1)
            if self.max_relative_positions < 1:
                self.max_relative_positions = config.max_position_embeddings
            self.pos_dropout = nn.Dropout(config.hidden_dropout_prob)

            if "c2p" in self.pos_att_type:
                self.pos_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
            if "p2c" in self.pos_att_type:
                self.pos_q_proj = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: bool = False,
        query_states: torch.Tensor | None = None,
        relative_pos: torch.Tensor | None = None,
        rel_embeddings: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        r"""
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].

        """
        if query_states is None:
            qp = self.in_proj(hidden_states)  # .split(self.all_head_size, dim=-1)
            query_layer, key_layer, value_layer = self.transpose_for_scores(qp).chunk(3, dim=-1)
        else:
            ws = self.in_proj.weight.chunk(self.num_attention_heads * 3, dim=0)
            qkvw = [torch.cat([ws[i * 3 + k] for i in range(self.num_attention_heads)], dim=0) for k in range(3)]
            q = torch.matmul(qkvw[0], query_states.t().to(dtype=qkvw[0].dtype))
            k = torch.matmul(qkvw[1], hidden_states.t().to(dtype=qkvw[1].dtype))
            v = torch.matmul(qkvw[2], hidden_states.t().to(dtype=qkvw[2].dtype))
            query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q, k, v]]

        query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :])
        value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :])

        rel_att = None
        # Take the dot product between "query" and "key" to get the raw attention scores.
        scale_factor = 1 + len(self.pos_att_type)
        scale = scaled_size_sqrt(query_layer, scale_factor)
        query_layer = query_layer / scale.to(dtype=query_layer.dtype)
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        if self.relative_attention and rel_embeddings is not None and relative_pos is not None:
            rel_embeddings = self.pos_dropout(rel_embeddings)
            rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)

        if rel_att is not None:
            attention_scores = attention_scores + rel_att

        # bxhxlxd
        if self.head_logits_proj is not None:
            attention_scores = self.head_logits_proj(attention_scores.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

        attention_mask = attention_mask.bool()
        attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min)
        # bsz x height x length x dimension
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)
        if self.head_weights_proj is not None:
            attention_probs = self.head_weights_proj(attention_probs.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (-1,)
        context_layer = context_layer.view(new_context_layer_shape)
        if not output_attentions:
            return (context_layer, None)
        return (context_layer, attention_probs)

    def disentangled_att_bias(
        self,
        query_layer: torch.Tensor,
        key_layer: torch.Tensor,
        relative_pos: torch.Tensor,
        rel_embeddings: torch.Tensor,
        scale_factor: int,
    ):
        if relative_pos is None:
            relative_pos = build_relative_position(query_layer, key_layer)
        if relative_pos.dim() == 2:
            relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
        elif relative_pos.dim() == 3:
            relative_pos = relative_pos.unsqueeze(1)
        # bxhxqxk
        elif relative_pos.dim() != 4:
            raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")

        att_span = compute_attention_span(query_layer, key_layer, self.max_relative_positions)
        relative_pos = relative_pos.long()
        rel_embeddings = rel_embeddings[
            self.max_relative_positions - att_span : self.max_relative_positions + att_span, :
        ].unsqueeze(0)

        score = 0

        # content->position
        if "c2p" in self.pos_att_type:
            pos_key_layer = self.pos_proj(rel_embeddings)
            pos_key_layer = self.transpose_for_scores(pos_key_layer)
            c2p_att = torch.matmul(query_layer, pos_key_layer.transpose(-1, -2))
            c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1)
            c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_dynamic_expand(c2p_pos, query_layer, relative_pos))
            score += c2p_att

        # position->content
        if "p2c" in self.pos_att_type:
            pos_query_layer = self.pos_q_proj(rel_embeddings)
            pos_query_layer = self.transpose_for_scores(pos_query_layer)
            pos_query_layer /= scaled_size_sqrt(pos_query_layer, scale_factor)
            r_pos = build_rpos(query_layer, key_layer, relative_pos)
            p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1)
            p2c_att = torch.matmul(key_layer, pos_query_layer.transpose(-1, -2).to(dtype=key_layer.dtype))
            p2c_att = torch.gather(
                p2c_att, dim=-1, index=p2c_dynamic_expand(p2c_pos, query_layer, key_layer)
            ).transpose(-1, -2)
            p2c_att = uneven_size_corrected(p2c_att, query_layer, key_layer, relative_pos)
            score += p2c_att

        return score


class DebertaEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        pad_token_id = getattr(config, "pad_token_id", 0)
        self.embedding_size = getattr(config, "embedding_size", config.hidden_size)
        self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id)

        self.position_biased_input = getattr(config, "position_biased_input", True)
        if not self.position_biased_input:
            self.position_embeddings = None
        else:
            self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size)

        if config.type_vocab_size > 0:
            self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size)
        else:
            self.token_type_embeddings = None

        if self.embedding_size != config.hidden_size:
            self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
        else:
            self.embed_proj = None
        self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if self.position_embeddings is not None:
            position_embeddings = self.position_embeddings(position_ids.long())
        else:
            position_embeddings = torch.zeros_like(inputs_embeds)

        embeddings = inputs_embeds
        if self.position_biased_input:
            embeddings = embeddings + position_embeddings
        if self.token_type_embeddings is not None:
            token_type_embeddings = self.token_type_embeddings(token_type_ids)
            embeddings = embeddings + token_type_embeddings

        if self.embed_proj is not None:
            embeddings = self.embed_proj(embeddings)

        embeddings = self.LayerNorm(embeddings)

        if mask is not None:
            if mask.dim() != embeddings.dim():
                if mask.dim() == 4:
                    mask = mask.squeeze(1).squeeze(1)
                mask = mask.unsqueeze(2)
            mask = mask.to(embeddings.dtype)

            embeddings = embeddings * mask

        embeddings = self.dropout(embeddings)
        return embeddings


class DebertaAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = DisentangledSelfAttention(config)
        self.output = DebertaSelfOutput(config)
        self.config = config

    def forward(
        self,
        hidden_states,
        attention_mask,
        output_attentions: bool = False,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        self_output, att_matrix = self.self(
            hidden_states,
            attention_mask,
            output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
        )
        if query_states is None:
            query_states = hidden_states
        attention_output = self.output(self_output, query_states)

        if output_attentions:
            return (attention_output, att_matrix)
        else:
            return (attention_output, None)


class DebertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class DebertaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class DebertaLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.attention = DebertaAttention(config)
        self.intermediate = DebertaIntermediate(config)
        self.output = DebertaOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask,
        query_states=None,
        relative_pos=None,
        rel_embeddings=None,
        output_attentions: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        attention_output, att_matrix = self.attention(
            hidden_states,
            attention_mask,
            output_attentions=output_attentions,
            query_states=query_states,
            relative_pos=relative_pos,
            rel_embeddings=rel_embeddings,
        )
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        if output_attentions:
            return (layer_output, att_matrix)
        else:
            return (layer_output, None)


class DebertaEncoder(nn.Module):
    """Modified BertEncoder with relative position bias support"""

    def __init__(self, config):
        super().__init__()
        self.layer = nn.ModuleList([DebertaLayer(config) for _ in range(config.num_hidden_layers)])
        self.relative_attention = getattr(config, "relative_attention", False)
        if self.relative_attention:
            self.max_relative_positions = getattr(config, "max_relative_positions", -1)
            if self.max_relative_positions < 1:
                self.max_relative_positions = config.max_position_embeddings
            self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size)
        self.gradient_checkpointing = False

    def get_rel_embedding(self):
        rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
        return rel_embeddings

    def get_attention_mask(self, attention_mask):
        if attention_mask.dim() <= 2:
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1)
        elif attention_mask.dim() == 3:
            attention_mask = attention_mask.unsqueeze(1)

        return attention_mask

    def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
        if self.relative_attention and relative_pos is None:
            if query_states is not None:
                relative_pos = build_relative_position(query_states, hidden_states)
            else:
                relative_pos = build_relative_position(hidden_states, hidden_states)
        return relative_pos

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_hidden_states: bool = True,
        output_attentions: bool = False,
        query_states=None,
        relative_pos=None,
        return_dict: bool = True,
    ):
        attention_mask = self.get_attention_mask(attention_mask)
        relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)

        all_hidden_states = (hidden_states,) if output_hidden_states else None
        all_attentions = () if output_attentions else None

        next_kv = hidden_states

        rel_embeddings = self.get_rel_embedding()
        for i, layer_module in enumerate(self.layer):
            hidden_states, att_m = layer_module(
                next_kv,
                attention_mask,
                query_states=query_states,
                relative_pos=relative_pos,
                rel_embeddings=rel_embeddings,
                output_attentions=output_attentions,
            )

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if query_states is not None:
                query_states = hidden_states
            else:
                next_kv = hidden_states

            if output_attentions:
                all_attentions = all_attentions + (att_m,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )


@auto_docstring
class DebertaPreTrainedModel(PreTrainedModel):
    config: DebertaConfig
    base_model_prefix = "deberta"
    _keys_to_ignore_on_load_unexpected = ["position_embeddings"]
    supports_gradient_checkpointing = True

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights."""
        super()._init_weights(module)
        if isinstance(module, DisentangledSelfAttention):
            init.zeros_(module.q_bias)
            init.zeros_(module.v_bias)
        elif isinstance(module, (LegacyDebertaLMPredictionHead, DebertaLMPredictionHead)):
            init.zeros_(module.bias)
        elif isinstance(module, DebertaEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
ejdB dejdB dejdB de	dB de	dB de	dB de
eB fddZ  ZS )DebertaModelc                    s8   t  | t|| _t|| _d| _|| _|   d S Nr   )	r   r   r   r   r   encoderz_stepsrF   	post_initrE   r#   r%   r&   r   v  s   

zDebertaModel.__init__c                 C      | j jS r   r   r   r   r%   r%   r&   get_input_embeddings  s   z!DebertaModel.get_input_embeddingsc                 C   s   || j _d S r   r#  r    new_embeddingsr%   r%   r&   set_input_embeddings  s   z!DebertaModel.set_input_embeddingsNr   r   r   r   r   r   r  r  r   c	              	      s  |d ur|n j j}|d ur|n j j}|d ur|n j j}|d ur*|d ur*td|d ur9 || | }
n|d urF| d d }
ntd|d urQ|jn|j}|d u r_tj	|
|d}|d u rltj
|
tj|d} j|||||d} j||d||d}|d	 } jd	kr|d
 } fddt jD }|d } j } j|} j|}|d	d  D ]}|||d|||d}|| q|d }|s|f||rd	ndd   S t||r|jnd |jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer'   z5You have to specify either input_ids or inputs_embeds)rO   rN   )r   r   r   r   r   T)r  r   r  r   rM   c                    s   g | ]} j jd  qS r   )r  r   r   r   r%   r&   r     s    z(DebertaModel.forward.<locals>.<listcomp>Fr   r)   r  )rF   r   r  use_return_dictr~   %warn_if_padding_and_no_attention_maskr!   rO   r   r   r   rQ   r   r  r   r   r   r   r  appendr
   r0   r  )r    r   r   r   r   r   r   r  r  kwargsr   rO   embedding_outputencoder_outputsencoded_layersr0   layersr   r   rel_posr   sequence_outputr%   r   r&   r4     sr   


zDebertaModel.forward)NNNNNNNN)r6   r7   r8   r   r$  r'  r   r   r   r   r   r
   r4   r:   r%   r%   r#   r&   r  t  s@    
	r  c                       r;   )$LegacyDebertaPredictionHeadTransformc                    sf   t    t|d|j| _t|j| j| _t|j	t
r#t|j	 | _n|j	| _tj| j|jd| _d S )Nr   )r"   )r   r   r   r>   r   r   r=   r?   r   r   r   r   transform_act_fnrA   r@   rE   r#   r%   r&   r     s   
z-LegacyDebertaPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S r   )r?   r3  rA   r   r%   r%   r&   r4     s   


z,LegacyDebertaPredictionHeadTransform.forwardrL   r%   r%   r#   r&   r2        r2  c                       r;   )r  c                    sR   t    t|| _t|d|j| _tj| j|j	dd| _
tt|j	| _d S )Nr   Trx   )r   r   r2  	transformr   r>   r   r   r=   r   decoderr   r   r   r   rE   r#   r%   r&   r     s
   

z&LegacyDebertaLMPredictionHead.__init__c                 C   r   r   )r5  r6  r   r%   r%   r&   r4     r   z%LegacyDebertaLMPredictionHead.forwardrL   r%   r%   r#   r&   r    r4  r  c                       r   )LegacyDebertaOnlyMLMHeadc                       t    t|| _d S r   )r   r   r  predictionsrE   r#   r%   r&   r         
z!LegacyDebertaOnlyMLMHead.__init__r1  r   c                 C   s   |  |}|S r   )r9  )r    r1  prediction_scoresr%   r%   r&   r4     s   
z LegacyDebertaOnlyMLMHead.forwardr   r%   r%   r#   r&   r7    s    r7  c                       s(   e Zd ZdZ fddZdd Z  ZS )r  zMhttps://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270c                    sl   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jdd| _tt|j| _d S )NT)r"   elementwise_affine)r   r   r   r=   r>   r?   r   r   r   r   r3  rA   r@   r   r   r   r   r   rE   r#   r%   r&   r     s   
z DebertaLMPredictionHead.__init__c                 C   s:   |  |}| |}| |}t||j | j }|S r   )r?   r3  rA   r   r   r   r   r   )r    r0   r   r%   r%   r&   r4     s   

zDebertaLMPredictionHead.forwardr5   r%   r%   r#   r&   r  	  s    r  c                       r;   )DebertaOnlyMLMHeadc                    r8  r   )r   r   r  lm_headrE   r#   r%   r&   r   %  r:  zDebertaOnlyMLMHead.__init__c                 C   s   |  ||}|S r   )r>  )r    r1  r   r;  r%   r%   r&   r4   *  s   zDebertaOnlyMLMHead.forwardrL   r%   r%   r#   r&   r=  $  s    r=  c                       s   e Zd ZdddZ fddZdd Zdd	 Ze	
	
	
	
	
	
	
	
	
ddej	d
B dej	d
B dej	d
B dej	d
B dej	d
B dej	d
B de
d
B de
d
B de
d
B deeB fddZ  ZS )DebertaForMaskedLMzcls.predictions.bias)deberta.embeddings.word_embeddings.weight)zcls.predictions.decoder.biaszcls.predictions.decoder.weightc                    sP   t  | |j| _t|| _| jrt|| _n
ddi| _t|| _	| 
  d S )Nzlm_predictions.lm_head.weightr@  )r   r   legacyr  r  r7  cls_tied_weights_keysr=  lm_predictionsr!  rE   r#   r%   r&   r   6  s   

zDebertaForMaskedLM.__init__c                 C   s   | j r| jjjS | jjjS r   )rA  rB  r9  r6  rD  r>  r?   r   r%   r%   r&   get_output_embeddingsE  s   

z(DebertaForMaskedLM.get_output_embeddingsc                 C   s:   | j r|| jj_|j| jj_d S || jj_|j| jj_d S r   )rA  rB  r9  r6  r   rD  r>  r?   r%  r%   r%   r&   set_output_embeddingsK  s
   

z(DebertaForMaskedLM.set_output_embeddingsNr   r   r   r   r   labelsr   r  r  r   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| jr$| |}n	| || jjj}d}|durDt }||	d| j j
|	d}|	sZ|f|dd  }|durX|f| S |S t|||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r   r   r   r   r  r  r   r'   r   losslogitsr0   r  )rF   r(  r  rA  rB  rD  r   r   r   rR   r   r   r0   r  )r    r   r   r   r   r   rG  r   r  r  r+  outputsr1  r;  masked_lm_lossloss_fctr   r%   r%   r&   r4   S  s8   zDebertaForMaskedLM.forward	NNNNNNNNN)r6   r7   r8   rC  r   rE  rF  r   r   r   r   r   r   r4   r:   r%   r%   r#   r&   r?  /  sL    	
r?  c                       s0   e Zd Z fddZdd Zedd Z  ZS )ContextPoolerc                    s4   t    t|j|j| _t|j| _|| _	d S r   )
r   r   r   r=   pooler_hidden_sizer?   rB   pooler_dropoutrD   rF   rE   r#   r%   r&   r     s   

zContextPooler.__init__c                 C   s8   |d d df }|  |}| |}t| jj |}|S r  )rD   r?   r   rF   pooler_hidden_act)r    r0   context_tokenpooled_outputr%   r%   r&   r4     s
   

zContextPooler.forwardc                 C   r"  r   )rF   r>   r   r%   r%   r&   
output_dim  s   zContextPooler.output_dim)r6   r7   r8   r   r4   propertyrV  r:   r%   r%   r#   r&   rP    s
    
rP  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                       s   e Zd Z fddZdd Zdd Ze									ddejdB d	ejdB d
ejdB dejdB dejdB dejdB de	dB de	dB de	dB de
eB fddZ  ZS ) DebertaForSequenceClassificationc                    s   t  | t|dd}|| _t|| _t|| _| jj}t	
||| _t|dd }|d u r2| jjn|}t	|| _|   d S )N
num_labelsr)   cls_dropout)r   r   r   rZ  r  r  rP  poolerrV  r   r=   
classifierrF   rC   rB   rD   r!  )r    rF   rZ  rV  drop_outr#   r%   r&   r     s   

z)DebertaForSequenceClassification.__init__c                 C   s
   | j  S r   )r  r$  r   r%   r%   r&   r$    s   
z5DebertaForSequenceClassification.get_input_embeddingsc                 C   s   | j | d S r   )r  r'  r%  r%   r%   r&   r'    s   z5DebertaForSequenceClassification.set_input_embeddingsNr   r   r   r   r   rG  r   r  r  r   c
              
   K   s:  |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}| |}d}|dur| j jdu r| jdkrQt	 }|
d|j}|||
d}n| dks^|ddkr|dk }| }|ddkrt|d||d|d}t|d|
d}t }||
d| j |
d}n^td|}nUtd}||| d  }nC| j jdkrt	 }| jdkr|| | }n+|||}n%| j jdkrt }||
d| j|
d}n| j jdkrt }|||}|	s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r   r   r  r  r   r   r'   
regressionsingle_label_classificationmulti_label_classificationrI  )rF   r(  r  r\  rD   r]  problem_typerZ  r   r   rR   r/   r*   rr   r!   nonzerorQ   r   rt   r^   r   r+   ri   
LogSoftmaxsumr,   r   r   r   r0   r  )r    r   r   r   r   r   rG  r   r  r  r+  rL  encoder_layerrU  rK  rJ  loss_fnlabel_indexlabeled_logitsrN  log_softmaxr   r%   r%   r&   r4     sh   



 


z(DebertaForSequenceClassification.forwardrO  )r6   r7   r8   r   r$  r'  r   r   r   r   r   r   r4   r:   r%   r%   r#   r&   rY    sF    	
rY  c                       s   e Zd Z fddZe									ddejdB dejdB dejdB dejdB dejdB d	ejdB d
edB dedB dedB dee	B fddZ
  ZS )DebertaForTokenClassificationc                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S r   )r   r   rZ  r  r  r   rB   rC   rD   r=   r>   r]  r!  rE   r#   r%   r&   r     s   
z&DebertaForTokenClassification.__init__Nr   r   r   r   r   rG  r   r  r  r   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|dur;t }||d| j|d}|	sQ|f|dd  }|durO|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )


@auto_docstring
class DebertaForQuestionAnswering(DebertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.deberta = DebertaModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        start_positions: torch.Tensor | None = None,
        end_positions: torch.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | QuestionAnsweringModelOutput:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "DebertaForMaskedLM",
    "DebertaForQuestionAnswering",
    "DebertaForSequenceClassification",
    "DebertaForTokenClassification",
    "DebertaModel",
    "DebertaPreTrainedModel",
]
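

# Minimal usage sketch (illustrative; assumes the public `microsoft/deberta-base` checkpoint is
# available and that shapes are shown for that base configuration):
#
#   >>> from transformers import AutoTokenizer, DebertaModel
#   >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
#   >>> model = DebertaModel.from_pretrained("microsoft/deberta-base")
#   >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#   >>> outputs = model(**inputs)
#   >>> outputs.last_hidden_state.shape   # [batch, seq_len, hidden_size]
#   torch.Size([1, 8, 768])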