o
    i                     @   s  d Z ddlmZmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ eeZG dd dejZG dd dejZ ej!j"dd Z#ej!j"dd Z$ej!j"dd Z%ej!j"dd Z&ej!j"dej'de(fddZ)ej!j"dej'dej'fdd Z*ej!j"dej'dej'd!e(fd"d#Z+ej!j"dej'dej'fd$d%Z,G d&d' d'ejZ-G d(d) d)ejZ.G d*d+ d+ejZ/G d,d- d-ejZ0G d.d/ d/ejZ1G d0d1 d1eZ2G d2d3 d3ejZ3eG d4d5 d5eZ4eG d6d7 d7e4Z5G d8d9 d9ejZ6G d:d; d;ejZ7G d<d= d=ejZ8G d>d? d?ejZ9G d@dA dAejZ:eG dBdC dCe4Z;G dDdE dEejZ<edFdGG dHdI dIe4Z=eG dJdK dKe4Z>eG dLdM dMe4Z?g dNZ@dS )OzPyTorch DeBERTa model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )DebertaConfigc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )DebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).-q=c                    s8   t    tt|| _tt|| _|| _	d S N)
super__init__r   	Parametertorchonesweightzerosbiasvariance_epsilon)selfsizeeps	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/deberta/modeling_deberta.pyr   +   s   

zDebertaLayerNorm.__init__c                 C   sj   |j }| }|jddd}|| djddd}|| t|| j  }||}| j| | j	 }|S )NT)keepdim   )
dtypefloatmeanpowr   sqrtr    tor   r   )r!   hidden_states
input_typer-   varianceyr&   r&   r'   forward1   s   
zDebertaLayerNorm.forward)r   __name__
__module____qualname____doc__r   r5   __classcell__r&   r&   r$   r'   r   (   s    r   c                       $   e Zd Z fddZdd Z  ZS )DebertaSelfOutputc                    s>   t    t|j|j| _t|j|j| _t	|j
| _d S r   )r   r   r   Linearhidden_sizedenser   layer_norm_eps	LayerNormDropouthidden_dropout_probdropoutr!   configr$   r&   r'   r   =   s   
zDebertaSelfOutput.__init__c                 C   &   |  |}| |}| || }|S r   r@   rE   rB   r!   r1   input_tensorr&   r&   r'   r5   C      

zDebertaSelfOutput.forwardr7   r8   r9   r   r5   r;   r&   r&   r$   r'   r=   <   s    r=   c                 C   s   |  d}| d}tj|tj| jd}tj|tj|jd}|dddf |dd|d }|d|ddf }|d}|S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    r+   deviceNr   r(   r   )r"   r   arangelongrP   viewrepeat	unsqueeze)query_layer	key_layer
query_sizekey_sizeq_idsk_idsrel_pos_idsr&   r&   r'   build_relative_positionJ   s   

$
r]   c                 C   s*   |  |d|d|d|dgS )Nr   r   r*   r(   expandr"   )c2p_posrV   relative_posr&   r&   r'   c2p_dynamic_expandg      *rb   c                 C   s*   |  |d|d|d|dgS )Nr   r   rN   r^   )r`   rV   rW   r&   r&   r'   p2c_dynamic_expandl   rc   rd   c                 C   s*   |  | d d | d|df S )Nr*   rN   r^   )	pos_indexp2c_attrW   r&   r&   r'   pos_dynamic_expandq   rc   rg   rV   scale_factorc                 C   s    t t j| dt jd| S )Nr(   r+   )r   r/   tensorr"   r,   )rV   rh   r&   r&   r'   scaled_size_sqrty   s    rk   rW   c                 C   s"   |  d| dkrt| |S |S NrN   )r"   r]   )rV   rW   ra   r&   r&   r'   
build_rpos~   s   
rm   max_relative_positionsc                 C   s"   t tt| d|d|S rl   )r   rj   minmaxr"   )rV   rW   rn   r&   r&   r'   compute_attention_span   s   "rq   c                 C   sR   | d| dkr'|d d d d d d df d}tj| dt|| |dS | S )NrN   r   r(   r*   dimindex)r"   rU   r   gatherrg   )rf   rV   rW   ra   re   r&   r&   r'   uneven_size_corrected   s   "rv   c                       s   e Zd ZdZ fddZdd Z				ddejd	ejd
ede	ej de	ej de	ej de
eje	ej f fddZdejdejdejdejdef
ddZ  ZS )DisentangledSelfAttentiona  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                    s  t    |j|j dkrtd|j d|j d|j| _t|j|j | _| j| j | _tj	|j| jd dd| _
ttj| jtjd| _ttj| jtjd| _|jd ur]|jng | _t|d	d| _t|d
d| _| jrtj	|j|jdd| _tj	|j|jdd| _nd | _d | _| jrt|dd| _| jdk r|j| _t|j| _d| jv rtj	|j| jdd| _d| jv rt	|j| j| _t|j| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   Fr   ri   relative_attentiontalking_headrn   r(   r   c2pp2c) r   r   r?   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   r>   in_projr   r   r   r,   q_biasv_biaspos_att_typegetattrrz   r{   head_logits_projhead_weights_projrn   max_position_embeddingsrC   rD   pos_dropoutpos_proj
pos_q_projattention_probs_dropout_probrE   rF   r$   r&   r'   r      s>   




z"DisentangledSelfAttention.__init__c                 C   s4   |  d d | jdf }||}|ddddS )Nr(   r   r*   r   r   )r"   r~   rS   permute)r!   xnew_x_shaper&   r&   r'   transpose_for_scores   s   
z.DisentangledSelfAttention.transpose_for_scoresFNr1   attention_maskoutput_attentionsquery_statesra   rel_embeddingsreturnc                    s  |du r  |} |jddd\}}	}
nZ j jj jd dd fddtdD }t|d | j	|d j
d}t|d	 | j	|d	 j
d}t|d
 | j	|d
 j
d} fdd|||fD \}}	}
|  jddddf  }|
  jddddf  }
d}d	t j }t||}||j	|j
d }t||	dd} jr|dur|durɈ |} ||	|||}|dur|| } jdur |dd
dd	ddd	d
}| }|| t|j
j}tjj|dd} |} jdur |dd
dd	ddd	d
}t||
}|dd
d	d }|  dd d }|!|}|sA|dfS ||fS )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr   r(   rs   r   c                    s0   g | ] t j fd dtjD ddqS )c                    s   g | ]
}|d     qS )r   r&   ).0i)kwsr&   r'   
<listcomp>   s    z@DisentangledSelfAttention.forward.<locals>.<listcomp>.<listcomp>r   r   )r   catranger~   )r   r!   r   )r   r'   r      s   0 z5DisentangledSelfAttention.forward.<locals>.<listcomp>ri   r   r*   c                    s   g | ]}  |qS r&   )r   )r   r   r!   r&   r'   r      s    rN   r(   )"r   r   chunkr   r~   r   r   matmultr0   r+   r   r   lenr   rk   	transposerz   r   disentangled_att_biasr   r   boolmasked_fillfinforo   r   
functionalsoftmaxrE   r   
contiguousr"   rS   )r!   r1   r   r   r   ra   r   qprV   rW   value_layerqkvwqr   vrel_attrh   scaleattention_scoresattention_probscontext_layernew_context_layer_shaper&   r   r'   r5      sH   &
"""


"
"
z!DisentangledSelfAttention.forwardrV   rW   rh   c                 C   s  |d u rt |||j}| dkr|dd}n| dkr&|d}n| dkr5td|  t||| j}| }|| j| | j| d d f d}d}d| jv r| 	|}| 
|}t||dd	}	t|| d|d d }
tj|	dt|
||d
}	||	7 }d| jv r| |}| 
|}|t|| }t|||}t| | d|d d }t||dd	j|jd}tj|dt|||d
dd	}t||||}||7 }|S )Nr*   r   r   r      z2Relative position ids must be of dim 2 or 3 or 4. r|   r(   rN   rr   r}   ri   )r]   rP   rs   rU   r   rq   rn   rR   r   r   r   r   r   r   clampru   rb   r   rk   rm   r0   r+   rd   rv   )r!   rV   rW   ra   r   rh   att_spanscorepos_key_layerc2p_attr`   pos_query_layerr_posp2c_posrf   r&   r&   r'   r   $  sT   





z/DisentangledSelfAttention.disentangled_att_biasFNNN)r7   r8   r9   r:   r   r   r   Tensorr   r   tupler5   r   r   r;   r&   r&   r$   r'   rw      sD    
&	
Wrw   c                       s*   e Zd ZdZ fddZdddZ  ZS )DebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    t|dd}t|d|j| _tj|j| j|d| _t|dd| _	| j	s,d | _
n	t|j| j| _
|jdkrDt|j| j| _nd | _| j|jkrYtj| j|jdd| _nd | _t|j|j| _t|j| _|| _| jd	t|jd
dd d S )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFry   position_ids)r   r(   )
persistent)r   r   r   r?   r   r   	Embedding
vocab_sizeword_embeddingsr   position_embeddingsr   type_vocab_sizetoken_type_embeddingsr>   
embed_projr   rA   rB   rC   rD   rE   rG   register_bufferr   rQ   r_   )r!   rG   r   r$   r&   r'   r   `  s(   


zDebertaEmbeddings.__init__Nc                 C   sH  |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| jd urI| | }nt|}|}	| j	rW|	| }	| j
d ure| 
|}
|	|
 }	| jd uro| |	}	| |	}	|d ur| |	 kr| dkr|dd}|d}||	j}|	| }	| |	}	|	S )Nr(   r   rO   r   r*   )r"   r   r   r   rR   rP   r   r   
zeros_liker   r   r   rB   rs   squeezerU   r0   r+   rE   )r!   	input_idstoken_type_idsr   maskinputs_embedsinput_shape
seq_lengthr   
embeddingsr   r&   r&   r'   r5     s>   










zDebertaEmbeddings.forward)NNNNNr6   r&   r&   r$   r'   r   ]  s    r   c                       sH   e Zd Z fddZ				d	dedeejeej f fddZ	  Z
S )
DebertaAttentionc                    s(   t    t|| _t|| _|| _d S r   )r   r   rw   r!   r=   outputrG   rF   r$   r&   r'   r     s   



zDebertaAttention.__init__FNr   r   c           
      C   sF   | j ||||||d\}}|d u r|}| ||}	|r|	|fS |	d fS )N)r   ra   r   )r!   r   )
r!   r1   r   r   r   ra   r   self_output
att_matrixattention_outputr&   r&   r'   r5     s   	
zDebertaAttention.forwardr   r7   r8   r9   r   r   r   r   r   r   r5   r;   r&   r&   r$   r'   r     s    
r   c                       2   e Zd Z fddZdejdejfddZ  ZS )DebertaIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r   r>   r?   intermediate_sizer@   
isinstance
hidden_actstrr	   intermediate_act_fnrF   r$   r&   r'   r     s
   
zDebertaIntermediate.__init__r1   r   c                 C      |  |}| |}|S r   )r@   r   r!   r1   r&   r&   r'   r5        

zDebertaIntermediate.forwardr7   r8   r9   r   r   r   r5   r;   r&   r&   r$   r'   r     s    r   c                       r<   )DebertaOutputc                    sD   t    t|j|j| _t|j|j| _	t
|j| _|| _d S r   )r   r   r   r>   r   r?   r@   r   rA   rB   rC   rD   rE   rG   rF   r$   r&   r'   r     s
   

zDebertaOutput.__init__c                 C   rH   r   rI   rJ   r&   r&   r'   r5     rL   zDebertaOutput.forwardrM   r&   r&   r$   r'   r     s    r   c                       sH   e Zd Z fddZ				d	dedeejeej f fddZ	  Z
S )
DebertaLayerc                    s,   t    t|| _t|| _t|| _d S r   )r   r   r   	attentionr   intermediater   r   rF   r$   r&   r'   r     s   


zDebertaLayer.__init__NFr   r   c                 C   sD   | j ||||||d\}}| |}	| |	|}
|r|
|fS |
d fS )Nr   r   ra   r   )r   r   r   )r!   r1   r   r   ra   r   r   r   r   intermediate_outputlayer_outputr&   r&   r'   r5     s   	

zDebertaLayer.forward)NNNFr   r&   r&   r$   r'   r     s    
r   c                       sh   e Zd ZdZ fddZdd Zdd Zdd	d
Z					ddej	dej	de
de
de
f
ddZ  ZS )DebertaEncoderz8Modified BertEncoder with relative position bias supportc                    s~   t    t fddt jD | _t dd| _| jr:t dd| _	| j	dk r/ j
| _	t| j	d  j| _d| _d S )	Nc                    s   g | ]}t  qS r&   )r   r   _rG   r&   r'   r     s    z+DebertaEncoder.__init__.<locals>.<listcomp>rz   Frn   r(   r   r*   )r   r   r   
ModuleListr   num_hidden_layerslayerr   rz   rn   r   r   r?   r   gradient_checkpointingrF   r$   r   r'   r     s   
 

zDebertaEncoder.__init__c                 C   s   | j r	| jj}|S d }|S r   )rz   r   r   )r!   r   r&   r&   r'   get_rel_embedding  s   z DebertaEncoder.get_rel_embeddingc                 C   sN   |  dkr|dd}||dd }|S |  dkr%|d}|S )Nr*   r   rN   r(   r   )rs   rU   r   )r!   r   extended_attention_maskr&   r&   r'   get_attention_mask"  s   
z!DebertaEncoder.get_attention_maskNc                 C   s2   | j r|d u r|d urt||}|S t||}|S r   )rz   r]   )r!   r1   r   ra   r&   r&   r'   get_rel_pos+  s   

zDebertaEncoder.get_rel_posTFr1   r   output_hidden_statesr   return_dictc              	   C   s   |  |}| |||}|r|fnd }|rdnd }	|}
|  }t| jD ]'\}}||
|||||d\}}|r;||f }|d urB|}n|}
|rK|	|f }	q$|sZtdd |||	fD S t|||	dS )Nr&   )r   ra   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r&   )r   r   r&   r&   r'   	<genexpr>\  s    z)DebertaEncoder.forward.<locals>.<genexpr>last_hidden_stater1   
attentions)r   r  r   	enumerater   r   r   )r!   r1   r   r  r   r   ra   r  all_hidden_statesall_attentionsnext_kvr   r   layer_moduleatt_mr&   r&   r'   r5   3  s8   


	

zDebertaEncoder.forward)NN)TFNNT)r7   r8   r9   r:   r   r   r   r  r   r   r   r5   r;   r&   r&   r$   r'   r     s,    
	r   c                   @   s,   e Zd ZU eed< dZdgZdZdd ZdS )DebertaPreTrainedModelrG   debertar   Tc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjtfrZ|jjd |jj	  dS t |trm|jj	  |jj	  dS t |ttfr||jj	  dS dS )zInitialize the weights.g        )r-   stdNg      ?)r   r   r>   r   datanormal_rG   initializer_ranger   zero_r   r   rB   r   fill_rw   r   r   LegacyDebertaLMPredictionHeadDebertaLMPredictionHead)r!   moduler&   r&   r'   _init_weightsi  s&   


z$DebertaPreTrainedModel._init_weightsN)	r7   r8   r9   r   __annotations__base_model_prefix"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr  r&   r&   r&   r'   r  b  s   
 r  c                       s   e Zd Z fddZdd Zdd Zdd Ze																dd
ee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee dee deeef fddZ  ZS )DebertaModelc                    s8   t  | t|| _t|| _d| _|| _|   d S Nr   )	r   r   r   r   r   encoderz_stepsrG   	post_initrF   r$   r&   r'   r     s   

zDebertaModel.__init__c                 C      | j jS r   r   r   r   r&   r&   r'   get_input_embeddings  s   z!DebertaModel.get_input_embeddingsc                 C   s   || j _d S r   r$  r!   new_embeddingsr&   r&   r'   set_input_embeddings  s   z!DebertaModel.set_input_embeddingsc                 C   s   t d)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r!   heads_to_pruner&   r&   r'   _prune_heads  s   zDebertaModel._prune_headsNr   r   r   r   r   r   r  r  r   c	              	      s  |d ur|n j j}|d ur|n j j}|d ur|n j j}|d ur*|d ur*td|d ur9 || | }	n|d urF| d d }	ntd|d urQ|jn|j}
|d u r_tj	|	|
d}|d u rltj
|	tj|
d} j|||||d} j||d||d}|d	 } jd	kr|d
 } fddt jD }|d } j } j|} j|}|d	d  D ]}|||d|||d}|| q|d }|s|f||rd	ndd   S t||r|jnd |jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer(   z5You have to specify either input_ids or inputs_embeds)rP   rO   )r   r   r   r   r   T)r  r   r  r   rN   c                    s   g | ]} j jd  qS r   )r   r   r   r   r&   r'   r     s    z(DebertaModel.forward.<locals>.<listcomp>Fr   r*   r  )rG   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr"   rP   r   r   r   rR   r   r   r!  r   r   r   r  appendr   r1   r  )r!   r   r   r   r   r   r   r  r  r   rP   embedding_outputencoder_outputsencoded_layersr1   layersr   r   rel_posr   sequence_outputr&   r   r'   r5     sr   


zDebertaModel.forward)NNNNNNNN)r7   r8   r9   r   r%  r(  r+  r   r   r   r   r   r   r   r   r5   r;   r&   r&   r$   r'   r    sB    
	

r  c                       r<   )$LegacyDebertaPredictionHeadTransformc                    sf   t    t|d|j| _t|j| j| _t|j	t
r#t|j	 | _n|j	| _tj| j|jd| _d S )Nr   )r#   )r   r   r   r?   r   r   r>   r@   r   r   r   r	   transform_act_fnrB   rA   rF   r$   r&   r'   r     s   
z-LegacyDebertaPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S r   )r@   r6  rB   r   r&   r&   r'   r5     s   


z,LegacyDebertaPredictionHeadTransform.forwardrM   r&   r&   r$   r'   r5    s    r5  c                       s,   e Zd Z fddZdd Zdd Z  ZS )r  c                    s\   t    t|| _t|d|j| _tj| j|j	dd| _
tt|j	| _| j| j
_d S )Nr   Fry   )r   r   r5  	transformr   r?   r   r   r>   r   decoderr   r   r   r   rF   r$   r&   r'   r     s   

z&LegacyDebertaLMPredictionHead.__init__c                 C   s   | j | j_ d S r   )r   r8  r   r&   r&   r'   _tie_weights  s   z*LegacyDebertaLMPredictionHead._tie_weightsc                 C   r   r   )r7  r8  r   r&   r&   r'   r5     r   z%LegacyDebertaLMPredictionHead.forward)r7   r8   r9   r   r9  r5   r;   r&   r&   r$   r'   r    s    r  c                       r   )LegacyDebertaOnlyMLMHeadc                       t    t|| _d S r   )r   r   r  predictionsrF   r$   r&   r'   r        
z!LegacyDebertaOnlyMLMHead.__init__r4  r   c                 C   s   |  |}|S r   )r<  )r!   r4  prediction_scoresr&   r&   r'   r5     s   
z LegacyDebertaOnlyMLMHead.forwardr   r&   r&   r$   r'   r:    s    r:  c                       s(   e Zd ZdZ fddZdd Z  ZS )r  zMhttps://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270c                    sl   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jdd| _tt|j| _d S )NT)r#   elementwise_affine)r   r   r   r>   r?   r@   r   r   r   r	   r6  rB   rA   r   r   r   r   r   rF   r$   r&   r'   r   #  s   
z DebertaLMPredictionHead.__init__c                 C   s:   |  |}| |}| |}t||j | j }|S r   )r@   r6  rB   r   r   r   r   r   )r!   r1   r   r&   r&   r'   r5   1  s   

zDebertaLMPredictionHead.forwardr6   r&   r&   r$   r'   r     s    r  c                       r<   )DebertaOnlyMLMHeadc                    r;  r   )r   r   r  lm_headrF   r$   r&   r'   r   <  r=  zDebertaOnlyMLMHead.__init__c                 C   s   |  ||}|S r   )rA  )r!   r4  r   r>  r&   r&   r'   r5   A  s   zDebertaOnlyMLMHead.forwardrM   r&   r&   r$   r'   r@  ;  s    r@  c                       s   e Zd ZddgZ fddZdd Zdd Ze																		dd
ee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee dee deeef fddZ  ZS )DebertaForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    sP   t  | |j| _t|| _| jrt|| _n
ddg| _t|| _	| 
  d S )Nzlm_predictions.lm_head.weightz)deberta.embeddings.word_embeddings.weight)r   r   legacyr  r  r:  cls_tied_weights_keysr@  lm_predictionsr"  rF   r$   r&   r'   r   J  s   


zDebertaForMaskedLM.__init__c                 C   s   | j r| jjjS | jjjS r   )rC  rD  r<  r8  rF  rA  r@   r   r&   r&   r'   get_output_embeddingsW  s   

z(DebertaForMaskedLM.get_output_embeddingsc                 C   s:   | j r|| jj_|j| jj_d S || jj_|j| jj_d S r   )rC  rD  r<  r8  r   rF  rA  r@   r&  r&   r&   r'   set_output_embeddings]  s
   

z(DebertaForMaskedLM.set_output_embeddingsNr   r   r   r   r   labelsr   r  r  r   c
              
   C   s   |	dur|	n| j j}	| j||||||||	d}
|
d }| jr$| |}n	| || jjj}d}|durDt }||	d| j j
|	d}|	sZ|f|
dd  }|durX|f| S |S t|||
j|
jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r   r   r   r   r  r  r   r(   r   losslogitsr1   r  )rG   r,  r  rC  rD  rF  r   r   r   rS   r   r   r1   r  )r!   r   r   r   r   r   rI  r   r  r  outputsr4  r>  masked_lm_lossloss_fctr   r&   r&   r'   r5   e  s8   zDebertaForMaskedLM.forward	NNNNNNNNN)r7   r8   r9   rE  r   rG  rH  r   r   r   r   r   r   r   r   r5   r;   r&   r&   r$   r'   rB  F  sH    	

rB  c                       s0   e Zd Z fddZdd Zedd Z  ZS )ContextPoolerc                    s4   t    t|j|j| _t|j| _|| _	d S r   )
r   r   r   r>   pooler_hidden_sizer@   rC   pooler_dropoutrE   rG   rF   r$   r&   r'   r     s   

zContextPooler.__init__c                 C   s8   |d d df }|  |}| |}t| jj |}|S r  )rE   r@   r	   rG   pooler_hidden_act)r!   r1   context_tokenpooled_outputr&   r&   r'   r5     s
   

zContextPooler.forwardc                 C   r#  r   )rG   r?   r   r&   r&   r'   
output_dim  s   zContextPooler.output_dim)r7   r8   r9   r   r5   propertyrX  r;   r&   r&   r$   r'   rR    s
    
rR  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                       s   e Zd Z fddZdd Zdd Ze									ddeej	 d	eej	 d
eej	 deej	 deej	 deej	 dee
 dee
 dee
 deeef fddZ  ZS ) DebertaForSequenceClassificationc                    s   t  | t|dd}|| _t|| _t|| _| jj}t	
||| _t|dd }|d u r2| jjn|}t	|| _|   d S )N
num_labelsr*   cls_dropout)r   r   r   r\  r  r  rR  poolerrX  r   r>   
classifierrG   rD   rC   rE   r"  )r!   rG   r\  rX  drop_outr$   r&   r'   r     s   

z)DebertaForSequenceClassification.__init__c                 C   s
   | j  S r   )r  r%  r   r&   r&   r'   r%    s   
z5DebertaForSequenceClassification.get_input_embeddingsc                 C   s   | j | d S r   )r  r(  r&  r&   r&   r'   r(    s   z5DebertaForSequenceClassification.set_input_embeddingsNr   r   r   r   r   rI  r   r  r  r   c
              
   C   s:  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}| |}| |}d}|dur| j jdu r| jdkrQt	 }|
d|j}|||
d}n| dks^|ddkr|dk }| }|ddkrt|d||d|d}t|d|
d}t }||
d| j |
d}n^td|}nUtd}||| d  }nC| j jdkrt	 }| jdkr|| | }n+|||}n%| j jdkrt }||
d| j|
d}n| j jdkrt }|||}|	s|f|
dd  }|dur|f| S |S t|||
j|
jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r   r   r  r  r   r   r(   
regressionsingle_label_classificationmulti_label_classificationrK  )rG   r,  r  r^  rE   r_  problem_typer\  r   r   rS   r0   r+   rs   r"   nonzerorR   r   ru   r_   r   r,   rj   
LogSoftmaxsumr-   r   r   r   r1   r  )r!   r   r   r   r   r   rI  r   r  r  rN  encoder_layerrW  rM  rL  loss_fnlabel_indexlabeled_logitsrP  log_softmaxr   r&   r&   r'   r5     sh   



 


z(DebertaForSequenceClassification.forwardrQ  )r7   r8   r9   r   r%  r(  r   r   r   r   r   r   r   r   r5   r;   r&   r&   r$   r'   r[    sF    	

r[  c                       s   e Zd Z fddZe									ddeej deej deej deej deej d	eej d
ee dee dee de	e
ef fddZ  ZS )DebertaForTokenClassificationc                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S r   )r   r   r\  r  r  r   rC   rD   rE   r>   r?   r_  r"  rF   r$   r&   r'   r   %  s   
z&DebertaForTokenClassification.__init__Nr   r   r   r   r   rI  r   r  r  r   c
              
   C   s   |	dur|	n| j j}	| j||||||||	d}
|
d }| |}| |}d}|dur;t }||d| j|d}|	sQ|f|
dd  }|durO|f| S |S t|||
j	|
j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        NrJ  r   r(   r   rK  )rG   r,  r  rE   r_  r   rS   r\  r   r1   r  )r!   r   r   r   r   r   rI  r   r  r  rN  r4  rM  rL  rP  r   r&   r&   r'   r5   0  s0   

z%DebertaForTokenClassification.forwardrQ  )r7   r8   r9   r   r   r   r   r   r   r   r   r   r5   r;   r&   r&   r$   r'   rm  #  sB    	

rm  c                       s   e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee dee dee de	e
ef fddZ  ZS )DebertaForQuestionAnsweringc                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r   )
r   r   r\  r  r  r   r>   r?   
qa_outputsr"  rF   r$   r&   r'   r   c  s
   
z$DebertaForQuestionAnswering.__init__Nr   r   r   r   r   start_positionsend_positionsr   r  r  r   c              
   C   sF  |
d ur|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrN|d}t| dkr[|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|
s||f|dd   }|d ur|f| S |S t||||j|jdS )	NrJ  r   r   r(   r   )ignore_indexr*   )rL  start_logits
end_logitsr1   r  )rG   r,  r  ro  splitr   r   r   r"   r   r   r   r1   r  )r!   r   r   r   r   r   rp  rq  r   r  r  rN  r4  rM  rs  rt  
total_lossignored_indexrP  
start_lossend_lossr   r&   r&   r'   r5   m  sN   






z#DebertaForQuestionAnswering.forward)
NNNNNNNNNN)r7   r8   r9   r   r   r   r   r   r   r   r   r   r5   r;   r&   r&   r$   r'   rn  a  sH    
	

rn  )rB  rn  r[  rm  r  r  )Ar:   typingr   r   r   r   torch.nnr   r   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_debertar   
get_loggerr7   loggerModuler   r=   jitscriptr]   rb   rd   rg   r   r   rk   rm   rq   rv   rw   r   r   r   r   r   r   r  r  r5  r  r:  r  r@  rB  rR  r[  rm  rn  __all__r&   r&   r&   r'   <module>   sv   




 GQ#!Rj
Vj=K