o
    ei                     @   s0  d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z" e#e$Z%G dd dej&Z'G dd dej&Z(G dd dej&Z)G dd dej&Z*G dd dej&Z+G dd dej&Z,G dd dej&Z-G dd dej&Z.G dd dej&Z/eG d d! d!eZ0eG d"d# d#e0Z1eG d$d% d%e0Z2G d&d' d'ej&Z3ed(d)G d*d+ d+e0Z4eG d,d- d-e0Z5eG d.d/ d/e0Z6G d0d1 d1ej&Z7eG d2d3 d3e0Z8d7d4d5Z9g d6Z:dS )8zPyTorch I-BERT model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)gelu))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )IBertConfig)IntGELUIntLayerNorm
IntSoftmaxQuantActQuantEmbeddingQuantLinearc                       s4   e Zd ZdZ fddZ	d
ddZdd	 Z  ZS )IBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    s  t    |j| _d| _d| _d| _d| _d| _t|j	|j
|j| j| jd| _t|j|j
| j| jd| _| jdt|jdd	d
 |j| _t|j|j
| j| j| jd| _t| j| jd| _t| j| jd| _t|j
|j| j| j|jd| _t| j| jd| _t|j | _!d S )N             )padding_idx
weight_bit
quant_mode)r!   r"   position_idsr   F)
persistentr"   eps
output_bitr"   force_dequant)"super__init__r"   embedding_bitembedding_act_bitact_bitln_input_bitln_output_bitr   
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsregister_buffertorcharangemax_position_embeddingsexpandr    position_embeddingsr   embeddings_act1embeddings_act2r   layer_norm_epsr+   	LayerNormoutput_activationr   Dropouthidden_dropout_probdropoutselfconfig	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ibert/modeling_ibert.pyr-   2   sN   
	zIBertEmbeddings.__init__Nr   c                 C   s  |d u r|d urt || j||j}n| |}|d ur"| }n| d d }|d u r9tj|tj| j	jd}|d u rE| 
|\}}nd }| |\}}	| j||||	d\}
}| |\}}| j|
|||d\}
}| |
|\}
}| |
}
| |
|\}
}|
|fS )Nr%   dtypedeviceidentityidentity_scaling_factor)"create_position_ids_from_input_idsr    torP   &create_position_ids_from_inputs_embedssizer:   zeroslongr#   r6   r8   r?   r>   rB   rF   rC   )rH   	input_idstoken_type_idsr#   inputs_embedspast_key_values_lengthinput_shapeinputs_embeds_scaling_factorr8   $token_type_embeddings_scaling_factor
embeddingsembeddings_scaling_factorr>   "position_embeddings_scaling_factorrL   rL   rM   forwardc   sD   




zIBertEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr%   r   rN   r   )rW   r:   r;   r    rY   rP   	unsqueezer=   )rH   r\   r^   sequence_lengthr#   rL   rL   rM   rV      s   	z6IBertEmbeddings.create_position_ids_from_inputs_embeds)NNNNr   )__name__
__module____qualname____doc__r-   rd   rV   __classcell__rL   rL   rJ   rM   r   -   s    2
.r   c                       *   e Zd Z fddZ		dddZ  ZS )IBertSelfAttentionc              	      sV  t    |j|j dkrt|dstd|j d|j d|j| _d| _d| _d| _	|j| _t
|j|j | _| j| j | _t|j| jd| j| j| jdd	| _t|j| jd| j| j| jdd	| _t|j| jd| j| j| jdd	| _t| j	| jd
| _t| j	| jd
| _t| j	| jd
| _t| j	| jd
| _t|j| _t| j	| j|jd| _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   Tbiasr!   bias_bitr"   per_channelr'   r"   r+   )r,   r-   r4   num_attention_headshasattr
ValueErrorr"   r!   rr   r0   intattention_head_sizeall_head_sizer   querykeyvaluer   query_activationkey_activationvalue_activationrC   r   rD   attention_probs_dropout_probrF   r   r+   softmaxrG   rJ   rL   rM   r-      s^   

		zIBertSelfAttention.__init__NFc                 C   s  |  ||\}}| ||\}}| ||\}	}
| ||\}}| ||\}}| |	|
\}}|j\}}}||d| j| j	
dd}||d| j| j	
dd}||d| j| j	
dd}t||
dd}t| j	}|| }| jr~|| | }nd }|d ur|| }| ||\}}| |}t||}|d ur|| }nd }|dddd }| d d | jf }|j| }| ||\}}|r||fn|f}|r||fn|f}||fS )Nr%   r      r   r   )r{   r|   r}   r~   r   r   shapeviewru   ry   	transposer:   matmulmathsqrtr"   r   rF   permute
contiguousrW   rz   rC   )rH   hidden_stateshidden_states_scaling_factorattention_maskoutput_attentionsmixed_query_layer mixed_query_layer_scaling_factormixed_key_layermixed_key_layer_scaling_factormixed_value_layer mixed_value_layer_scaling_factorquery_layerquery_layer_scaling_factor	key_layerkey_layer_scaling_factorvalue_layervalue_layer_scaling_factor
batch_size
seq_length_attention_scoresscaleattention_scores_scaling_factorattention_probsattention_probs_scaling_factorcontext_layercontext_layer_scaling_factornew_context_layer_shapeoutputsoutput_scaling_factorrL   rL   rM   rd      s\   



zIBertSelfAttention.forwardNFrg   rh   ri   r-   rd   rk   rL   rL   rJ   rM   rm      s
    ;rm   c                       $   e Zd Z fddZdd Z  ZS )IBertSelfOutputc              	      s   t    |j| _d| _d| _d| _d| _d| _t|j	|j	d| j| j| jdd| _
t| j| jd| _t|j	|j| j| j|jd| _t| j| jd| _t|j| _d S Nr   r   r   Trp   r'   r(   )r,   r-   r"   r0   r!   rr   r1   r2   r   r4   denser   ln_input_actr   rA   r+   rB   rC   r   rD   rE   rF   rG   rJ   rL   rM   r-   *  4   
	zIBertSelfOutput.__init__c                 C   X   |  ||\}}| |}| j||||d\}}| ||\}}| ||\}}||fS NrQ   r   rF   r   rB   rC   rH   r   r   input_tensorinput_tensor_scaling_factorrL   rL   rM   rd   G     

zIBertSelfOutput.forwardr   rL   rL   rJ   rM   r   )      r   c                       rl   )IBertAttentionc                    s*   t    |j| _t|| _t|| _d S N)r,   r-   r"   rm   rH   r   outputrG   rJ   rL   rM   r-   Y  s   

zIBertAttention.__init__NFc                 C   s\   |  ||||\}}| |d |d ||\}}|f|dd   }	|f|dd   }
|	|
fS )Nr   r   )rH   r   )rH   r   r   r   r   self_outputsself_outputs_scaling_factorattention_outputattention_output_scaling_factorr   outputs_scaling_factorrL   rL   rM   rd   _  s   zIBertAttention.forwardr   r   rL   rL   rJ   rM   r   X  s
    
r   c                       r   )IBertIntermediatec              	      s   t    |j| _d| _d| _d| _t|j|jd| j| j| jdd| _	|j
dkr,tdt| j|jd| _t| j| jd| _d S )	Nr   r   Trp   r   z3I-BERT only supports 'gelu' for `config.hidden_act`rt   r'   )r,   r-   r"   r0   r!   rr   r   r4   intermediate_sizer   
hidden_actrw   r   r+   intermediate_act_fnr   rC   rG   rJ   rL   rM   r-   u  s$   

	zIBertIntermediate.__init__c                 C   s8   |  ||\}}| ||\}}| ||\}}||fS r   )r   r   rC   )rH   r   r   rL   rL   rM   rd     s   zIBertIntermediate.forwardr   rL   rL   rJ   rM   r   t  s    r   c                       r   )IBertOutputc              	      s   t    |j| _d| _d| _d| _d| _d| _t|j	|j
d| j| j| jdd| _t| j| jd| _t|j
|j| j| j|jd| _t| j| jd| _t|j| _d S r   )r,   r-   r"   r0   r!   rr   r1   r2   r   r   r4   r   r   r   r   rA   r+   rB   rC   r   rD   rE   rF   rG   rJ   rL   rM   r-     r   zIBertOutput.__init__c                 C   r   r   r   r   rL   rL   rM   rd     r   zIBertOutput.forwardr   rL   rL   rJ   rM   r     r   r   c                       s2   e Zd Z fddZ		d	ddZdd Z  ZS )

IBertLayerc                    sd   t    |j| _d| _d| _t|| _t|| _t	|| _
t| j| jd| _t| j| jd| _d S )Nr   r   r'   )r,   r-   r"   r0   seq_len_dimr   	attentionr   intermediater   r   r   pre_intermediate_actpre_output_actrG   rJ   rL   rM   r-     s   



zIBertLayer.__init__NFc                 C   sP   | j ||||d\}}|d }|d }|dd  }	| ||\}
}|
f|	 }	|	S )N)r   r   r   )r   feed_forward_chunk)rH   r   r   r   r   self_attention_outputs%self_attention_outputs_scaling_factorr   r   r   layer_outputlayer_output_scaling_factorrL   rL   rM   rd     s   

zIBertLayer.forwardc                 C   sL   |  ||\}}| ||\}}| ||\}}| ||||\}}||fS r   )r   r   r   r   )rH   r   r   intermediate_output"intermediate_output_scaling_factorr   r   rL   rL   rM   r     s   zIBertLayer.feed_forward_chunkr   )rg   rh   ri   r-   rd   r   rk   rL   rL   rJ   rM   r     s    
r   c                       s.   e Zd Z fddZ				dddZ  ZS )	IBertEncoderc                    s<   t     | _ j| _t fddt jD | _d S )Nc                    s   g | ]}t  qS rL   )r   ).0r   rI   rL   rM   
<listcomp>  s    z)IBertEncoder.__init__.<locals>.<listcomp>)	r,   r-   rI   r"   r   
ModuleListrangenum_hidden_layerslayerrG   rJ   r   rM   r-     s   
$zIBertEncoder.__init__NFTc                 C   s   |rdnd }|r
dnd }d }	t | jD ]\}
}|r||f }|||||}|d }|r2||d f }q|r:||f }|sItdd ||||	fD S t||||	dS )NrL   r   r   c                 s   s    | ]	}|d ur|V  qd S r   rL   )r   vrL   rL   rM   	<genexpr>$  s    z'IBertEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentionscross_attentions)	enumerater   tupler	   )rH   r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_outputsrL   rL   rM   rd     s@   	


zIBertEncoder.forward)NFFTr   rL   rL   rJ   rM   r     s    
r   c                       r   )IBertPoolerc                    s2   t    |j| _t|j|j| _t | _d S r   )	r,   r-   r"   r   Linearr4   r   Tanh
activationrG   rJ   rL   rM   r-   7  s   
zIBertPooler.__init__c                 C   s(   |d d df }|  |}| |}|S Nr   )r   r   )rH   r   first_token_tensorpooled_outputrL   rL   rM   rd   =  s   

zIBertPooler.forwardr   rL   rL   rJ   rM   r   6  s    r   c                   @   s4   e Zd ZU eed< dZe dd ZdddZ	dS )	IBertPreTrainedModelrI   ibertc                 C   s  t |ttjfrDtj|jd| jjd |j	durt
|j	 t|dddur2t
|j t
|j t|dddurBt
|j dS dS t |ttjfrtj|jd| jjd |jdurlt|jddslt
|j|j  t|dddurt
|j t
|j dS dS t |ttjfrt
|j	 t|j t|d	ddurt
|j dS dS t |trt
|j	 dS t |trt|jt|jjd
 d dS t |trt |j!d t |j"d t
|j# dS dS )zInitialize the weightsg        )meanstdNweight_integerbias_integer_is_hf_initializedFweight_scaling_factorshiftr%   r$   gh㈵gh㈵>)$
isinstancer   r   r   initnormal_weightrI   initializer_rangerq   zeros_getattrr   fc_scaling_factorr   r   	Embeddingr    r   r   rB   ones_r   IBertLMHeadr   copy_r#   r:   r;   r   r=   r   	constant_x_minx_maxact_scaling_factor)rH   modulerL   rL   rM   _init_weightsK  sB   


&
z"IBertPreTrainedModel._init_weightsNc                 C   s   t d)Nz6`resize_token_embeddings` is not supported for I-BERT.)NotImplementedError)rH   new_num_tokensrL   rL   rM   resize_token_embeddingsm     z,IBertPreTrainedModel.resize_token_embeddingsr   )
rg   rh   ri   r   __annotations__base_model_prefixr:   no_gradr  r  rL   rL   rL   rM   r   F  s   
 
!r   c                       s   e Zd ZdZd fdd	Zdd Zdd Ze																dd
ej	d	B dej
d	B dej	d	B dej	d	B dej
d	B ded	B ded	B ded	B deeej
 B fddZ  ZS )
IBertModela  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    Tc                    sL   t  | || _|j| _t|| _t|| _|rt|nd| _	| 
  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r,   r-   rI   r"   r   ra   r   encoderr   pooler	post_init)rH   rI   add_pooling_layerrJ   rL   rM   r-   |  s   

zIBertModel.__init__c                 C      | j jS r   ra   r6   rH   rL   rL   rM   get_input_embeddings  r  zIBertModel.get_input_embeddingsc                 C   s   || j _d S r   r  )rH   r}   rL   rL   rM   set_input_embeddings  s   zIBertModel.set_input_embeddingsNrZ   r   r[   r#   r\   r   r   r   returnc	                 K   sl  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|
\}}|d urU|jn|j}|d u retj	||f|d}|d u rrtj
|
tj|d}| ||
}| j||||d\}}| j||||||d}|d }| jd ur| |nd }|s||f|d	d   S t|||j|j|jd
S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer%   z5You have to specify either input_ids or inputs_embeds)rP   rN   )rZ   r#   r[   r\   )r   r   r   r   r   r   )r   pooler_outputr   r   r   )rI   r   r   use_return_dictrw   %warn_if_padding_and_no_attention_maskrW   rP   r:   onesrX   rY   get_extended_attention_maskra   r  r  r
   r   r   r   )rH   rZ   r   r[   r#   r\   r   r   r   kwargsr^   r   r   rP   extended_attention_maskembedding_outputembedding_output_scaling_factorencoder_outputssequence_outputr   rL   rL   rM   rd     sX   

zIBertModel.forward)T)NNNNNNNN)rg   rh   ri   rj   r-   r  r  r   r:   
LongTensorFloatTensorboolr
   r   rd   rk   rL   rL   rJ   rM   r  q  sB    		r  c                       s   e Zd ZdddZ fddZdd Zdd	 Ze	
	
	
	
	
	
	
	
	
ddej	d
B dej
d
B dej	d
B dej	d
B dej
d
B dej	d
B ded
B ded
B ded
B deeej
 B fddZ  ZS )IBertForMaskedLMz(ibert.embeddings.word_embeddings.weight$zlm_head.bias)zlm_head.decoder.weightzlm_head.decoder.biasc                    s0   t  | t|dd| _t|| _|   d S NF)r  )r,   r-   r  r   r  lm_headr  rG   rJ   rL   rM   r-     s   
zIBertForMaskedLM.__init__c                 C   r  r   )r/  decoderr  rL   rL   rM   get_output_embeddings  r  z&IBertForMaskedLM.get_output_embeddingsc                 C   s   || j _|j| j _d S r   )r/  r0  rq   )rH   new_embeddingsrL   rL   rM   set_output_embeddings  s   z&IBertForMaskedLM.set_output_embeddingsNrZ   r   r[   r#   r\   labelsr   r   r   r  c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}d}|dur7t }||d| j j|d}|	sM|f|dd  }|durK|f| S |S t|||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r[   r#   r\   r   r   r   r   r%   r   losslogitsr   r   )
rI   r   r   r/  r   r   r3   r   r   r   )rH   rZ   r   r[   r#   r\   r4  r   r   r   r$  r   r)  prediction_scoresmasked_lm_lossloss_fctr   rL   rL   rM   rd     s4   

zIBertForMaskedLM.forward	NNNNNNNNN)rg   rh   ri   _tied_weights_keysr-   r1  r3  r   r:   r*  r+  r,  r   r   rd   rk   rL   rL   rJ   rM   r-    sL    		
r-  c                       (   e Zd ZdZ fddZdd Z  ZS )r  z)I-BERT Head for masked language modeling.c                    sZ   t    t|j|j| _tj|j|jd| _t|j|j	| _
tt|j	| _d S )N)r)   )r,   r-   r   r   r4   r   rB   rA   
layer_normr3   r0  	Parameterr:   rX   rq   rG   rJ   rL   rM   r-   '  s
   
zIBertLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r   )r   r   r?  r0  )rH   featuresr$  xrL   rL   rM   rd   /  s
   


zIBertLMHead.forwardrg   rh   ri   rj   r-   rd   rk   rL   rL   rJ   rM   r  $  s    r  z
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                          e Zd Z fddZe									ddejdB dejdB dejdB dejdB dejdB d	ejdB d
edB dedB dedB de	e
ej B fddZ  ZS )IBertForSequenceClassificationc                    s8   t  | |j| _t|dd| _t|| _|   d S r.  )r,   r-   
num_labelsr  r   IBertClassificationHead
classifierr  rG   rJ   rL   rM   r-   A  s
   
z'IBertForSequenceClassification.__init__NrZ   r   r[   r#   r\   r4  r   r   r   r  c
              
   K   sf  |	dur|	n| j j}	| j||||||||	d}|d }| |}d}|dur| j jdu rP| jdkr6d| j _n| jdkrL|jtjksG|jtj	krLd| j _nd| j _| j jdkrnt
 }| jdkrh|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|	s|f|d	d  }|dur|f| S |S t|||j|jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr5  r   r   
regressionsingle_label_classificationmulti_label_classificationr%   r   r6  )rI   r   r   rI  problem_typerG  rO   r:   rY   rx   r   squeezer   r   r   r   r   r   rH   rZ   r   r[   r#   r\   r4  r   r   r   r$  r   r)  r8  r7  r;  r   rL   rL   rM   rd   K  sR   



"


z&IBertForSequenceClassification.forwardr<  )rg   rh   ri   r-   r   r:   r*  r+  r,  r   r   rd   rk   rL   rL   rJ   rM   rF  :  sB    
	
rF  c                       s   e Zd Z fddZe									ddejdB dejdB dejdB dejdB dejdB d	ejdB d
edB dedB dedB de	e
ej B fddZ  ZS )IBertForMultipleChoicec                    s@   t  | t|| _t|j| _t|j	d| _
|   d S )Nr   )r,   r-   r  r   r   rD   rE   rF   r   r4   rI  r  rG   rJ   rL   rM   r-     s
   
zIBertForMultipleChoice.__init__NrZ   r[   r   r4  r#   r\   r   r   r   r  c
              
   K   sl  |	dur|	n| j j}	|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	d}|d }| |}| |}|d|}d}|durt }|||}|	s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r%   r   )r#   r[   r   r\   r   r   r   r   r6  )rI   r   r   r   rW   r   rF   rI  r   r   r   r   )rH   rZ   r[   r   r4  r#   r\   r   r   r   r$  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r   r8  reshaped_logitsr7  r;  r   rL   rL   rM   rd     sJ   ,



zIBertForMultipleChoice.forwardr<  )rg   rh   ri   r-   r   r:   r*  r+  r,  r   r   rd   rk   rL   rL   rJ   rM   rP    sB    
	
rP  c                       rE  )IBertForTokenClassificationc                    sN   t  | |j| _t|dd| _t|j| _t	|j
|j| _|   d S r.  )r,   r-   rG  r  r   r   rD   rE   rF   r   r4   rI  r  rG   rJ   rL   rM   r-     s   z$IBertForTokenClassification.__init__NrZ   r   r[   r#   r\   r4  r   r   r   r  c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|dur;t }||d| j|d}|	sQ|f|dd  }|durO|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr5  r   r%   r   r6  )rI   r   r   rF   rI  r   r   rG  r   r   r   rO  rL   rL   rM   rd     s6   

z#IBertForTokenClassification.forwardr<  )rg   rh   ri   r-   r   r:   r*  r+  r,  r   r   rd   rk   rL   rL   rJ   rM   rX    sB    	
rX  c                       r>  )rH  z-Head for sentence-level classification tasks.c                    s@   t    t|j|j| _t|j| _t|j|j	| _
d S r   )r,   r-   r   r   r4   r   rD   rE   rF   rG  out_projrG   rJ   rL   rM   r-   ;  s   
z IBertClassificationHead.__init__c                 K   sL   |d d dd d f }|  |}| |}t|}|  |}| |}|S r   )rF   r   r:   tanhrY  )rH   rA  r$  r   rL   rL   rM   rd   A  s   




zIBertClassificationHead.forwardrC  rL   rL   rJ   rM   rH  8  s    rH  c                       s   e Zd Z fddZe										ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dedB dedB dedB de	e
ej B fddZ  ZS )IBertForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r.  )
r,   r-   rG  r  r   r   r   r4   
qa_outputsr  rG   rJ   rL   rM   r-   M  s
   z"IBertForQuestionAnswering.__init__NrZ   r   r[   r#   r\   start_positionsend_positionsr   r   r   r  c              
   K   sF  |
d ur|
n| j j}
| j|||||||	|
d}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrN|d}t| dkr[|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|
s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr5  r   r   r%   dim)ignore_indexr   )r7  start_logits
end_logitsr   r   )rI   r   r   r\  splitrN  r   lenrW   clampr   r   r   r   )rH   rZ   r   r[   r#   r\   r]  r^  r   r   r   r$  r   r)  r8  rb  rc  
total_lossignored_indexr;  
start_lossend_lossr   rL   rL   rM   rd   W  sN   






z!IBertForQuestionAnswering.forward)
NNNNNNNNNN)rg   rh   ri   r-   r   r:   r*  r+  r,  r   r   rd   rk   rL   rL   rJ   rM   r[  K  sH    
	
r[  c                 C   s6   |  | }tj|dd|| | }| | S )aM  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's *utils.make_positions*.

    Args:
    input_ids (`torch.LongTensor`):
           Indices of input sequence tokens in the vocabulary.

    Returns: torch.Tensor
    r   r_  )nerx   r:   cumsumtype_asrY   )rZ   r    r]   maskincremental_indicesrL   rL   rM   rT     s   rT   )r-  rP  r[  rF  rX  r  r   )r   );rj   r   r:   r   torch.nnr   r   r    r   r   activationsr   modeling_outputsr	   r
   r   r   r   r   r   modeling_utilsr   utilsr   r   configuration_ibertr   quant_modulesr   r   r   r   r   r   
get_loggerrg   loggerModuler   rm   r   r   r   r   r   r   r   r   r  r-  r  rF  rP  rX  rH  r[  rT   __all__rL   rL   rL   rM   <module>   sV   $	 
v /"/89*gJPeA
L