"""PyTorch I-BERT model."""

import math
from typing import Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import gelu
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging
from .configuration_ibert import IBertConfig
from .quant_modules import IntGELU, IntLayerNorm, IntSoftmax, QuantAct, QuantEmbedding, QuantLinear

logger = logging.get_logger(__name__)


class IBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.embedding_bit = 8
        self.embedding_act_bit = 16
        self.act_bit = 8
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.word_embeddings = QuantEmbedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=config.pad_token_id,
            weight_bit=self.embedding_bit,
            quant_mode=self.quant_mode,
        )
        self.token_type_embeddings = QuantEmbedding(
            config.type_vocab_size, config.hidden_size, weight_bit=self.embedding_bit, quant_mode=self.quant_mode
        )

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.padding_idx = config.pad_token_id
        self.position_embeddings = QuantEmbedding(
            config.max_position_embeddings,
            config.hidden_size,
            padding_idx=self.padding_idx,
            weight_bit=self.embedding_bit,
            quant_mode=self.quant_mode,
        )

        # Integer-only addition between embeddings
        self.embeddings_act1 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)
        self.embeddings_act2 = QuantAct(self.embedding_act_bit, quant_mode=self.quant_mode)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(
                    input_ids, self.padding_idx, past_key_values_length
                ).to(input_ids.device)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds, inputs_embeds_scaling_factor = self.word_embeddings(input_ids)
        else:
            inputs_embeds_scaling_factor = None
        token_type_embeddings, token_type_embeddings_scaling_factor = self.token_type_embeddings(token_type_ids)

        embeddings, embeddings_scaling_factor = self.embeddings_act1(
            inputs_embeds,
            inputs_embeds_scaling_factor,
            identity=token_type_embeddings,
            identity_scaling_factor=token_type_embeddings_scaling_factor,
        )

        if self.position_embedding_type == "absolute":
            position_embeddings, position_embeddings_scaling_factor = self.position_embeddings(position_ids)
            embeddings, embeddings_scaling_factor = self.embeddings_act1(
                embeddings,
                embeddings_scaling_factor,
                identity=position_embeddings,
                identity_scaling_factor=position_embeddings_scaling_factor,
            )

        embeddings, embeddings_scaling_factor = self.LayerNorm(embeddings, embeddings_scaling_factor)
        embeddings = self.dropout(embeddings)
        embeddings, embeddings_scaling_factor = self.output_activation(embeddings, embeddings_scaling_factor)
        return embeddings, embeddings_scaling_factor

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


class IBertSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.quant_mode = config.quant_mode
        self.weight_bit = 8
        self.bias_bit = 32
        self.act_bit = 8

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Q, K, V Linear layers
        self.query = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.key = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.value = QuantLinear(
            config.hidden_size,
            self.all_head_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )

        # Requantization (32bit -> 8bit) for Q, K, V activations
        self.query_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.key_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.value_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type != "absolute":
            raise ValueError("I-BERT only supports 'absolute' for `config.position_embedding_type`")

        self.softmax = IntSoftmax(self.act_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        # Projection
        mixed_query_layer, mixed_query_layer_scaling_factor = self.query(hidden_states, hidden_states_scaling_factor)
        mixed_key_layer, mixed_key_layer_scaling_factor = self.key(hidden_states, hidden_states_scaling_factor)
        mixed_value_layer, mixed_value_layer_scaling_factor = self.value(hidden_states, hidden_states_scaling_factor)

        # Requantization
        query_layer, query_layer_scaling_factor = self.query_activation(
            mixed_query_layer, mixed_query_layer_scaling_factor
        )
        key_layer, key_layer_scaling_factor = self.key_activation(mixed_key_layer, mixed_key_layer_scaling_factor)
        value_layer, value_layer_scaling_factor = self.value_activation(
            mixed_value_layer, mixed_value_layer_scaling_factor
        )

        # Transpose
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
            1, 2
        )
        key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
        value_layer = value_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
            1, 2
        )

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        scale = math.sqrt(self.attention_head_size)
        attention_scores = attention_scores / scale
        if self.quant_mode:
            attention_scores_scaling_factor = query_layer_scaling_factor * key_layer_scaling_factor / scale
        else:
            attention_scores_scaling_factor = None

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in IBertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs, attention_probs_scaling_factor = self.softmax(
            attention_scores, attention_scores_scaling_factor
        )

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        if attention_probs_scaling_factor is not None:
            context_layer_scaling_factor = attention_probs_scaling_factor * value_layer_scaling_factor
        else:
            context_layer_scaling_factor = None

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        # requantization: 32-bit -> 8-bit
        context_layer, context_layer_scaling_factor = self.output_activation(
            context_layer, context_layer_scaling_factor
        )

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        output_scaling_factor = (
            (context_layer_scaling_factor, attention_probs_scaling_factor)
            if output_attentions
            else (context_layer_scaling_factor,)
        )

        return outputs, output_scaling_factor


class IBertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.dense = QuantLinear(
            config.hidden_size,
            config.hidden_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states = self.dropout(hidden_states)
        hidden_states, hidden_states_scaling_factor = self.ln_input_act(
            hidden_states,
            hidden_states_scaling_factor,
            identity=input_tensor,
            identity_scaling_factor=input_tensor_scaling_factor,
        )
        hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor


class IBertAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.self = IBertSelfAttention(config)
        self.output = IBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        self_outputs, self_outputs_scaling_factor = self.self(
            hidden_states,
            hidden_states_scaling_factor,
            attention_mask,
            head_mask,
            output_attentions,
        )
        attention_output, attention_output_scaling_factor = self.output(
            self_outputs[0], self_outputs_scaling_factor[0], hidden_states, hidden_states_scaling_factor
        )
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        outputs_scaling_factor = (attention_output_scaling_factor,) + self_outputs_scaling_factor[1:]
        return outputs, outputs_scaling_factor


class IBertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.dense = QuantLinear(
            config.hidden_size,
            config.intermediate_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        if config.hidden_act != "gelu":
            raise ValueError("I-BERT only supports 'gelu' for `config.hidden_act`")

        # Activation (gelu)
        self.intermediate_act_fn = IntGELU(quant_mode=self.quant_mode, force_dequant=config.force_dequant)
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)

    def forward(self, hidden_states, hidden_states_scaling_factor):
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states, hidden_states_scaling_factor = self.intermediate_act_fn(
            hidden_states, hidden_states_scaling_factor
        )

        # Requantization: 32bit -> 8-bit
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor


class IBertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.weight_bit = 8
        self.bias_bit = 32
        self.ln_input_bit = 22
        self.ln_output_bit = 32

        self.dense = QuantLinear(
            config.intermediate_size,
            config.hidden_size,
            bias=True,
            weight_bit=self.weight_bit,
            bias_bit=self.bias_bit,
            quant_mode=self.quant_mode,
            per_channel=True,
        )
        self.ln_input_act = QuantAct(self.ln_input_bit, quant_mode=self.quant_mode)
        self.LayerNorm = IntLayerNorm(
            config.hidden_size,
            eps=config.layer_norm_eps,
            output_bit=self.ln_output_bit,
            quant_mode=self.quant_mode,
            force_dequant=config.force_dequant,
        )
        self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, hidden_states_scaling_factor, input_tensor, input_tensor_scaling_factor):
        hidden_states, hidden_states_scaling_factor = self.dense(hidden_states, hidden_states_scaling_factor)
        hidden_states = self.dropout(hidden_states)
        hidden_states, hidden_states_scaling_factor = self.ln_input_act(
            hidden_states,
            hidden_states_scaling_factor,
            identity=input_tensor,
            identity_scaling_factor=input_tensor_scaling_factor,
        )
        hidden_states, hidden_states_scaling_factor = self.LayerNorm(hidden_states, hidden_states_scaling_factor)
        hidden_states, hidden_states_scaling_factor = self.output_activation(
            hidden_states, hidden_states_scaling_factor
        )
        return hidden_states, hidden_states_scaling_factor


class IBertLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.act_bit = 8
        self.seq_len_dim = 1
        self.attention = IBertAttention(config)
        self.intermediate = IBertIntermediate(config)
        self.output = IBertOutput(config)
        self.pre_intermediate_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)
        self.pre_output_act = QuantAct(self.act_bit, quant_mode=self.quant_mode)

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
    ):
        self_attention_outputs, self_attention_outputs_scaling_factor = self.attention(
            hidden_states,
            hidden_states_scaling_factor,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        attention_output_scaling_factor = self_attention_outputs_scaling_factor[0]

        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        layer_output, layer_output_scaling_factor = self.feed_forward_chunk(
            attention_output, attention_output_scaling_factor
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output, attention_output_scaling_factor):
        attention_output, attention_output_scaling_factor = self.pre_intermediate_act(
            attention_output, attention_output_scaling_factor
        )
        intermediate_output, intermediate_output_scaling_factor = self.intermediate(
            attention_output, attention_output_scaling_factor
        )

        intermediate_output, intermediate_output_scaling_factor = self.pre_output_act(
            intermediate_output, intermediate_output_scaling_factor
        )
        layer_output, layer_output_scaling_factor = self.output(
            intermediate_output, intermediate_output_scaling_factor, attention_output, attention_output_scaling_factor
        )
        return layer_output, layer_output_scaling_factor


class IBertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.quant_mode = config.quant_mode
        self.layer = nn.ModuleList([IBertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states,
        hidden_states_scaling_factor,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = None  # `config.add_cross_attention` is not supported

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                hidden_states_scaling_factor,
                attention_mask,
                layer_head_mask,
                output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class IBertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.quant_mode = config.quant_mode
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class IBertPreTrainedModel(PreTrainedModel):
    config: IBertConfig
    base_model_prefix = "ibert"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (QuantLinear, nn.Linear)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (QuantEmbedding, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (IntLayerNorm, nn.LayerNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, IBertLMHead):
            module.bias.data.zero_()

    def resize_token_embeddings(self, new_num_tokens=None):
        raise NotImplementedError("`resize_token_embeddings` is not supported for I-BERT.")


@auto_docstring
class IBertModel(IBertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    """

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config
        self.quant_mode = config.quant_mode

        self.embeddings = IBertEmbeddings(config)
        self.encoder = IBertEncoder(config)

        self.pooler = IBertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, tuple[torch.FloatTensor]]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves in which case
        # we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, embedding_output_scaling_factor = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            embedding_output_scaling_factor,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


@auto_docstring
class IBertForMaskedLM(IBertPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.lm_head = IBertLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings
        self.lm_head.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class IBertLMHead(nn.Module):
    """I-BERT Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x

    def _tie_weights(self) -> None:
        # For accelerate compatibility and to not break backward compatibility
        if self.decoder.bias.device.type == "meta":
            self.decoder.bias = self.bias
        else:
            self.bias = self.decoder.bias


@auto_docstring(
    custom_intro="""
    I-BERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class IBertForSequenceClassification(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.classifier = IBertClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class IBertForMultipleChoice(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.ibert = IBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.ibert(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class IBertForTokenClassification(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class IBertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        hidden_states = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


@auto_docstring
class IBertForQuestionAnswering(IBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.ibert = IBertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[QuestionAnsweringModelOutput, tuple[torch.FloatTensor]]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ibert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's *utils.make_positions*.

    Args:
    input_ids (`torch.LongTensor`):
           Indices of input sequence tokens in the vocabulary.

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = [
    "IBertForMaskedLM",
    "IBertForMultipleChoice",
    "IBertForQuestionAnswering",
    "IBertForSequenceClassification",
    "IBertForTokenClassification",
    "IBertModel",
    "IBertPreTrainedModel",
]
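
# Minimal usage sketch. When `config.quant_mode=True`, the integer-only kernels
# above (QuantLinear, IntGELU, IntSoftmax, IntLayerNorm) simulate 8-bit
# arithmetic and thread (tensor, scaling_factor) pairs through every layer;
# with `quant_mode=False` the model runs as an ordinary RoBERTa-style encoder.
# The checkpoint name "kssteven/ibert-roberta-base" is assumed here for
# illustration; any I-BERT checkpoint on the Hub should behave the same way.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("kssteven/ibert-roberta-base")
    model = IBertModel.from_pretrained("kssteven/ibert-roberta-base")

    inputs = tokenizer("Integer-only inference with I-BERT.", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # (batch_size, sequence_length, hidden_size)
    print(outputs.last_hidden_state.shape)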