o
    wi                     @   sB  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dlm	Z	 d dl
mZmZmZ ddlmZ ddlmZmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZm Z m!Z! ddl"m#Z# e!$e%Z&dd Z'G dd de	j(Z)e	j*e)dZ+G dd de	j(Z,G dd de	j(Z-G dd de	j(Z.G dd de	j(Z/G dd de	j(Z0G dd de	j(Z1G dd  d e	j(Z2G d!d" d"e	j(Z3G d#d$ d$e	j(Z4G d%d& d&e	j(Z5G d'd( d(e	j(Z6G d)d* d*e	j(Z7G d+d, d,e	j(Z8G d-d. d.e	j(Z9G d/d0 d0e	j(Z:G d1d2 d2e	j(Z;G d3d4 d4e	j(Z<G d5d6 d6e	j(Z=e G d7d8 d8eZ>ee d9d:G d;d< d<eZ?e G d=d> d>e>Z@e d?d:G d@dA dAe>ZAe G dBdC dCe>ZBG dDdE dEe	j(ZCe dFd:G dGdH dHe>ZDe dId:G dJdK dKe>ZEe G dLdM dMe>ZFe G dNdO dOe>ZGe G dPdQ dQe>ZHg dRZIdS )S    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )MobileBertConfigc                 C   s  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6t||	D ]\}
}|
dd}
|
d	d
}
|
dd}
|
dd}
|
d}
tdd |
D rt	dd|
  q\| }|
D ]~}|d|r|d|}n|g}|d dks|d dkrt|d}nI|d dks|d dkrt|d}n7|d dkrt|d}n+|d dkrt|d}nz	t||d }W n ty   t	dd|
  Y qw t|dkrt|d }|| }q|d d d!kr%t|d}n
|dkr/||}z|j|jksDJ d"|j d#|j d$W n ty^ } z| j|j|jf7  _ d}~ww t	d%|
  t||_q\| S )&z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape 	ffn_layerffnFakeLayerNorm	LayerNormextra_output_weightszdense/kernelbert
mobilebert/c                 s   s    | ]}|d v V  qdS ))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0nr(   r(   o/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/mobilebert/modeling_mobilebert.py	<genexpr>V   s
    
z0load_tf_weights_in_mobilebert.<locals>.<genexpr>z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipreplacesplitanyjoin	fullmatchgetattrAttributeErrorlenint	transposeshapeAssertionErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathr8   nptftf_path	init_varsnamesarraysnamerQ   arraypointerm_namescope_namesnumer(   r(   r+   load_tf_weights_in_mobilebert5   s   



rg   c                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )	NoNormNc                    s2   t    tt|| _tt|| _d S N)	super__init__r   	ParameterrT   zerosr2   onesr/   )self	feat_sizeeps	__class__r(   r+   rk      s   
zNoNorm.__init__input_tensorreturnc                 C   s   || j  | j S ri   )r/   r2   )ro   rt   r(   r(   r+   forward   s   zNoNorm.forwardri   __name__
__module____qualname__rk   rT   Tensorrv   __classcell__r(   r(   rr   r+   rh      s    rh   )
layer_normno_normc                       sb   e Zd ZdZ fddZ				ddeej deej deej deej d	ej	f
d
dZ
  ZS )MobileBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    |j| _|j| _|j| _tj|j|j|jd| _	t|j
|j| _t|j|j| _| jr4dnd}| j| }t||j| _t|j |j| _t|j| _| jdt|j
ddd d S )N)padding_idxr	   r   position_ids)r   F)
persistent)rj   rk   trigram_inputembedding_sizehidden_sizer   	Embedding
vocab_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsLinearembedding_transformationNORM2FNnormalization_typer   Dropouthidden_dropout_probdropoutregister_bufferrT   arangeexpand)ro   rX   embed_dim_multiplierembedded_input_sizerr   r(   r+   rk      s   


zMobileBertEmbeddings.__init__N	input_idstoken_type_idsr   inputs_embedsru   c           
      C   s*  |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| jrktjt	j
j|d d dd f g ddd|t	j
j|d d d df g dddgdd	}| jst| j| jkry| |}| |}| |}|| | }	| |	}	| |	}	|	S )
Nr   r   dtypedevice)r   r   r   r   r   r           )value)r   r   r   r   r   r   r6   dim)sizer   rT   rm   longr   r   r   catr   
functionalpadr   r   r   r   r   r   r   )
ro   r   r   r   r   input_shape
seq_lengthr   r   
embeddingsr(   r(   r+   rv      s4   

$$




zMobileBertEmbeddings.forward)NNNN)rx   ry   rz   __doc__rk   r   rT   
LongTensorFloatTensorr{   rv   r|   r(   r(   rr   r+   r      s$    r   c                       sn   e Zd Z fddZdd Z			ddejdejdejd	eej d
eej dee	 de
ej fddZ  ZS )MobileBertSelfAttentionc                    s   t    |j| _t|j|j | _| j| j | _t|j| j| _	t|j| j| _
t|jr3|jn|j| j| _t|j| _d S ri   )rj   rk   num_attention_headsrO   true_hidden_sizeattention_head_sizeall_head_sizer   r   querykeyuse_bottleneck_attentionr   r   r   attention_probs_dropout_probr   ro   rX   rr   r(   r+   rk      s   
z MobileBertSelfAttention.__init__c                 C   s6   |  d d | j| jf }||}|ddddS )Nr   r   r6   r   r	   )r   r   r   viewpermute)ro   xnew_x_shaper(   r(   r+   transpose_for_scores   s   
z,MobileBertSelfAttention.transpose_for_scoresNquery_tensor
key_tensorvalue_tensorattention_mask	head_maskoutput_attentionsru   c                 C   s   |  |}| |}| |}	| |}
| |}| |	}t|
|dd}|t| j	 }|d ur8|| }t
jj|dd}| |}|d urM|| }t||}|dddd }| d d | jf }||}|rv||f}|S |f}|S )Nr   r   r   r6   r   r	   )r   r   r   r   rT   matmulrP   mathsqrtr   r   r   softmaxr   r   
contiguousr   r   r   )ro   r   r   r   r   r   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr(   r(   r+   rv      s,   
	






zMobileBertSelfAttention.forwardNNN)rx   ry   rz   rk   r   rT   r{   r   r   booltuplerv   r|   r(   r(   rr   r+   r      s*    
r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )MobileBertSelfOutputc                    sX   t    |j| _t|j|j| _t|j |j|j	d| _
| js*t|j| _d S d S Nrq   )rj   rk   use_bottleneckr   r   r   denser   r   layer_norm_epsr   r   r   r   r   rr   r(   r+   rk     s   
zMobileBertSelfOutput.__init__hidden_statesresidual_tensorru   c                 C   s,   |  |}| js| |}| || }|S ri   )r   r   r   r   ro   r   r   layer_outputsr(   r(   r+   rv     s
   

zMobileBertSelfOutput.forwardrw   r(   r(   rr   r+   r     s    $r   c                       st   e Zd Z fddZdd Z			ddejdejdejd	ejd
eej deej dee	 de
ej fddZ  ZS )MobileBertAttentionc                    s*   t    t|| _t|| _t | _d S ri   )rj   rk   r   ro   r   outputsetpruned_headsr   rr   r(   r+   rk   (  s   


zMobileBertAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )rN   r   ro   r   r   r   r   r   r   r   r   r   r   union)ro   headsindexr(   r(   r+   prune_heads.  s   zMobileBertAttention.prune_headsNr   r   r   layer_inputr   r   r   ru   c                 C   s:   |  ||||||}| |d |}	|	f|dd   }
|
S )Nr   r   )ro   r   )ro   r   r   r   r   r   r   r   self_outputsattention_outputr   r(   r(   r+   rv   @  s   

zMobileBertAttention.forwardr   )rx   ry   rz   rk   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r   '  s.    	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )MobileBertIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S ri   )rj   rk   r   r   r   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnr   rr   r(   r+   rk   Z  s
   
zMobileBertIntermediate.__init__r   ru   c                 C   s   |  |}| |}|S ri   )r   r   ro   r   r(   r(   r+   rv   b     

zMobileBertIntermediate.forwardrw   r(   r(   rr   r+   r   Y  s    r   c                       r   )OutputBottleneckc                    sF   t    t|j|j| _t|j |j|j	d| _
t|j| _d S r   )rj   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   rr   r(   r+   rk   i  s   
zOutputBottleneck.__init__r   r   ru   c                 C   s&   |  |}| |}| || }|S ri   )r   r   r   r   r(   r(   r+   rv   o  s   

zOutputBottleneck.forwardrw   r(   r(   rr   r+   r   h  s    $r   c                       s>   e Zd Z fddZdejdejdejdejfddZ  ZS )	MobileBertOutputc                    s\   t    |j| _t|j|j| _t|j	 |j| _
| js't|j| _d S t|| _d S ri   )rj   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   
bottleneckr   rr   r(   r+   rk   w  s   
zMobileBertOutput.__init__intermediate_statesresidual_tensor_1residual_tensor_2ru   c                 C   sJ   |  |}| js| |}| || }|S | || }| ||}|S ri   )r   r   r   r   r   )ro   r   r   r   layer_outputr(   r(   r+   rv     s   

zMobileBertOutput.forwardrw   r(   r(   rr   r+   r   v  s    
r   c                       r   )BottleneckLayerc                    8   t    t|j|j| _t|j |j|j	d| _
d S r   )rj   rk   r   r   r   intra_bottleneck_sizer   r   r   r   r   r   rr   r(   r+   rk        
zBottleneckLayer.__init__r   ru   c                 C   s   |  |}| |}|S ri   r   r   )ro   r   r   r(   r(   r+   rv     r   zBottleneckLayer.forwardrw   r(   r(   rr   r+   r         r   c                       s6   e Zd Z fddZdejdeej fddZ  ZS )
Bottleneckc                    s<   t    |j| _|j| _t|| _| jrt|| _d S d S ri   )rj   rk   key_query_shared_bottleneckr   r   input	attentionr   rr   r(   r+   rk     s   

zBottleneck.__init__r   ru   c                 C   sB   |  |}| jr|fd S | jr| |}||||fS ||||fS )N   )r  r   r  r  )ro   r   bottlenecked_hidden_statesshared_attention_inputr(   r(   r+   rv     s   


zBottleneck.forward	rx   ry   rz   rk   rT   r{   r   rv   r|   r(   r(   rr   r+   r    s    "r  c                       r   )	FFNOutputc                    r   r   )rj   rk   r   r   r   r   r   r   r   r   r   r   rr   r(   r+   rk     r  zFFNOutput.__init__r   r   ru   c                 C   s   |  |}| || }|S ri   r  r   r(   r(   r+   rv     s   
zFFNOutput.forwardrw   r(   r(   rr   r+   r    s    $r  c                       r   )FFNLayerc                    s"   t    t|| _t|| _d S ri   )rj   rk   r   intermediater  r   r   rr   r(   r+   rk     s   

zFFNLayer.__init__r   ru   c                 C   s   |  |}| ||}|S ri   )r  r   )ro   r   intermediate_outputr   r(   r(   r+   rv     s   
zFFNLayer.forwardrw   r(   r(   rr   r+   r    r  r  c                       sZ   e Zd Z fddZ			ddejdeej deej dee de	ej f
d	d
Z
  ZS )MobileBertLayerc                    s~   t     j| _ j| _t | _t | _t | _	| jr$t
 | _ jdkr=t fddt jd D | _d S d S )Nr   c                       g | ]}t  qS r(   )r  r)   _rX   r(   r+   
<listcomp>      z,MobileBertLayer.__init__.<locals>.<listcomp>)rj   rk   r   num_feedforward_networksr   r  r   r  r   r   r  r   r   
ModuleListranger   r   rr   r  r+   rk     s   





(zMobileBertLayer.__init__Nr   r   r   r   ru   c              	   C   s   | j r| |\}}}}n	|gd \}}}}| j|||||||d}	|	d }
|
f}|	dd  }| jdkrGt| jD ]\}}||
}
||
f7 }q9| |
}| ||
|}|f| t	d|||||
|f | }|S )Nr	  )r   r   r   i  )
r   r   r  r  	enumerater   r  r   rT   tensor)ro   r   r   r   r   r   r   r   r   self_attention_outputsr   sr   i
ffn_moduler  r   r(   r(   r+   rv     sJ   	

zMobileBertLayer.forwardr   )rx   ry   rz   rk   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r    s     r  c                       sp   e Zd Z fddZ					ddejdeej deej d	ee d
ee dee de	e
ef fddZ  ZS )MobileBertEncoderc                    s.   t    t fddt jD | _d S )Nc                    r  r(   )r  r  r  r(   r+   r    r  z.MobileBertEncoder.__init__.<locals>.<listcomp>)rj   rk   r   r  r  num_hidden_layerslayerr   rr   r  r+   rk     s   
$zMobileBertEncoder.__init__NFTr   r   r   r   output_hidden_statesreturn_dictru   c                 C   s   |rdnd }|r
dnd }t | jD ]!\}	}
|r||f }|
||||	 |}|d }|r2||d f }q|r:||f }|sHtdd |||fD S t|||dS )Nr(   r   r   c                 s   s    | ]	}|d ur|V  qd S ri   r(   )r)   vr(   r(   r+   r,   9  s    z,MobileBertEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)r  r#  r   r   )ro   r   r   r   r   r$  r%  all_hidden_statesall_attentionsr  layer_moduler   r(   r(   r+   rv     s,   	

zMobileBertEncoder.forward)NNFFT)rx   ry   rz   rk   rT   r{   r   r   r   r   r   r   rv   r|   r(   r(   rr   r+   r!    s,    
r!  c                       r   )MobileBertPoolerc                    s2   t    |j| _| jrt|j|j| _d S d S ri   )rj   rk   classifier_activationdo_activater   r   r   r   r   rr   r(   r+   rk   @  s
   
zMobileBertPooler.__init__r   ru   c                 C   s2   |d d df }| j s|S | |}t|}|S )Nr   )r.  r   rT   tanh)ro   r   first_token_tensorpooled_outputr(   r(   r+   rv   F  s   

zMobileBertPooler.forwardrw   r(   r(   rr   r+   r,  ?  s    r,  c                       r   )!MobileBertPredictionHeadTransformc                    sX   t    t|j|j| _t|jtrt	|j | _
n|j| _
td |j|jd| _d S )Nr}   r   )rj   rk   r   r   r   r   r   r   r   r
   transform_act_fnr   r   r   r   rr   r(   r+   rk   S  s   
z*MobileBertPredictionHeadTransform.__init__r   ru   c                 C   s"   |  |}| |}| |}|S ri   )r   r3  r   r   r(   r(   r+   rv   \  s   


z)MobileBertPredictionHeadTransform.forwardrw   r(   r(   rr   r+   r2  R  s    	r2  c                       s<   e Zd Z fddZd
ddZdejdejfdd	Z  ZS )MobileBertLMPredictionHeadc                    sh   t    t|| _tj|j|j|j dd| _	tj|j|jdd| _
tt|j| _| j| j
_d S )NF)r2   )rj   rk   r2  	transformr   r   r   r   r   r   decoderrl   rT   rm   r2   r   rr   r(   r+   rk   d  s   

z#MobileBertLMPredictionHead.__init__ru   Nc                 C   s   | j | j_ d S ri   )r2   r6  ro   r(   r(   r+   _tie_weightso  s   z'MobileBertLMPredictionHead._tie_weightsr   c                 C   s>   |  |}|tj| jj | jjgdd}|| jj7 }|S )Nr   r   )	r5  r   rT   r   r6  r/   tr   r2   r   r(   r(   r+   rv   r  s   
$z"MobileBertLMPredictionHead.forward)ru   N)	rx   ry   rz   rk   r8  rT   r{   rv   r|   r(   r(   rr   r+   r4  c  s    
r4  c                       r   )MobileBertOnlyMLMHeadc                    s   t    t|| _d S ri   )rj   rk   r4  predictionsr   rr   r(   r+   rk   z  s   
zMobileBertOnlyMLMHead.__init__sequence_outputru   c                 C      |  |}|S ri   )r;  )ro   r<  prediction_scoresr(   r(   r+   rv   ~     
zMobileBertOnlyMLMHead.forwardrw   r(   r(   rr   r+   r:  y      r:  c                       s<   e Zd Z fddZdejdejdeej fddZ  ZS )MobileBertPreTrainingHeadsc                    s(   t    t|| _t|jd| _d S Nr6   )rj   rk   r4  r;  r   r   r   seq_relationshipr   rr   r(   r+   rk     s   

z#MobileBertPreTrainingHeads.__init__r<  r1  ru   c                 C   s   |  |}| |}||fS ri   )r;  rC  )ro   r<  r1  r>  seq_relationship_scorer(   r(   r+   rv     s   

z"MobileBertPreTrainingHeads.forwardr  r(   r(   rr   r+   rA    s    (rA  c                   @   s    e Zd ZeZeZdZdd ZdS )MobileBertPreTrainedModelr!   c                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjtfrZ|jj	  |jjd dS t |trg|jj	  dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   r   r/   rV   normal_rX   initializer_ranger2   zero_r   r   r   rh   fill_r4  )ro   moduler(   r(   r+   _init_weights  s    


z'MobileBertPreTrainedModel._init_weightsN)	rx   ry   rz   r   config_classrg   load_tf_weightsbase_model_prefixrM  r(   r(   r(   r+   rE    s
    rE  z6
    Output type of [`MobileBertForPreTraining`].
    )custom_introc                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )MobileBertForPreTrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    Nlossprediction_logitsseq_relationship_logitsr   r(  )rx   ry   rz   r   rS  r   rT   r   __annotations__rT  rU  r   r   r(  r(   r(   r(   r+   rR    s   
 rR  c                       s   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Ze									dde	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e de	e de	e deeef fddZ  ZS )MobileBertModelz2
    https://huggingface.co/papers/2004.02984
    Tc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rj   rk   rX   r   r   r!  encoderr,  pooler	post_init)ro   rX   add_pooling_layerrr   r(   r+   rk     s   

zMobileBertModel.__init__c                 C   s   | j jS ri   r   r   r7  r(   r(   r+   get_input_embeddings  s   z$MobileBertModel.get_input_embeddingsc                 C   s   || j _d S ri   r\  )ro   r   r(   r(   r+   set_input_embeddings  s   z$MobileBertModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrX  r#  r  r   )ro   heads_to_pruner#  r   r(   r(   r+   _prune_heads  s   zMobileBertModel._prune_headsNr   r   r   r   r   r   r$  r   r%  ru   c
                 C   sh  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|d urQ|jn|j}|d u r_tj	|
|d}|d u rltj
|
tj|d}| ||
}| || j j}| j||||d}| j||||||	d}|d }| jd ur| |nd }|	s||f|d	d   S t|||j|jd
S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r   )r   r   r   r   )r   r   r   r$  r%  r   r   )r'  pooler_outputr   r(  )rX   r   r$  use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr   r   rT   rn   rm   r   get_extended_attention_maskget_head_maskr"  r   rX  rY  r   r   r(  )ro   r   r   r   r   r   r   r$  r   r%  r   r   extended_attention_maskembedding_outputencoder_outputsr<  r1  r(   r(   r+   rv     sP   
zMobileBertModel.forward)T)	NNNNNNNNN)rx   ry   rz   r   rk   r]  r^  ra  r   r   rT   r   r   r   r   r   r   rv   r|   r(   r(   rr   r+   rW    sJ    	

rW  z
    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `next sentence prediction (classification)` head.
    c                       s   e Zd ZddgZ fddZdd Zdd Zdd
ee de	j
f fddZe																						ddeej deej deej deej deej deej deej deej deej deej deej deeef fddZ  ZS )MobileBertForPreTrainingcls.predictions.decoder.weightcls.predictions.decoder.biasc                    ,   t  | t|| _t|| _|   d S ri   )rj   rk   rW  r!   rA  clsrZ  r   rr   r(   r+   rk   5  s   

z!MobileBertForPreTraining.__init__c                 C   
   | j jjS ri   ro  r;  r6  r7  r(   r(   r+   get_output_embeddings=     
z.MobileBertForPreTraining.get_output_embeddingsc                 C      || j j_|j| j j_d S ri   ro  r;  r6  r2   ro   new_embeddingsr(   r(   r+   set_output_embeddings@     
z.MobileBertForPreTraining.set_output_embeddingsNnew_num_tokensru   c                    *   | j | jjj|dd| jj_t j|dS NT)rz  
transposed)rz  _get_resized_lm_headro  r;  r   rj   resize_token_embeddingsro   rz  rr   r(   r+   r  D  s   z0MobileBertForPreTraining.resize_token_embeddingsr   r   r   r   r   r   labelsnext_sentence_labelr   r$  r%  c                 C   s   |dur|n| j j}| j|||||||	|
|d	}|dd \}}| ||\}}d}|durS|durSt }||d| j j|d}||dd|d}|| }|sj||f|dd  }|durh|f| S |S t||||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```Nr   r   r   r   r   r   r$  r%  r6   r   )rS  rT  rU  r   r(  )
rX   rc  r!   ro  r   r   r   rR  r   r(  )ro   r   r   r   r   r   r   r  r  r   r$  r%  r   r<  r1  r>  rD  
total_lossloss_fctmasked_lm_lossnext_sentence_lossr   r(   r(   r+   rv   L  s<   +z MobileBertForPreTraining.forwardri   NNNNNNNNNNN)rx   ry   rz   _tied_weights_keysrk   rr  rx  r   rO   r   r   r  r   rT   r   r   r   r   rR  rv   r|   r(   r(   rr   r+   rk  ,  sV    	

rk  c                       s   e Zd ZddgZ fddZdd Zdd Zdd
ee de	j
f fddZe																				ddeej deej deej deej deej deej deej dee dee dee deeef fddZ  ZS )MobileBertForMaskedLMrl  rm  c                    s6   t  | t|dd| _t|| _|| _|   d S NF)r[  )rj   rk   rW  r!   r:  ro  rX   rZ  r   rr   r(   r+   rk     s
   
zMobileBertForMaskedLM.__init__c                 C   rp  ri   rq  r7  r(   r(   r+   rr    rs  z+MobileBertForMaskedLM.get_output_embeddingsc                 C   rt  ri   ru  rv  r(   r(   r+   rx    ry  z+MobileBertForMaskedLM.set_output_embeddingsNrz  ru   c                    r{  r|  r~  r  rr   r(   r+   r    s   z-MobileBertForMaskedLM.resize_token_embeddingsr   r   r   r   r   r   r  r   r$  r%  c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur8t }||d| j j|d}|
sN|f|dd  }|durL|f| S |S t|||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr  r   r   r6   rS  logitsr   r(  )
rX   rc  r!   ro  r   r   r   r   r   r(  )ro   r   r   r   r   r   r   r  r   r$  r%  r   r<  r>  r  r  r   r(   r(   r+   rv     s6   
zMobileBertForMaskedLM.forwardri   
NNNNNNNNNN)rx   ry   rz   r  rk   rr  rx  r   rO   r   r   r  r   rT   r   r   r   r   r   r   rv   r|   r(   r(   rr   r+   r    sP    		

r  c                       r   )MobileBertOnlyNSPHeadc                    s   t    t|jd| _d S rB  )rj   rk   r   r   r   rC  r   rr   r(   r+   rk     s   
zMobileBertOnlyNSPHead.__init__r1  ru   c                 C   r=  ri   )rC  )ro   r1  rD  r(   r(   r+   rv     r?  zMobileBertOnlyNSPHead.forwardrw   r(   r(   rr   r+   r    r@  r  zZ
    MobileBert Model with a `next sentence prediction (classification)` head on top.
    c                       s   e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eef fddZ  ZS )#MobileBertForNextSentencePredictionc                    rn  ri   )rj   rk   rW  r!   r  ro  rZ  r   rr   r(   r+   rk     s   

z,MobileBertForNextSentencePrediction.__init__Nr   r   r   r   r   r   r  r   r$  r%  ru   c                 K   s   d|v rt dt |d}|
dur|
n| jj}
| j||||||||	|
d	}|d }| |}d}|durEt }||	dd|	d}|
s[|f|dd  }|durY|f| S |S t
|||j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`.

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```r  zoThe `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.Nr  r   r   r6   r  )warningswarnFutureWarningpoprX   rc  r!   ro  r   r   r   r   r(  )ro   r   r   r   r   r   r   r  r   r$  r%  kwargsr   r1  rD  r  r  r   r(   r(   r+   rv     sB   )

z+MobileBertForNextSentencePrediction.forwardr  )rx   ry   rz   rk   r   r   rT   r   r   r   r   r   r   rv   r|   r(   r(   rr   r+   r    sH    		

r  z
    MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee dee dee de	e
ej ef fddZ  ZS )#MobileBertForSequenceClassificationc                    sd   t  | |j| _|| _t|| _|jd ur|jn|j}t	|| _
t|j|j| _|   d S ri   )rj   rk   
num_labelsrX   rW  r!   classifier_dropoutr   r   r   r   r   r   r5   rZ  ro   rX   r  rr   r(   r+   rk   `  s   
z,MobileBertForSequenceClassification.__init__Nr   r   r   r   r   r   r  r   r$  r%  ru   c                 C   sr  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur| j jdu rV| jdkr<d| j _n| jdkrR|jtj	ksM|jtj
krRd| j _nd| j _| j jdkrtt }| jdkrn|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   
regressionsingle_label_classificationmulti_label_classificationr   r6   r  )rX   rc  r!   r   r5   problem_typer  r   rT   r   rO   r   squeezer   r   r   r   r   r(  )ro   r   r   r   r   r   r   r  r   r$  r%  r   r1  r  rS  r  r   r(   r(   r+   rv   o  sV   



"


z+MobileBertForSequenceClassification.forwardr  )rx   ry   rz   rk   r   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r  X  sH    	
r  c                       s   e Zd Z fddZe											ddeej deej deej deej deej d	eej d
eej deej dee dee dee de	e
ej ef fddZ  ZS )MobileBertForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r  )
rj   rk   r  rW  r!   r   r   r   
qa_outputsrZ  r   rr   r(   r+   rk     s
   z'MobileBertForQuestionAnswering.__init__Nr   r   r   r   r   r   start_positionsend_positionsr   r$  r%  ru   c                 C   sH  |d ur|n| j j}| j|||||||	|
|d	}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrO|d}t| dkr\|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr  r   r   r   r   )ignore_indexr6   )rS  start_logits
end_logitsr   r(  )rX   rc  r!   r  rH   r  r   rN   r   clampr   r   r   r(  )ro   r   r   r   r   r   r   r  r  r   r$  r%  r   r<  r  r  r  r  ignored_indexr  
start_lossend_lossr   r(   r(   r+   rv     sP   






z&MobileBertForQuestionAnswering.forwardr  )rx   ry   rz   rk   r   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r    sN    
	
r  c                       r  )MobileBertForMultipleChoicec                    sT   t  | t|| _|jd ur|jn|j}t|| _t	|j
d| _|   d S )Nr   )rj   rk   rW  r!   r  r   r   r   r   r   r   r5   rZ  r  rr   r(   r+   rk   
  s   
z$MobileBertForMultipleChoice.__init__Nr   r   r   r   r   r   r  r   r$  r%  ru   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   r   r  r6   r  )rX   rc  rQ   r   r   r!   r   r5   r   r   r   r(  )ro   r   r   r   r   r   r   r  r   r$  r%  num_choicesr   r1  r  reshaped_logitsrS  r  r   r(   r(   r+   rv     sL   ,


z#MobileBertForMultipleChoice.forwardr  )rx   ry   rz   rk   r   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r    sH    	
r  c                       r  ) MobileBertForTokenClassificationc                    sb   t  | |j| _t|dd| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S r  )rj   rk   r  rW  r!   r  r   r   r   r   r   r   r5   rZ  r  rr   r(   r+   rk   v  s   z)MobileBertForTokenClassification.__init__Nr   r   r   r   r   r   r  r   r$  r%  ru   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur<t }||d| j|d}|
sR|f|dd  }|durP|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r6   r  )rX   rc  r!   r   r5   r   r   r  r   r   r(  )ro   r   r   r   r   r   r   r  r   r$  r%  r   r<  r  rS  r  r   r(   r(   r+   rv     s8   

z(MobileBertForTokenClassification.forwardr  )rx   ry   rz   rk   r   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r  s  sH    	
r  )r  r  r  rk  r  r  r  r  rW  rE  rg   )Jr   r>   r  dataclassesr   typingr   r   rT   r   torch.nnr   r   r   activationsr
   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilebertr   
get_loggerrx   r<   rg   Modulerh   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r!  r,  r2  r4  r:  rA  rE  rR  rW  rk  r  r  r  r  r  r  r  __all__r(   r(   r(   r+   <module>   s   (

N
L:2$?*
jiP
]YMjE