o
    	۷i                     @   sB  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dlm	Z	 d dl
mZmZmZ ddlmZ ddlmZmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZm Z m!Z! ddl"m#Z# e!$e%Z&dd Z'G dd de	j(Z)e	j*e)dZ+G dd de	j(Z,G dd de	j(Z-G dd de	j(Z.G dd de	j(Z/G dd de	j(Z0G dd de	j(Z1G dd  d e	j(Z2G d!d" d"e	j(Z3G d#d$ d$e	j(Z4G d%d& d&e	j(Z5G d'd( d(e	j(Z6G d)d* d*e	j(Z7G d+d, d,e	j(Z8G d-d. d.e	j(Z9G d/d0 d0e	j(Z:G d1d2 d2e	j(Z;G d3d4 d4e	j(Z<G d5d6 d6e	j(Z=e G d7d8 d8eZ>ee d9d:G d;d< d<eZ?e G d=d> d>e>Z@e d?d:G d@dA dAe>ZAe G dBdC dCe>ZBG dDdE dEe	j(ZCe dFd:G dGdH dHe>ZDe dId:G dJdK dKe>ZEe G dLdM dMe>ZFe G dNdO dOe>ZGe G dPdQ dQe>ZHg dRZIdS )S    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )MobileBertConfigc                 C   s  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6t||	D ]\}
}|
dd}
|
d	d
}
|
dd}
|
dd}
|
d}
tdd |
D rt	dd|
  q\| }|
D ]~}|d|r|d|}n|g}|d dks|d dkrt|d}nI|d dks|d dkrt|d}n7|d dkrt|d}n+|d dkrt|d}nz	t||d }W n ty   t	dd|
  Y qw t|dkrt|d }|| }q|d d d!kr%t|d}n
|dkr/||}z|j|jksDJ d"|j d#|j d$W n ty^ } z| j|j|jf7  _ d}~ww t	d%|
  t||_q\| S )&z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape 	ffn_layerffnFakeLayerNorm	LayerNormextra_output_weightszdense/kernelbert
mobilebert/c                 s   s    | ]}|d v V  qdS ))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0nr(   r(   h/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/mobilebert/modeling_mobilebert.py	<genexpr>V   s
    
z0load_tf_weights_in_mobilebert.<locals>.<genexpr>z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipreplacesplitanyjoin	fullmatchgetattrAttributeErrorlenint	transposeshapeAssertionErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathr8   nptftf_path	init_varsnamesarraysnamerQ   arraypointerm_namescope_namesnumer(   r(   r+   load_tf_weights_in_mobilebert5   s   



rg   c                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )	NoNormNc                    s2   t    tt|| _tt|| _d S N)	super__init__r   	ParameterrT   zerosr2   onesr/   )self	feat_sizeeps	__class__r(   r+   rk      s   
zNoNorm.__init__input_tensorreturnc                 C   s   || j  | j S ri   )r/   r2   )ro   rt   r(   r(   r+   forward   s   zNoNorm.forwardri   __name__
__module____qualname__rk   rT   Tensorrv   __classcell__r(   r(   rr   r+   rh      s    rh   )
layer_normno_normc                       sb   e Zd ZdZ fddZ				ddeej deej deej deej d	ej	f
d
dZ
  ZS )MobileBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    |j| _|j| _|j| _tj|j|j|jd| _	t|j
|j| _t|j|j| _| jr4dnd}| j| }t||j| _t|j |j| _t|j| _| jdt|j
ddd d S )N)padding_idxr	   r   position_ids)r   F)
persistent)rj   rk   trigram_inputembedding_sizehidden_sizer   	Embedding
vocab_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsLinearembedding_transformationNORM2FNnormalization_typer   Dropouthidden_dropout_probdropoutregister_bufferrT   arangeexpand)ro   rX   embed_dim_multiplierembedded_input_sizerr   r(   r+   rk      s   


zMobileBertEmbeddings.__init__N	input_idstoken_type_idsr   inputs_embedsru   c           
      C   s*  |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| jrktjt	j
j|d d dd f g ddd|t	j
j|d d d df g dddgdd	}| jst| j| jkry| |}| |}| |}|| | }	| |	}	| |	}	|	S )
Nr   r   dtypedevice)r   r   r   r   r   r           )value)r   r   r   r   r   r   r6   dim)sizer   rT   rm   longr   r   r   catr   
functionalpadr   r   r   r   r   r   r   )
ro   r   r   r   r   input_shape
seq_lengthr   r   
embeddingsr(   r(   r+   rv      s4   

$$




zMobileBertEmbeddings.forward)NNNN)rx   ry   rz   __doc__rk   r   rT   
LongTensorFloatTensorr{   rv   r|   r(   r(   rr   r+   r      s$    r   c                       sf   e Zd Z fddZ			ddejdejdejdeej deej d	ee d
e	ej fddZ
  ZS )MobileBertSelfAttentionc                    s   t    |j| _t|j|j | _| j| j | _t|j| j| _	t|j| j| _
t|jr3|jn|j| j| _t|j| _d S ri   )rj   rk   num_attention_headsrO   true_hidden_sizeattention_head_sizeall_head_sizer   r   querykeyuse_bottleneck_attentionr   r   r   attention_probs_dropout_probr   ro   rX   rr   r(   r+   rk      s   
z MobileBertSelfAttention.__init__Nquery_tensor
key_tensorvalue_tensorattention_mask	head_maskoutput_attentionsru   c                 C   s,  |j \}}}	| ||d| j| jdd}
| ||d| j| jdd}| ||d| j| jdd}t	|
|dd}|t
| j }|d urS|| }tjj|dd}| |}|d urh|| }t	||}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr   r   r6   r   r   r	   )rQ   r   viewr   r   rP   r   r   rT   matmulmathsqrtr   r   softmaxr   permute
contiguousr   r   )ro   r   r   r   r   r   r   
batch_sizer   _query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr(   r(   r+   rv      s:   	

zMobileBertSelfAttention.forwardNNNrx   ry   rz   rk   rT   r{   r   r   booltuplerv   r|   r(   r(   rr   r+   r      s(    r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )MobileBertSelfOutputc                    sX   t    |j| _t|j|j| _t|j |j|j	d| _
| js*t|j| _d S d S Nrq   )rj   rk   use_bottleneckr   r   r   denser   r   layer_norm_epsr   r   r   r   r   rr   r(   r+   rk     s   
zMobileBertSelfOutput.__init__hidden_statesresidual_tensorru   c                 C   s,   |  |}| js| |}| || }|S ri   )r   r   r   r   ro   r   r   layer_outputsr(   r(   r+   rv   #  s
   

zMobileBertSelfOutput.forwardrw   r(   r(   rr   r+   r     s    $r   c                       st   e Zd Z fddZdd Z			ddejdejdejd	ejd
eej deej dee	 de
ej fddZ  ZS )MobileBertAttentionc                    s*   t    t|| _t|| _t | _d S ri   )rj   rk   r   ro   r   outputsetpruned_headsr   rr   r(   r+   rk   ,  s   


zMobileBertAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )rN   r   ro   r   r   r   r   r   r   r   r   r   r   union)ro   headsindexr(   r(   r+   prune_heads2  s   zMobileBertAttention.prune_headsNr   r   r   layer_inputr   r   r   ru   c                 C   s:   |  ||||||}| |d |}	|	f|dd   }
|
S )Nr   r   )ro   r   )ro   r   r   r   r   r   r   r   self_outputsattention_outputr   r(   r(   r+   rv   D  s   

zMobileBertAttention.forwardr   )rx   ry   rz   rk   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r   +  s.    	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )MobileBertIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S ri   )rj   rk   r   r   r   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnr   rr   r(   r+   rk   ^  s
   
zMobileBertIntermediate.__init__r   ru   c                 C   s   |  |}| |}|S ri   )r   r   ro   r   r(   r(   r+   rv   f     

zMobileBertIntermediate.forwardrw   r(   r(   rr   r+   r   ]  s    r   c                       r   )OutputBottleneckc                    sF   t    t|j|j| _t|j |j|j	d| _
t|j| _d S r   )rj   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   rr   r(   r+   rk   m  s   
zOutputBottleneck.__init__r   r   ru   c                 C   s&   |  |}| |}| || }|S ri   )r   r   r   r   r(   r(   r+   rv   s  s   

zOutputBottleneck.forwardrw   r(   r(   rr   r+   r   l  s    $r   c                       s>   e Zd Z fddZdejdejdejdejfddZ  ZS )	MobileBertOutputc                    s\   t    |j| _t|j|j| _t|j	 |j| _
| js't|j| _d S t|| _d S ri   )rj   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   
bottleneckr   rr   r(   r+   rk   {  s   
zMobileBertOutput.__init__intermediate_statesresidual_tensor_1residual_tensor_2ru   c                 C   sJ   |  |}| js| |}| || }|S | || }| ||}|S ri   )r   r   r   r   r   )ro   r   r   r   layer_outputr(   r(   r+   rv     s   

zMobileBertOutput.forwardrw   r(   r(   rr   r+   r   z  s    
r   c                       r   )BottleneckLayerc                    8   t    t|j|j| _t|j |j|j	d| _
d S r   )rj   rk   r   r   r   intra_bottleneck_sizer   r   r   r   r   r   rr   r(   r+   rk        
zBottleneckLayer.__init__r   ru   c                 C   s   |  |}| |}|S ri   r   r   )ro   r   r   r(   r(   r+   rv     r   zBottleneckLayer.forwardrw   r(   r(   rr   r+   r         r   c                       s6   e Zd Z fddZdejdeej fddZ  ZS )
Bottleneckc                    s<   t    |j| _|j| _t|| _| jrt|| _d S d S ri   )rj   rk   key_query_shared_bottleneckr   r   input	attentionr   rr   r(   r+   rk     s   

zBottleneck.__init__r   ru   c                 C   sB   |  |}| jr|fd S | jr| |}||||fS ||||fS )N   )r  r   r  r  )ro   r   bottlenecked_hidden_statesshared_attention_inputr(   r(   r+   rv     s   


zBottleneck.forward	rx   ry   rz   rk   rT   r{   r   rv   r|   r(   r(   rr   r+   r    s    "r  c                       r   )	FFNOutputc                    r   r   )rj   rk   r   r   r   r   r   r   r   r   r   r   rr   r(   r+   rk     r   zFFNOutput.__init__r   r   ru   c                 C   s   |  |}| || }|S ri   r   r   r(   r(   r+   rv     s   
zFFNOutput.forwardrw   r(   r(   rr   r+   r
    s    $r
  c                       r   )FFNLayerc                    s"   t    t|| _t|| _d S ri   )rj   rk   r   intermediater
  r   r   rr   r(   r+   rk     s   

zFFNLayer.__init__r   ru   c                 C   s   |  |}| ||}|S ri   )r  r   )ro   r   intermediate_outputr   r(   r(   r+   rv     s   
zFFNLayer.forwardrw   r(   r(   rr   r+   r    r  r  c                       sZ   e Zd Z fddZ			ddejdeej deej dee de	ej f
d	d
Z
  ZS )MobileBertLayerc                    s~   t     j| _ j| _t | _t | _t | _	| jr$t
 | _ jdkr=t fddt jd D | _d S d S )Nr   c                       g | ]}t  qS r(   )r  r)   r   rX   r(   r+   
<listcomp>      z,MobileBertLayer.__init__.<locals>.<listcomp>)rj   rk   r   num_feedforward_networksr   r  r   r  r   r   r  r   r   
ModuleListranger   r   rr   r  r+   rk     s   





(zMobileBertLayer.__init__Nr   r   r   r   ru   c              	   C   s   | j r| |\}}}}n	|gd \}}}}| j|||||||d}	|	d }
|
f}|	dd  }| jdkrGt| jD ]\}}||
}
||
f7 }q9| |
}| ||
|}|f| t	d|||||
|f | }|S )Nr  )r   r   r   i  )
r   r   r  r  	enumerater   r  r   rT   tensor)ro   r   r   r   r   r   r   r   r   self_attention_outputsr   sr   i
ffn_moduler  r   r(   r(   r+   rv     sJ   	

zMobileBertLayer.forwardr   r   r(   r(   rr   r+   r    s     r  c                       sp   e Zd Z fddZ					ddejdeej deej d	ee d
ee dee de	e
ef fddZ  ZS )MobileBertEncoderc                    s.   t    t fddt jD | _d S )Nc                    r  r(   )r  r  r  r(   r+   r    r  z.MobileBertEncoder.__init__.<locals>.<listcomp>)rj   rk   r   r  r  num_hidden_layerslayerr   rr   r  r+   rk     s   
$zMobileBertEncoder.__init__NFTr   r   r   r   output_hidden_statesreturn_dictru   c                 C   s   |rdnd }|r
dnd }t | jD ]!\}	}
|r||f }|
||||	 |}|d }|r2||d f }q|r:||f }|sHtdd |||fD S t|||dS )Nr(   r   r   c                 s   s    | ]	}|d ur|V  qd S ri   r(   )r)   vr(   r(   r+   r,   =  s    z,MobileBertEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)r  r  r   r   )ro   r   r   r   r   r   r!  all_hidden_statesall_attentionsr  layer_moduler   r(   r(   r+   rv     s,   	

zMobileBertEncoder.forward)NNFFT)rx   ry   rz   rk   rT   r{   r   r   r   r   r   r   rv   r|   r(   r(   rr   r+   r    s,    
r  c                       r   )MobileBertPoolerc                    s2   t    |j| _| jrt|j|j| _d S d S ri   )rj   rk   classifier_activationdo_activater   r   r   r   r   rr   r(   r+   rk   D  s
   
zMobileBertPooler.__init__r   ru   c                 C   s2   |d d df }| j s|S | |}t|}|S )Nr   )r*  r   rT   tanh)ro   r   first_token_tensorpooled_outputr(   r(   r+   rv   J  s   

zMobileBertPooler.forwardrw   r(   r(   rr   r+   r(  C  s    r(  c                       r   )!MobileBertPredictionHeadTransformc                    sX   t    t|j|j| _t|jtrt	|j | _
n|j| _
td |j|jd| _d S )Nr}   r   )rj   rk   r   r   r   r   r   r   r   r
   transform_act_fnr   r   r   r   rr   r(   r+   rk   W  s   
z*MobileBertPredictionHeadTransform.__init__r   ru   c                 C   s"   |  |}| |}| |}|S ri   )r   r/  r   r   r(   r(   r+   rv   `  s   


z)MobileBertPredictionHeadTransform.forwardrw   r(   r(   rr   r+   r.  V  s    	r.  c                       s<   e Zd Z fddZd
ddZdejdejfdd	Z  ZS )MobileBertLMPredictionHeadc                    sh   t    t|| _tj|j|j|j dd| _	tj|j|jdd| _
tt|j| _| j| j
_d S )NF)r2   )rj   rk   r.  	transformr   r   r   r   r   r   decoderrl   rT   rm   r2   r   rr   r(   r+   rk   h  s   

z#MobileBertLMPredictionHead.__init__ru   Nc                 C   s   | j | j_ d S ri   )r2   r2  ro   r(   r(   r+   _tie_weightss  s   z'MobileBertLMPredictionHead._tie_weightsr   c                 C   s>   |  |}|tj| jj | jjgdd}|| jj7 }|S )Nr   r   )	r1  r   rT   r   r2  r/   tr   r2   r   r(   r(   r+   rv   v  s   
$z"MobileBertLMPredictionHead.forward)ru   N)	rx   ry   rz   rk   r4  rT   r{   rv   r|   r(   r(   rr   r+   r0  g  s    
r0  c                       r   )MobileBertOnlyMLMHeadc                    s   t    t|| _d S ri   )rj   rk   r0  predictionsr   rr   r(   r+   rk   ~  s   
zMobileBertOnlyMLMHead.__init__sequence_outputru   c                 C      |  |}|S ri   )r7  )ro   r8  prediction_scoresr(   r(   r+   rv        
zMobileBertOnlyMLMHead.forwardrw   r(   r(   rr   r+   r6  }      r6  c                       s<   e Zd Z fddZdejdejdeej fddZ  ZS )MobileBertPreTrainingHeadsc                    s(   t    t|| _t|jd| _d S Nr6   )rj   rk   r0  r7  r   r   r   seq_relationshipr   rr   r(   r+   rk     s   

z#MobileBertPreTrainingHeads.__init__r8  r-  ru   c                 C   s   |  |}| |}||fS ri   )r7  r?  )ro   r8  r-  r:  seq_relationship_scorer(   r(   r+   rv     s   

z"MobileBertPreTrainingHeads.forwardr	  r(   r(   rr   r+   r=    s    (r=  c                   @   s&   e Zd ZU eed< eZdZdd ZdS )MobileBertPreTrainedModelrX   r!   c                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjtfrZ|jj	  |jjd dS t |trg|jj	  dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   r   r/   rV   normal_rX   initializer_ranger2   zero_r   r   r   rh   fill_r0  )ro   moduler(   r(   r+   _init_weights  s    


z'MobileBertPreTrainedModel._init_weightsN)	rx   ry   rz   r   __annotations__rg   load_tf_weightsbase_model_prefixrI  r(   r(   r(   r+   rA    s
   
 rA  z6
    Output type of [`MobileBertForPreTraining`].
    )custom_introc                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )MobileBertForPreTrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    Nlossprediction_logitsseq_relationship_logitsr   r$  )rx   ry   rz   r   rO  r   rT   r   rJ  rP  rQ  r   r   r$  r(   r(   r(   r+   rN    s   
 rN  c                       s   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Ze									dde	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e de	e de	e deeef fddZ  ZS )MobileBertModelz2
    https://huggingface.co/papers/2004.02984
    Tc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rj   rk   rX   r   r   r  encoderr(  pooler	post_init)ro   rX   add_pooling_layerrr   r(   r+   rk     s   

zMobileBertModel.__init__c                 C   s   | j jS ri   r   r   r3  r(   r(   r+   get_input_embeddings  s   z$MobileBertModel.get_input_embeddingsc                 C   s   || j _d S ri   rW  )ro   r   r(   r(   r+   set_input_embeddings  s   z$MobileBertModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrS  r  r  r   )ro   heads_to_pruner  r   r(   r(   r+   _prune_heads  s   zMobileBertModel._prune_headsNr   r   r   r   r   r   r   r   r!  ru   c
                 C   sh  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|d urQ|jn|j}|d u r_tj	|
|d}|d u rltj
|
tj|d}| ||
}| || j j}| j||||d}| j||||||	d}|d }| jd ur| |nd }|	s||f|d	d   S t|||j|jd
S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r   )r   r   r   r   )r   r   r   r   r!  r   r   )r#  pooler_outputr   r$  )rX   r   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr   r   rT   rn   rm   r   get_extended_attention_maskget_head_maskr  r   rS  rT  r   r   r$  )ro   r   r   r   r   r   r   r   r   r!  r   r   extended_attention_maskembedding_outputencoder_outputsr8  r-  r(   r(   r+   rv     sP   
zMobileBertModel.forward)T)	NNNNNNNNN)rx   ry   rz   r   rk   rX  rY  r\  r   r   rT   r   r   r   r   r   r   rv   r|   r(   r(   rr   r+   rR    sJ    	

rR  z
    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `next sentence prediction (classification)` head.
    c                       s   e Zd ZddgZ fddZdd Zdd Zdd
ee de	j
f fddZe																						ddeej deej deej deej deej deej deej deej deej deej deej deeef fddZ  ZS )MobileBertForPreTrainingcls.predictions.decoder.weightcls.predictions.decoder.biasc                    ,   t  | t|| _t|| _|   d S ri   )rj   rk   rR  r!   r=  clsrU  r   rr   r(   r+   rk   9  s   

z!MobileBertForPreTraining.__init__c                 C   
   | j jjS ri   rj  r7  r2  r3  r(   r(   r+   get_output_embeddingsA     
z.MobileBertForPreTraining.get_output_embeddingsc                 C      || j j_|j| j j_d S ri   rj  r7  r2  r2   ro   new_embeddingsr(   r(   r+   set_output_embeddingsD     
z.MobileBertForPreTraining.set_output_embeddingsNnew_num_tokensru   c                    *   | j | jjj|dd| jj_t j|dS NT)ru  
transposed)ru  _get_resized_lm_headrj  r7  r   rj   resize_token_embeddingsro   ru  rr   r(   r+   r{  H  s   z0MobileBertForPreTraining.resize_token_embeddingsr   r   r   r   r   r   labelsnext_sentence_labelr   r   r!  c                 C   s   |dur|n| j j}| j|||||||	|
|d	}|dd \}}| ||\}}d}|durS|durSt }||d| j j|d}||dd|d}|| }|sj||f|dd  }|durh|f| S |S t||||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```Nr   r   r   r   r   r   r   r!  r6   r   )rO  rP  rQ  r   r$  )
rX   r^  r!   rj  r   r   r   rN  r   r$  )ro   r   r   r   r   r   r   r}  r~  r   r   r!  r   r8  r-  r:  r@  
total_lossloss_fctmasked_lm_lossnext_sentence_lossr   r(   r(   r+   rv   P  s<   +z MobileBertForPreTraining.forwardri   NNNNNNNNNNN)rx   ry   rz   _tied_weights_keysrk   rm  rs  r   rO   r   r   r{  r   rT   r   r   r   r   rN  rv   r|   r(   r(   rr   r+   rf  0  sV    	

rf  c                       s   e Zd ZddgZ fddZdd Zdd Zdd
ee de	j
f fddZe																				ddeej deej deej deej deej deej deej dee dee dee deeef fddZ  ZS )MobileBertForMaskedLMrg  rh  c                    s6   t  | t|dd| _t|| _|| _|   d S NF)rV  )rj   rk   rR  r!   r6  rj  rX   rU  r   rr   r(   r+   rk     s
   
zMobileBertForMaskedLM.__init__c                 C   rk  ri   rl  r3  r(   r(   r+   rm    rn  z+MobileBertForMaskedLM.get_output_embeddingsc                 C   ro  ri   rp  rq  r(   r(   r+   rs    rt  z+MobileBertForMaskedLM.set_output_embeddingsNru  ru   c                    rv  rw  ry  r|  rr   r(   r+   r{    s   z-MobileBertForMaskedLM.resize_token_embeddingsr   r   r   r   r   r   r}  r   r   r!  c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur8t }||d| j j|d}|
sN|f|dd  }|durL|f| S |S t|||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr  r   r   r6   rO  logitsr   r$  )
rX   r^  r!   rj  r   r   r   r   r   r$  )ro   r   r   r   r   r   r   r}  r   r   r!  r   r8  r:  r  r  r   r(   r(   r+   rv     s6   
zMobileBertForMaskedLM.forwardri   
NNNNNNNNNN)rx   ry   rz   r  rk   rm  rs  r   rO   r   r   r{  r   rT   r   r   r   r   r   r   rv   r|   r(   r(   rr   r+   r    sP    		

r  c                       r   )MobileBertOnlyNSPHeadc                    s   t    t|jd| _d S r>  )rj   rk   r   r   r   r?  r   rr   r(   r+   rk     s   
zMobileBertOnlyNSPHead.__init__r-  ru   c                 C   r9  ri   )r?  )ro   r-  r@  r(   r(   r+   rv     r;  zMobileBertOnlyNSPHead.forwardrw   r(   r(   rr   r+   r    r<  r  zZ
    MobileBert Model with a `next sentence prediction (classification)` head on top.
    c                       s   e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eef fddZ  ZS )#MobileBertForNextSentencePredictionc                    ri  ri   )rj   rk   rR  r!   r  rj  rU  r   rr   r(   r+   rk      s   

z,MobileBertForNextSentencePrediction.__init__Nr   r   r   r   r   r   r}  r   r   r!  ru   c                 K   s   d|v rt dt |d}|
dur|
n| jj}
| j||||||||	|
d	}|d }| |}d}|durEt }||	dd|	d}|
s[|f|dd  }|durY|f| S |S t
|||j|jdS )	a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`.

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```r~  zoThe `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.Nr  r   r   r6   r  )warningswarnFutureWarningpoprX   r^  r!   rj  r   r   r   r   r$  )ro   r   r   r   r   r   r   r}  r   r   r!  kwargsr   r-  r@  r  r  r   r(   r(   r+   rv   	  sB   )

z+MobileBertForNextSentencePrediction.forwardr  )rx   ry   rz   rk   r   r   rT   r   r   r   r   r   r   rv   r|   r(   r(   rr   r+   r    sH    		

r  z
    MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee dee dee de	e
ej ef fddZ  ZS )#MobileBertForSequenceClassificationc                    sd   t  | |j| _|| _t|| _|jd ur|jn|j}t	|| _
t|j|j| _|   d S ri   )rj   rk   
num_labelsrX   rR  r!   classifier_dropoutr   r   r   r   r   r   r5   rU  ro   rX   r  rr   r(   r+   rk   d  s   
z,MobileBertForSequenceClassification.__init__Nr   r   r   r   r   r   r}  r   r   r!  ru   c                 C   sr  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur| j jdu rV| jdkr<d| j _n| jdkrR|jtj	ksM|jtj
krRd| j _nd| j _| j jdkrtt }| jdkrn|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   
regressionsingle_label_classificationmulti_label_classificationr   r6   r  )rX   r^  r!   r   r5   problem_typer  r   rT   r   rO   r   squeezer   r   r   r   r   r$  )ro   r   r   r   r   r   r   r}  r   r   r!  r   r-  r  rO  r  r   r(   r(   r+   rv   s  sV   



"


z+MobileBertForSequenceClassification.forwardr  )rx   ry   rz   rk   r   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r  \  sH    	
r  c                       s   e Zd Z fddZe											ddeej deej deej deej deej d	eej d
eej deej dee dee dee de	e
ej ef fddZ  ZS )MobileBertForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r  )
rj   rk   r  rR  r!   r   r   r   
qa_outputsrU  r   rr   r(   r+   rk     s
   z'MobileBertForQuestionAnswering.__init__Nr   r   r   r   r   r   start_positionsend_positionsr   r   r!  ru   c                 C   sH  |d ur|n| j j}| j|||||||	|
|d	}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrO|d}t| dkr\|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr  r   r   r   r   )ignore_indexr6   )rO  start_logits
end_logitsr   r$  )rX   r^  r!   r  rH   r  r   rN   r   clampr   r   r   r$  )ro   r   r   r   r   r   r   r  r  r   r   r!  r   r8  r  r  r  r  ignored_indexr  
start_lossend_lossr   r(   r(   r+   rv     sP   






z&MobileBertForQuestionAnswering.forwardr  )rx   ry   rz   rk   r   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r    sN    
	
r  c                       r  )MobileBertForMultipleChoicec                    sT   t  | t|| _|jd ur|jn|j}t|| _t	|j
d| _|   d S )Nr   )rj   rk   rR  r!   r  r   r   r   r   r   r   r5   rU  r  rr   r(   r+   rk     s   
z$MobileBertForMultipleChoice.__init__Nr   r   r   r   r   r   r}  r   r   r!  ru   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   r   r  r6   r  )rX   r^  rQ   r   r   r!   r   r5   r   r   r   r$  )ro   r   r   r   r   r   r   r}  r   r   r!  num_choicesr   r-  r  reshaped_logitsrO  r  r   r(   r(   r+   rv     sL   ,


z#MobileBertForMultipleChoice.forwardr  )rx   ry   rz   rk   r   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r    sH    	
r  c                       r  ) MobileBertForTokenClassificationc                    sb   t  | |j| _t|dd| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S r  )rj   rk   r  rR  r!   r  r   r   r   r   r   r   r5   rU  r  rr   r(   r+   rk   z  s   z)MobileBertForTokenClassification.__init__Nr   r   r   r   r   r   r}  r   r   r!  ru   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur<t }||d| j|d}|
sR|f|dd  }|durP|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r6   r  )rX   r^  r!   r   r5   r   r   r  r   r   r$  )ro   r   r   r   r   r   r   r}  r   r   r!  r   r8  r  rO  r  r   r(   r(   r+   rv     s8   

z(MobileBertForTokenClassification.forwardr  )rx   ry   rz   rk   r   r   rT   r{   r   r   r   r   rv   r|   r(   r(   rr   r+   r  w  sH    	
r  )r  r  r  rf  r  r  r  r  rR  rA  rg   )Jr   r>   r  dataclassesr   typingr   r   rT   r   torch.nnr   r   r   activationsr
   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilebertr   
get_loggerrx   r<   rg   Modulerh   r   r   r   r   r   r   r   r   r   r   r  r
  r  r  r  r(  r.  r0  r6  r=  rA  rN  rR  rf  r  r  r  r  r  r  r  __all__r(   r(   r(   r+   <module>   s   (

N
L>2$?*
jiP
]YMjE