o
    ei                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZmZm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, e%-e.Z/G dd dej0Z1ej2e1dZ3G dd dej0Z4		dadej0dej5dej5dej5dej5dB de6dB de6d e e# fd!d"Z7G d#d$ d$ej0Z8G d%d& d&ej0Z9G d'd( d(ej0Z:G d)d* d*ej0Z;G d+d, d,ej0Z<G d-d. d.ej0Z=G d/d0 d0ej0Z>G d1d2 d2ej0Z?G d3d4 d4ej0Z@G d5d6 d6ej0ZAG d7d8 d8eZBG d9d: d:ej0ZCG d;d< d<ej0ZDG d=d> d>ej0ZEG d?d@ d@ej0ZFG dAdB dBej0ZGG dCdD dDej0ZHe$G dEdF dFeZIee$dGdHG dIdJ dJe"ZJe$G dKdL dLeIZKe$dMdHG dNdO dOeIZLe$G dPdQ dQeIZMG dRdS dSej0ZNe$dTdHG dUdV dVeIZOe$dWdHG dXdY dYeIZPe$G dZd[ d[eIZQe$G d\d] d]eIZRe$G d^d_ d_eIZSg d`ZTdS )b    )Callable)	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )MobileBertConfigc                       s4   e Zd Zd fdd	ZdejdejfddZ  ZS )	NoNormNc                    s2   t    tt|| _tt|| _d S N)	super__init__r   	Parametertorchzerosbiasonesweight)self	feat_sizeeps	__class__ p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mobilebert/modeling_mobilebert.pyr$   8   s   
zNoNorm.__init__input_tensorreturnc                 C   s   || j  | j S r"   )r*   r(   )r+   r2   r0   r0   r1   forward=   s   zNoNorm.forwardr"   __name__
__module____qualname__r$   r&   Tensorr4   __classcell__r0   r0   r.   r1   r!   7   s    r!   )
layer_normno_normc                       sb   e Zd ZdZ fddZ				ddejdB dejdB dejdB dejdB d	ejf
d
dZ	  Z
S )MobileBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    |j| _|j| _|j| _tj|j|j|jd| _	t|j
|j| _t|j|j| _| jr4dnd}| j| }t||j| _t|j |j| _t|j| _| jdt|j
ddd d S )N)padding_idxr   r   position_idsr   F)
persistent)r#   r$   trigram_inputembedding_sizehidden_sizer   	Embedding
vocab_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsLinearembedding_transformationNORM2FNnormalization_type	LayerNormDropouthidden_dropout_probdropoutregister_bufferr&   arangeexpand)r+   configembed_dim_multiplierembedded_input_sizer.   r0   r1   r$   G   s   


zMobileBertEmbeddings.__init__N	input_idstoken_type_idsr?   inputs_embedsr3   c           
      C   s*  |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u r3tj|tj| jjd}|d u r<| |}| jrktjt	j
j|d d dd f g ddd|t	j
j|d d d df g dddgdd	}| jst| j| jkry| |}| |}| |}|| | }	| |	}	| |	}	|	S )
NrA   r   )dtypedevice)r   r   r   r   r   r           )value)r   r   r   r   r   r      dim)sizer?   r&   r'   longr`   rI   rC   catr   
functionalpadrD   rE   rO   rK   rM   rR   rU   )
r+   r\   r]   r?   r^   input_shape
seq_lengthrK   rM   
embeddingsr0   r0   r1   r4   ]   s4   

$$




zMobileBertEmbeddings.forward)NNNN)r6   r7   r8   __doc__r$   r&   
LongTensorFloatTensorr9   r4   r:   r0   r0   r.   r1   r=   D   s$    r=   ra   modulequerykeyrb   attention_maskscalingrU   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )NrA         rc   r   rd   )ptrainingr   )
rf   r&   matmul	transposer   ri   softmaxrU   ry   
contiguous)
rq   rr   rs   rb   rt   ru   rU   rv   attn_weightsattn_outputr0   r0   r1   eager_attention_forward   s   
r   c                       sX   e Zd Z fddZ	ddejdejdejdejdB dee d	e	ej fd
dZ
  ZS )MobileBertSelfAttentionc                    s   t    || _|j| _t|j|j | _| j| j | _| jd | _t	
|j| j| _t	
|j| j| _t	
|jr<|jn|j| j| _t	|j| _d| _d S )Nrw   F)r#   r$   rY   num_attention_headsinttrue_hidden_sizeattention_head_sizeall_head_sizeru   r   rN   rr   rs   use_bottleneck_attentionrE   rb   rS   attention_probs_dropout_probrU   	is_causalr+   rY   r.   r0   r1   r$      s   

z MobileBertSelfAttention.__init__Nquery_tensor
key_tensorvalue_tensorrt   rv   r3   c                 K   s   |j d d }g |d| jR }| |j| dd}| |j| dd}	| |j| dd}
t| j	j
t}|| ||	|
|f| jsIdn| jj| jd|\}}|jg |dR   }||fS )NrA   r   rc   ra   )rU   ru   )shaper   rr   viewr{   rs   rb   r   get_interfacerY   _attn_implementationr   ry   rU   rx   ru   reshaper}   )r+   r   r   r   rt   rv   rk   hidden_shapequery_layer	key_layervalue_layerattention_interfacer   r~   r0   r0   r1   r4      s,   

zMobileBertSelfAttention.forwardr"   r6   r7   r8   r$   r&   r9   rp   r   r   tupler4   r:   r0   r0   r.   r1   r      s     r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )MobileBertSelfOutputc                    sX   t    |j| _t|j|j| _t|j |j|j	d| _
| js*t|j| _d S d S Nr-   )r#   r$   use_bottleneckr   rN   r   denserP   rQ   layer_norm_epsrR   rS   rT   rU   r   r.   r0   r1   r$      s   
zMobileBertSelfOutput.__init__hidden_statesresidual_tensorr3   c                 C   s,   |  |}| js| |}| || }|S r"   )r   r   rU   rR   r+   r   r   layer_outputsr0   r0   r1   r4      s
   

zMobileBertSelfOutput.forwardr5   r0   r0   r.   r1   r      s    $r   c                       s^   e Zd Z fddZ	ddejdejdejdejdejdB d	ee d
e	ej fddZ
  ZS )MobileBertAttentionc                    "   t    t|| _t|| _d S r"   )r#   r$   r   r+   r   outputr   r.   r0   r1   r$         

zMobileBertAttention.__init__Nr   r   r   layer_inputrt   rv   r3   c           	      K   s0   | j ||||fi |\}}| ||}||fS r"   )r+   r   )	r+   r   r   r   r   rt   rv   attention_outputr~   r0   r0   r1   r4      s   	
	zMobileBertAttention.forwardr"   r   r0   r0   r.   r1   r      s$    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )MobileBertIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r"   )r#   r$   r   rN   r   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnr   r.   r0   r1   r$     s
   
zMobileBertIntermediate.__init__r   r3   c                 C   s   |  |}| |}|S r"   )r   r   r+   r   r0   r0   r1   r4        

zMobileBertIntermediate.forwardr5   r0   r0   r.   r1   r     s    r   c                       r   )OutputBottleneckc                    sF   t    t|j|j| _t|j |j|j	d| _
t|j| _d S r   )r#   r$   r   rN   r   rE   r   rP   rQ   r   rR   rS   rT   rU   r   r.   r0   r1   r$     s   
zOutputBottleneck.__init__r   r   r3   c                 C   s&   |  |}| |}| || }|S r"   )r   rU   rR   r   r0   r0   r1   r4   $  s   

zOutputBottleneck.forwardr5   r0   r0   r.   r1   r     s    $r   c                       s>   e Zd Z fddZdejdejdejdejfddZ  ZS )	MobileBertOutputc                    s\   t    |j| _t|j|j| _t|j	 |j| _
| js't|j| _d S t|| _d S r"   )r#   r$   r   r   rN   r   r   r   rP   rQ   rR   rS   rT   rU   r   
bottleneckr   r.   r0   r1   r$   ,  s   
zMobileBertOutput.__init__intermediate_statesresidual_tensor_1residual_tensor_2r3   c                 C   sJ   |  |}| js| |}| || }|S | || }| ||}|S r"   )r   r   rU   rR   r   )r+   r   r   r   layer_outputr0   r0   r1   r4   6  s   

zMobileBertOutput.forwardr5   r0   r0   r.   r1   r   +  s    
r   c                       r   )BottleneckLayerc                    8   t    t|j|j| _t|j |j|j	d| _
d S r   )r#   r$   r   rN   rE   intra_bottleneck_sizer   rP   rQ   r   rR   r   r.   r0   r1   r$   D     
zBottleneckLayer.__init__r   r3   c                 C   s   |  |}| |}|S r"   r   rR   )r+   r   r   r0   r0   r1   r4   I  r   zBottleneckLayer.forwardr5   r0   r0   r.   r1   r   C      r   c                       s6   e Zd Z fddZdejdeej fddZ  ZS )
Bottleneckc                    s<   t    |j| _|j| _t|| _| jrt|| _d S d S r"   )r#   r$   key_query_shared_bottleneckr   r   input	attentionr   r.   r0   r1   r$   P  s   

zBottleneck.__init__r   r3   c                 C   sB   |  |}| jr|fd S | jr| |}||||fS ||||fS )N   )r   r   r   r   )r+   r   bottlenecked_hidden_statesshared_attention_inputr0   r0   r1   r4   X  s   


zBottleneck.forward	r6   r7   r8   r$   r&   r9   r   r4   r:   r0   r0   r.   r1   r   O  s    "r   c                       r   )	FFNOutputc                    r   r   )r#   r$   r   rN   r   r   r   rP   rQ   r   rR   r   r.   r0   r1   r$   t  r   zFFNOutput.__init__r   r   r3   c                 C   s   |  |}| || }|S r"   r   r   r0   r0   r1   r4   y  s   
zFFNOutput.forwardr5   r0   r0   r.   r1   r   s  s    $r   c                       r   )FFNLayerc                    r   r"   )r#   r$   r   intermediater   r   r   r.   r0   r1   r$     r   zFFNLayer.__init__r   r3   c                 C   s   |  |}| ||}|S r"   )r   r   )r+   r   intermediate_outputr   r0   r0   r1   r4     s   
zFFNLayer.forwardr5   r0   r0   r.   r1   r     r   r   c                
       sL   e Zd Z fddZ	d
dejdejdB dee de	ej fdd	Z
  ZS )MobileBertLayerc                    s~   t     j| _ j| _t | _t | _t | _	| jr$t
 | _ jdkr=t fddt jd D | _d S d S )Nr   c                       g | ]}t  qS r0   )r   .0_rY   r0   r1   
<listcomp>      z,MobileBertLayer.__init__.<locals>.<listcomp>)r#   r$   r   num_feedforward_networksr   r   r   r   r   r   r   r   r   
ModuleListrangeffnr   r.   r   r1   r$     s   





(zMobileBertLayer.__init__Nr   rt   rv   r3   c                 K   s   | j r| |\}}}}n	|gd \}}}}| j|||||fi |\}}	|}
| jdkr6| jD ]}||
}
q/| |
}| ||
|}|S )Nr   r   )r   r   r   r   r   r   r   )r+   r   rt   rv   r   r   r   r   self_attention_outputr   r   
ffn_moduler   r   r0   r0   r1   r4     s&   




zMobileBertLayer.forwardr"   r   r0   r0   r.   r1   r     s    r   c                
       sJ   e Zd Z fddZ	d
dejdejdB dee de	e
B fdd	Z  ZS )MobileBertEncoderc                    s.   t    t fddt jD | _d S )Nc                    r   r0   )r   r   r   r0   r1   r     r   z.MobileBertEncoder.__init__.<locals>.<listcomp>)r#   r$   r   r   r   num_hidden_layerslayerr   r.   r   r1   r$     s   
$zMobileBertEncoder.__init__Nr   rt   rv   r3   c                 K   s0   t | jD ]\}}|||fi |}qt|dS )N)last_hidden_state)	enumerater   r   )r+   r   rt   rv   ilayer_moduler0   r0   r1   r4     s   
zMobileBertEncoder.forwardr"   )r6   r7   r8   r$   r&   r9   rp   r   r   r   r   r4   r:   r0   r0   r.   r1   r     s    r   c                       r   )MobileBertPoolerc                    s2   t    |j| _| jrt|j|j| _d S d S r"   )r#   r$   classifier_activationdo_activater   rN   rE   r   r   r.   r0   r1   r$     s
   
zMobileBertPooler.__init__r   r3   c                 C   s2   |d d df }| j s|S | |}t|}|S )Nr   )r   r   r&   tanh)r+   r   first_token_tensorpooled_outputr0   r0   r1   r4     s   

zMobileBertPooler.forwardr5   r0   r0   r.   r1   r     s    r   c                       r   )!MobileBertPredictionHeadTransformc                    sX   t    t|j|j| _t|jtrt	|j | _
n|j| _
td |j|jd| _d S )Nr;   r   )r#   r$   r   rN   rE   r   r   r   r   r
   transform_act_fnrP   r   rR   r   r.   r0   r1   r$     s   
z*MobileBertPredictionHeadTransform.__init__r   r3   c                 C   s"   |  |}| |}| |}|S r"   )r   r   rR   r   r0   r0   r1   r4     s   


z)MobileBertPredictionHeadTransform.forwardr5   r0   r0   r.   r1   r         	r   c                       r   )MobileBertLMPredictionHeadc                    s^   t    t|| _tj|j|j|j dd| _	tj|j|jdd| _
tt|j| _d S )NF)r(   T)r#   r$   r   	transformr   rN   rG   rE   rD   r   decoderr%   r&   r'   r(   r   r.   r0   r1   r$     s
   

z#MobileBertLMPredictionHead.__init__r   r3   c                 C   s>   |  |}|tj| jj | jjgdd}|| jj7 }|S )Nr   rd   )	r   rz   r&   rh   r   r*   tr   r(   r   r0   r0   r1   r4     s   
$z"MobileBertLMPredictionHead.forwardr5   r0   r0   r.   r1   r     r   r   c                       r   )MobileBertOnlyMLMHeadc                    s   t    t|| _d S r"   )r#   r$   r   predictionsr   r.   r0   r1   r$     s   
zMobileBertOnlyMLMHead.__init__sequence_outputr3   c                 C      |  |}|S r"   )r   )r+   r   prediction_scoresr0   r0   r1   r4        
zMobileBertOnlyMLMHead.forwardr5   r0   r0   r.   r1   r          r   c                       s<   e Zd Z fddZdejdejdeej fddZ  ZS )MobileBertPreTrainingHeadsc                    s(   t    t|| _t|jd| _d S Nrc   )r#   r$   r   r   r   rN   rE   seq_relationshipr   r.   r0   r1   r$     s   

z#MobileBertPreTrainingHeads.__init__r   r   r3   c                 C   s   |  |}| |}||fS r"   )r   r   )r+   r   r   r   seq_relationship_scorer0   r0   r1   r4     s   

z"MobileBertPreTrainingHeads.forwardr   r0   r0   r.   r1   r   
  s    (r   c                       sP   e Zd ZU eed< dZdZdZdZdZ	dZ
eedZe  fddZ  ZS )MobileBertPreTrainedModelrY   
mobilebertT)r   
attentionsc                    s   t  | t|trt|j t|j dS t|t	r&t|j dS t|t
r>t|jt|jjd d dS dS )zInitialize the weightsrA   r@   N)r#   _init_weightsr   r!   initzeros_r(   ones_r*   r   r=   copy_r?   r&   rW   r   rX   )r+   rq   r.   r0   r1   r   $  s   


&z'MobileBertPreTrainedModel._init_weights)r6   r7   r8   r    __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr&   no_gradr   r:   r0   r0   r.   r1   r     s   
 r   z6
    Output type of [`MobileBertForPreTraining`].
    )custom_introc                   @   st   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
eej dB ed< dZeej dB ed< dS )MobileBertForPreTrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    Nlossprediction_logitsseq_relationship_logitsr   r   )r6   r7   r8   rn   r  r&   rp   r  r  r  r   r   r   r0   r0   r0   r1   r  1  s   
 r  c                       s   e Zd ZdZd fdd	Zdd Zdd Zeee											dd
e
jd	B de
jd	B de
jd	B de
jd	B de
jd	B dee deeB fddZ  ZS )MobileBertModelz2
    https://huggingface.co/papers/2004.02984
    Tc                    sJ   t  | || _d| _t|| _t|| _|rt|nd| _	| 
  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        FN)r#   r$   rY   gradient_checkpointingr=   rm   r   encoderr   pooler	post_init)r+   rY   add_pooling_layerr.   r0   r1   r$   P  s   

zMobileBertModel.__init__c                 C   s   | j jS r"   rm   rI   r+   r0   r0   r1   get_input_embeddingsa  s   z$MobileBertModel.get_input_embeddingsc                 C   s   || j _d S r"   r  )r+   rb   r0   r0   r1   set_input_embeddingsd  s   z$MobileBertModel.set_input_embeddingsNr\   rt   r]   r?   r^   rv   r3   c                 K   s|   |d u |d uA rt d| j||||d}t| j||d}| j|fd|i|}|d }	| jd ur6| |	nd }
t|	|
dS )Nz:You must specify exactly one of input_ids or inputs_embeds)r\   r?   r]   r^   )rY   r^   rt   rt   r   )r   pooler_output)
ValueErrorrm   r   rY   r  r  r   )r+   r\   rt   r]   r?   r^   rv   embedding_outputencoder_outputsr   r   r0   r0   r1   r4   g  s4   zMobileBertModel.forward)T)NNNNN)r6   r7   r8   rn   r$   r  r  r   r   r   r&   ro   rp   r   r   r   r   r4   r:   r0   r0   r.   r1   r  J  s8    r  z
    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `next sentence prediction (classification)` head.
    c                       s   e Zd ZdddZ fddZdd Zdd	 Zdded
B dej	f fddZ
ee	
	
	
	
	
	
	
ddejd
B dejd
B dejd
B dejd
B dejd
B dejd
B dejd
B dee deeB fddZ  ZS )MobileBertForPreTrainingcls.predictions.bias,mobilebert.embeddings.word_embeddings.weightzcls.predictions.decoder.biaszcls.predictions.decoder.weightc                    ,   t  | t|| _t|| _|   d S r"   )r#   r$   r  r   r   clsr  r   r.   r0   r1   r$     s   

z!MobileBertForPreTraining.__init__c                 C   
   | j jjS r"   r#  r   r   r  r0   r0   r1   get_output_embeddings     
z.MobileBertForPreTraining.get_output_embeddingsc                 C      || j j_|j| j j_d S r"   r#  r   r   r(   r+   new_embeddingsr0   r0   r1   set_output_embeddings     
z.MobileBertForPreTraining.set_output_embeddingsNnew_num_tokensr3   c                    *   | j | jjj|dd| jj_t j|dS NT)r.  
transposed)r.  _get_resized_lm_headr#  r   r   r#   resize_token_embeddingsr+   r.  r.   r0   r1   r4    s   z0MobileBertForPreTraining.resize_token_embeddingsr\   rt   r]   r?   r^   labelsnext_sentence_labelrv   c                 K   s   | j |f||||dd|}	|	dd \}
}| |
|\}}d}|durJ|durJt }||d| jj|d}||dd|d}|| }t||||	j|	jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```Trt   r]   r?   r^   return_dictNrc   rA   )r  r  r  r   r   )	r   r#  r   r   rY   rG   r  r   r   )r+   r\   rt   r]   r?   r^   r6  r7  rv   outputsr   r   r   r   
total_lossloss_fctmasked_lm_lossnext_sentence_lossr0   r0   r1   r4     s4   )	z MobileBertForPreTraining.forwardr"   NNNNNNN)r6   r7   r8   _tied_weights_keysr$   r&  r,  r   r   rF   r4  r   r   r&   ro   rp   r   r   r   r  r4   r:   r0   r0   r.   r1   r    sH    	
r  c                       s   e Zd ZdddZ fddZdd Zdd	 Zdded
B dej	f fddZ
ee	
	
	
	
	
	
ddejd
B dejd
B dejd
B dejd
B dejd
B dejd
B dee deeB fddZ  ZS )MobileBertForMaskedLMr  r   r!  c                    s6   t  | t|dd| _t|| _|| _|   d S NF)r  )r#   r$   r  r   r   r#  rY   r  r   r.   r0   r1   r$      s
   
zMobileBertForMaskedLM.__init__c                 C   r$  r"   r%  r  r0   r0   r1   r&  	  r'  z+MobileBertForMaskedLM.get_output_embeddingsc                 C   r(  r"   r)  r*  r0   r0   r1   r,    r-  z+MobileBertForMaskedLM.set_output_embeddingsNr.  r3   c                    r/  r0  r2  r5  r.   r0   r1   r4    s   z-MobileBertForMaskedLM.resize_token_embeddingsr\   rt   r]   r?   r^   r6  rv   c                 K   sr   | j |f||||dd|}|d }	| |	}
d}|dur/t }||
d| jj|d}t||
|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Tr8  r   NrA   r  logitsr   r   )	r   r#  r   r   rY   rG   r   r   r   )r+   r\   rt   r]   r?   r^   r6  rv   r:  r   r   r=  r<  r0   r0   r1   r4     s.   

zMobileBertForMaskedLM.forwardr"   NNNNNN)r6   r7   r8   r@  r$   r&  r,  r   r   rF   r4  r   r   r&   ro   rp   r   r   r   r   r4   r:   r0   r0   r.   r1   rA    sB    		rA  c                       r   )MobileBertOnlyNSPHeadc                    s   t    t|jd| _d S r   )r#   r$   r   rN   rE   r   r   r.   r0   r1   r$   D  s   
zMobileBertOnlyNSPHead.__init__r   r3   c                 C   r   r"   )r   )r+   r   r   r0   r0   r1   r4   H  r   zMobileBertOnlyNSPHead.forwardr5   r0   r0   r.   r1   rF  C  r   rF  zZ
    MobileBert Model with a `next sentence prediction (classification)` head on top.
    c                       s   e Zd Z fddZee						ddejdB dejdB dejdB dejdB dejdB d	ejdB d
e	e
 deeB fddZ  ZS )#MobileBertForNextSentencePredictionc                    r"  r"   )r#   r$   r  r   rF  r#  r  r   r.   r0   r1   r$   S  s   

z,MobileBertForNextSentencePrediction.__init__Nr\   rt   r]   r?   r^   r6  rv   r3   c                 K   sn   | j |f||||dd|}|d }	| |	}
d}|dur-t }||
dd|d}t||
|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`.

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```Tr8  r   NrA   rc   rC  )r   r#  r   r   r   r   r   )r+   r\   rt   r]   r?   r^   r6  rv   r:  r   r   r>  r<  r0   r0   r1   r4   \  s.   &

z+MobileBertForNextSentencePrediction.forwardrE  )r6   r7   r8   r$   r   r   r&   ro   rp   r   r   r   r   r4   r:   r0   r0   r.   r1   rG  M  s6    		rG  z
    MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                          e Zd Z fddZee						ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ee	 de
ej eB fddZ  ZS )#MobileBertForSequenceClassificationc                    sd   t  | |j| _|| _t|| _|jd ur|jn|j}t	|| _
t|j|j| _|   d S r"   )r#   r$   
num_labelsrY   r  r   classifier_dropoutrT   r   rS   rU   rN   rE   
classifierr  r+   rY   rK  r.   r0   r1   r$     s   
z,MobileBertForSequenceClassification.__init__Nr\   rt   r]   r?   r^   r6  rv   r3   c                 K   s4  | j |f||||dd|}|d }	| |	}	| |	}
d}|dur| jjdu rM| jdkr3d| j_n| jdkrI|jtjksD|jtj	krId| j_nd| j_| jjdkrkt
 }| jdkre||
 | }n+||
|}n%| jjdkrt }||
d| j|d}n| jjdkrt }||
|}t||
|j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Tr8  r   N
regressionsingle_label_classificationmulti_label_classificationrA   rC  )r   rU   rL  rY   problem_typerJ  r_   r&   rg   r   r   squeezer   r   r   r   r   r   )r+   r\   rt   r]   r?   r^   r6  rv   r:  r   rD  r  r<  r0   r0   r1   r4     sN   




"


z+MobileBertForSequenceClassification.forwardrE  )r6   r7   r8   r$   r   r   r&   r9   r   r   r   r   r4   r:   r0   r0   r.   r1   rI    s6    	rI  c                       s   e Zd Z fddZee							ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dee	 de
ej eB fddZ  ZS )MobileBertForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S rB  )
r#   r$   rJ  r  r   r   rN   rE   
qa_outputsr  r   r.   r0   r1   r$     s
   z'MobileBertForQuestionAnswering.__init__Nr\   rt   r]   r?   r^   start_positionsend_positionsrv   r3   c                 K   s  | j |f||||dd|}	|	d }
| |
}|jddd\}}|d }|d }d }|d ury|d uryt| dkrF|d}t| dkrS|d}|d}|d|}|d|}t|d}|||}|||}|| d }t	||||	j
|	jd	S )
NTr8  r   r   rA   rd   )ignore_indexrc   )r  start_logits
end_logitsr   r   )r   rT  splitrR  r}   lenrf   clampr   r   r   r   )r+   r\   rt   r]   r?   r^   rU  rV  rv   r:  r   rD  rX  rY  r;  ignored_indexr<  
start_lossend_lossr0   r0   r1   r4      sH   







z&MobileBertForQuestionAnswering.forwardr?  )r6   r7   r8   r$   r   r   r&   r9   r   r   r   r   r4   r:   r0   r0   r.   r1   rS    s<    
	
rS  c                       rH  )MobileBertForMultipleChoicec                    sT   t  | t|| _|jd ur|jn|j}t|| _t	|j
d| _|   d S )Nr   )r#   r$   r  r   rK  rT   r   rS   rU   rN   rE   rL  r  rM  r.   r0   r1   r$   ;  s   
z$MobileBertForMultipleChoice.__init__Nr\   rt   r]   r?   r^   r6  rv   r3   c                 K   s0  |dur	|j d n|j d }|dur|d|dnd}|dur*|d|dnd}|dur9|d|dnd}|durH|d|dnd}|dur[|d|d|dnd}| j|f||||dd|}	|	d }
| |
}
| |
}|d|}d}|durt }|||}t|||	j|	j	dS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rA   Tr8  rC  )
r   r   rf   r   rU   rL  r   r   r   r   )r+   r\   rt   r]   r?   r^   r6  rv   num_choicesr:  r   rD  reshaped_logitsr  r<  r0   r0   r1   r4   H  sD   *



z#MobileBertForMultipleChoice.forwardrE  )r6   r7   r8   r$   r   r   r&   r9   r   r   r   r   r4   r:   r0   r0   r.   r1   r`  8  s6    	r`  c                       rH  ) MobileBertForTokenClassificationc                    sb   t  | |j| _t|dd| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S rB  )r#   r$   rJ  r  r   rK  rT   r   rS   rU   rN   rE   rL  r  rM  r.   r0   r1   r$     s   z)MobileBertForTokenClassification.__init__Nr\   rt   r]   r?   r^   r6  rv   r3   c                 K   sz   | j |f||||dd|}|d }	| |	}	| |	}
d}|dur3t }||
d| j|d}t||
|j|jdS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Tr8  r   NrA   rC  )	r   rU   rL  r   r   rJ  r   r   r   )r+   r\   rt   r]   r?   r^   r6  rv   r:  r   rD  r  r<  r0   r0   r1   r4     s0   


z(MobileBertForTokenClassification.forwardrE  )r6   r7   r8   r$   r   r   r&   r9   r   r   r   r   r4   r:   r0   r0   r.   r1   rd    s6    	rd  )
rA  r`  rG  r  rS  rI  rd  r   r  r   )Nra   )Ucollections.abcr   dataclassesr   r&   r   torch.nnr   r   r    r	   r   activationsr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_mobilebertr    
get_loggerr6   loggerModuler!   rR   rP   r=   r9   floatr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  rA  rF  rG  rI  rS  r`  rd  __all__r0   r0   r0   r1   <module>   s   (


S
4$,
FbI
JPCa;