o
    iQ                     @   sT  d Z ddlZddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% e#&e'Z(dd Z)G dd de	j*Z+G dd de	j*Z,G dd de,Z-e,e-dZ.G dd de	j*Z/G dd de	j*Z0G dd de	j*Z1e"G dd  d eZ2ee"d!d"G d#d$ d$e!Z3e"G d%d& d&e2Z4e"d'd"G d(d) d)e2Z5G d*d+ d+e	j*Z6G d,d- d-e	j*Z7e"G d.d/ d/e2Z8e"d0d"G d1d2 d2e2Z9e"G d3d4 d4e2Z:e"G d5d6 d6e2Z;e"G d7d8 d8e2Z<g d9Z=dS ):zPyTorch ALBERT model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)#_prepare_4d_attention_mask_for_sdpa)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )AlbertConfigc                 C   s*  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6t||	D ]\}
}t|
 q\t||	D ]\}
}|
}|
dd}
|
d	d
}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
d d!}
|
d"d#}
|
d$d%}
t|
dd&krd'|
v sd(|
v rd)|
 }
d*|
v r|
d+d,}
|
d-d.}
|
d}
d/|
v s!d0|
v s!d1|
v s!d2|
v s!d3|
v r-t	d4d|
  qj| }|
D ]}|d5|rA|d6|}n|g}|d d7ksR|d d8krXt|d.}nN|d d'ksf|d d9krlt|d:}n:|d d(kryt|d.}n-|d d;krt|d<}n z	t||d }W n ty   t	d4d|
  Y q1w t|d=krt|d& }|| }q1|d>d d?krt|d.}n
|d7kr||}z|j|jkrtd@|j dA|j dBW n ty } z| j|j|jf7  _ d}~ww tdC|
 dD|  t||_qj| S )Ez'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape zmodule/ ffn_1ffnzbert/zalbert/attention_1	attentionz
transform/LayerNorm_1full_layer_layer_norm	LayerNormzattention/LayerNormztransformer/zintermediate/dense/zffn/intermediate/output/dense/zffn_output/z/output//z/self/zpooler/densepoolerzcls/predictionspredictionszpredictions/attentionzembeddings/attention
embeddingsinner_group_zalbert_layers/group_zalbert_layer_groups/r   output_biasoutput_weightszclassifier/seq_relationshipzseq_relationship/output_zsop_classifier/classifier/weightsweightadam_madam_vAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepz	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammabetabiassquad
classifier   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight z from )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipprintreplacelensplitjoin	fullmatchgetattrAttributeErrorint	transposeshape
ValueErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathr<   nptftf_path	init_varsnamesarraysnamerU   arrayoriginal_namepointerm_namescope_namesnume rl   ^/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/albert/modeling_albert.pyload_tf_weights_in_albert2   s   

"








rn   c                       sn   e Zd ZdZdef fddZ					ddeej deej d	eej d
eej	 de
dejfddZ  ZS )AlbertEmbeddingszQ
    Construct the embeddings from word, position and token_type embeddings.
    r\   c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd t|dd| _| jd	tj| j tjd
dd d S )N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolutetoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsr#   layer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrX   arangeexpandrQ   rv   zerosrs   sizelongselfr\   	__class__rl   rm   r|      s   

zAlbertEmbeddings.__init__Nr   	input_idsrx   rs   inputs_embedspast_key_values_lengthreturnc                 C   s   |d ur	|  }n|  d d }|d }|d u r&| jd d ||| f }|d u rPt| drE| jd d d |f }||d |}	|	}ntj|tj| jjd}|d u rY| 	|}| 
|}
||
 }| jdkrp| |}||7 }| |}| |}|S )Nrt   r   rx   r   rz   devicerw   )r   rs   hasattrrx   r   rX   r   r   r   r   r   rv   r   r#   r   )r   r   rx   rs   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   r'   r   rl   rl   rm   forward   s,   







zAlbertEmbeddings.forward)NNNNr   )__name__
__module____qualname____doc__r   r|   r   rX   
LongTensorFloatTensorrS   Tensorr   __classcell__rl   rl   r   rm   ro      s*    ro   c                       s   e Zd Zdef fddZdee ddfddZ				dd
ej	de
ej de
ej dedeeej	 eej	ej	f f f
ddZ  ZS )AlbertAttentionr\   c                    s4  t    |j|j dkrt|dstd|j d|j |j| _|j| _|j|j | _| j| j | _t	|j| j| _
t	|j| j| _t	|j| j| _t|j| _t|j| _t	|j|j| _tj|j|jd| _t | _t|dd| _| jdks| jd	kr|j| _td
|j d | j| _d S d S )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads (rq   rv   rw   relative_keyrelative_key_queryr:   r   )r{   r|   hidden_sizenum_attention_headsr   rV   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattention_dropoutr   output_dropoutdenser#   r   setpruned_headsrQ   rv   r   r}   distance_embeddingr   r   rl   rm   r|      s0   

zAlbertAttention.__init__headsr   Nc                 C   s   t |dkrd S t|| j| j| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt | | _| j| j | _
| j|| _d S )Nr   r   dim)rM   r   r   r   r   r   r   r   r   r   r   union)r   r   indexrl   rl   rm   prune_heads  s   zAlbertAttention.prune_headsFhidden_statesattention_mask	head_maskoutput_attentionsc                 C   s  |j \}}}| |}| |}	| |}
||d| j| jdd}|	|d| j| jdd}	|
|d| j| jdd}
t	||	dd}|t
| j }|d urY|| }| jdksc| jdkr| d }tj|tj|jddd}tj|tj|jddd}|| }| || j d }|j|jd}| jdkrtd	||}|| }n| jdkrtd	||}td
|	|}|| | }tjj|dd}| |}|d ur|| }t	||
}|ddd}| |}| |}| || }|r||fS |fS )Nrt   r   r:   r   r   r   ry   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   )rU   r   r   r   viewr   r   rT   rX   matmulmathsqrtrv   r   r   r   r   r   r   torz   einsumr   
functionalsoftmaxr   flattenr   r   r#   )r   r   r   r   r   
batch_sizer   _query_layer	key_layervalue_layerattention_scoresposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layerprojected_context_layerprojected_context_layer_dropoutlayernormed_context_layerrl   rl   rm   r   #  sN   








zAlbertAttention.forwardNNF)r   r   r   r   r|   listrS   r   rX   r   r   r   boolr   tupler   r   rl   rl   r   rm   r      s"    r   c                       sn   e Zd Z fddZ			ddejdeej deej ded	e	e
ej e
ejejf f f
 fd
dZ  ZS )AlbertSdpaAttentionc                    s   t  | |j| _d S N)r{   r|   r   dropout_probr   r   rl   rm   r|   c  s   zAlbertSdpaAttention.__init__NFr   r   r   r   r   c                    s  | j dks|rtd t j|||dS | \}}}| ||d| j| j	
dd}| ||d| j| j	
dd}	| ||d| j| j	
dd}
tjjj||	|
|| jr]| jnddd	}|
dd}|||| j}| |}| |}| || }|fS )
Nrw   a  AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` . Falling back to the eager attention implementation, but specifying the eager implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r   rt   r   r:           F)r   r   r   	attn_mask	dropout_p	is_causal)rv   r@   warningr{   r   r   r   r   r   r   rT   r   r   rX   r   r   scaled_dot_product_attentiontrainingr   reshaper   r   r   r#   )r   r   r   r   r   r   seq_lenr   r   r   r   attention_outputr   r   r   r   rl   rm   r   g  s@   	

zAlbertSdpaAttention.forwardr   )r   r   r   r|   rX   r   r   r   r   r   r   r   r   rl   rl   r   rm   r   b  s     r   )eagersdpac                       s~   e Zd Zdef fddZ				ddejdeej deej d	e	d
e	de
ejejf fddZdejdejfddZ  ZS )AlbertLayerr\   c                    s   t    || _|j| _d| _tj|j|jd| _	t
|j || _t|j|j| _t|j|j| _t|j | _t|j| _d S )Nr   rq   )r{   r|   r\   chunk_size_feed_forwardseq_len_dimr   r#   r   r   r"   ALBERT_ATTENTION_CLASSES_attn_implementationr    r   intermediate_sizer   
ffn_outputr
   
hidden_act
activationr   r   r   r   r   rl   rm   r|     s   
zAlbertLayer.__init__NFr   r   r   r   output_hidden_statesr   c                 C   sL   |  ||||}t| j| j| j|d }| ||d  }|f|dd   S )Nr   r   )r    r   ff_chunkr   r   r"   )r   r   r   r   r   r   r   r   rl   rl   rm   r     s   zAlbertLayer.forwardr   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r   r   r   rl   rl   rm   r     s   


zAlbertLayer.ff_chunkNNFF)r   r   r   r   r|   rX   r   r   r   r   r   r   r   r   rl   rl   r   rm   r     s(    
r   c                       st   e Zd Zdef fddZ				ddejdeej deej d	e	d
e	de
eeje
ej f df fddZ  ZS )AlbertLayerGroupr\   c                    s.   t    t fddt jD | _d S )Nc                       g | ]}t  qS rl   )r   .0r   r\   rl   rm   
<listcomp>      z-AlbertLayerGroup.__init__.<locals>.<listcomp>)r{   r|   r   
ModuleListrangeinner_group_numalbert_layersr   r   r  rm   r|     s   
$zAlbertLayerGroup.__init__NFr   r   r   r   r   r   .c                 C   s|   d}d}t | jD ]!\}}	|	|||| |}
|
d }|r#||
d f }|r*||f }q	|f}|r5||f }|r<||f }|S )Nrl   r   r   )	enumerater	  )r   r   r   r   r   r   layer_hidden_stateslayer_attentionslayer_indexalbert_layerlayer_outputoutputsrl   rl   rm   r     s    


zAlbertLayerGroup.forwardr   )r   r   r   r   r|   rX   r   r   r   r   r   r   r   r   rl   rl   r   rm   r     s&    r   c                       sj   e Zd Zdef fddZ					ddejdeej d	eej d
e	de	de	de
eef fddZ  ZS )AlbertTransformerr\   c                    sF   t     | _t j j| _t fddt	 j
D | _d S )Nc                    r   rl   )r   r  r  rl   rm   r    r  z.AlbertTransformer.__init__.<locals>.<listcomp>)r{   r|   r\   r   r   r   r   embedding_hidden_mapping_inr  r  num_hidden_groupsalbert_layer_groupsr   r   r  rm   r|     s   
$zAlbertTransformer.__init__NFTr   r   r   r   r   return_dictr   c                 C   s   |  |}|r
|fnd }|rdnd }|d u rd g| jj n|}t| jjD ]@}	t| jj| jj }
t|	| jj| jj  }| j| |||||
 |d |
  ||}|d }|r^||d  }|re||f }q%|sttdd |||fD S t|||dS )Nrl   r   r   rt   c                 s   s    | ]	}|d ur|V  qd S r   rl   )r  vrl   rl   rm   	<genexpr>  s    z,AlbertTransformer.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)	r  r\   num_hidden_layersr  rS   r  r  r   r   )r   r   r   r   r   r   r  all_hidden_statesall_attentionsilayers_per_group	group_idxlayer_group_outputrl   rl   rm   r     s2   
	
zAlbertTransformer.forward)NNFFT)r   r   r   r   r|   rX   r   r   r   r   r   r   r   r   r   rl   rl   r   rm   r    s,    

r  c                   @   s*   e Zd ZU eed< eZdZdZdd Z	dS )AlbertPreTrainedModelr\   albertTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS t |tre|jj	  dS dS )zInitialize the weights.r   )meanstdN      ?)
isinstancer   r   r.   rZ   normal_r\   initializer_ranger7   zero_r}   rp   r#   fill_AlbertMLMHead)r   modulerl   rl   rm   _init_weights)  s    


z#AlbertPreTrainedModel._init_weightsN)
r   r   r   r   __annotations__rn   load_tf_weightsbase_model_prefix_supports_sdpar-  rl   rl   rl   rm   r!  "  s   
 r!  z2
    Output type of [`AlbertForPreTraining`].
    )custom_introc                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )AlbertForPreTrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    Nlossprediction_logits
sop_logitsr   r  )r   r   r   r   r4  r   rX   r   r.  r5  r6  r   r   r  rl   rl   rl   rm   r3  <  s   
 r3  c                       s   e Zd ZU eed< dZddedef fddZdej	fdd	Z
d
ej	ddfddZdeeee f ddfddZe									ddeej deej deej deej deej deej dee dee dee deeef fddZ  ZS )AlbertModelr\   r"  Tadd_pooling_layerc                    sp   t  | || _t|| _t|| _|r$t|j	|j	| _
t | _nd| _
d| _|j| _|j| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r{   r|   r\   ro   r'   r  encoderr   r   r   r%   Tanhpooler_activationr   attn_implementationrv   	post_init)r   r\   r8  r   rl   rm   r|   Z  s   

zAlbertModel.__init__r   c                 C      | j jS r   r'   r   r   rl   rl   rm   get_input_embeddingsq     z AlbertModel.get_input_embeddingsr   Nc                 C      || j _d S r   r?  )r   r   rl   rl   rm   set_input_embeddingst     z AlbertModel.set_input_embeddingsheads_to_prunec                 C   sT   |  D ]#\}}t|| jj }t||| jj  }| jj| j| j| qdS )a  
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
        a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
        model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.

        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
        while [2,3] correspond to the two inner groups of the second hidden layer.

        Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
        information about head pruning
        N)	itemsrS   r\   r  r9  r  r	  r    r   )r   rF  layerr   r  inner_group_idxrl   rl   rm   _prune_headsw  s
   zAlbertModel._prune_headsr   r   rx   rs   r   r   r   r   r  c
                 C   s  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|
\}}|d urU|jn|j}|d u rctj	|
|d}|d u rt
| jdr| jjd d d |f }|||}|}n	tj|
tj|d}| j||||d}| jdko| jd	ko|d u o| }|rt||j|d
}n|dd}|j| jd}d| t| jj }| || j j}| j||||||	d}|d }| jd ur| | |d d df nd }|	s||f|dd   S t|||j|jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timert   z5You have to specify either input_ids or inputs_embeds)r   rx   r   )rs   rx   r   r   rw   )tgt_lenr   r:   ry   r%  )r   r   r   r  r   )r  pooler_outputr   r  ) r\   r   r   use_return_dictrV   %warn_if_padding_and_no_attention_maskr   r   rX   onesr   r'   rx   r   r   r   r<  rv   r   rz   	unsqueezer   finfominget_head_maskr  r9  r%   r;  r   r   r  )r   r   r   rx   rs   r   r   r   r   r  r   r   r   r   r   r   embedding_outputuse_sdpa_attention_maskextended_attention_maskencoder_outputssequence_outputpooled_outputrl   rl   rm   r     st   

	*zAlbertModel.forward)T)	NNNNNNNNN)r   r   r   r   r.  r0  r   r|   r   r}   rA  rD  dictrS   r   rJ  r   r   rX   r   r   r   r   r   r   r   rl   rl   r   rm   r7  U  sL   
 	

r7  z
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    c                       s   e Zd ZddgZdef fddZdejfddZd	ejdd
fddZ	dej
fddZe	
	
	
	
	
	
	
	
	
	
	
ddeej deej deej deej deej deej deej deej dee dee dee deeef fddZ  ZS )AlbertForPreTrainingpredictions.decoder.biaspredictions.decoder.weightr\   c                    s6   t  | t|| _t|| _t|| _|   d S r   )	r{   r|   r7  r"  r+  r&   AlbertSOPHeadsop_classifierr=  r   r   rl   rm   r|     s
   


zAlbertForPreTraining.__init__r   c                 C   r>  r   r&   decoderr@  rl   rl   rm   get_output_embeddings  rB  z*AlbertForPreTraining.get_output_embeddingsnew_embeddingsNc                 C   rC  r   r`  r   rc  rl   rl   rm   set_output_embeddings  rE  z*AlbertForPreTraining.set_output_embeddingsc                 C   
   | j jjS r   r"  r'   r   r@  rl   rl   rm   rA       
z)AlbertForPreTraining.get_input_embeddingsr   r   rx   rs   r   r   labelssentence_order_labelr   r   r  c                 C   s   |dur|n| j j}| j|||||||	|
|d	}|dd \}}| |}| |}d}|durU|durUt }||d| j j|d}||dd|d}|| }|sl||f|dd  }|durj|f| S |S t||||j	|j
dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```Nr   rx   rs   r   r   r   r   r  r:   rt   )r4  r5  r6  r   r  )r\   rM  r"  r&   r_  r   r   r~   r3  r   r  )r   r   r   rx   rs   r   r   ri  rj  r   r   r  r  rX  rY  prediction_scores
sop_scores
total_lossloss_fctmasked_lm_losssentence_order_lossoutputrl   rl   rm   r     s>   )

zAlbertForPreTraining.forwardNNNNNNNNNNN)r   r   r   _tied_weights_keysr   r|   r   r   rb  re  r}   rA  r   r   rX   r   r   r   r   r3  r   r   r   rl   rl   r   rm   r[    sV    
	

r[  c                       sB   e Zd Zdef fddZdejdejfddZdd	d
Z  Z	S )r+  r\   c                    sp   t    tj|j|jd| _tt|j	| _
t|j|j| _t|j|j	| _t|j | _| j
| j_
d S )Nrq   )r{   r|   r   r#   r   r   	ParameterrX   r   r~   r7   r   r   r   ra  r
   r   r   r   r   rl   rm   r|   M  s   
zAlbertMLMHead.__init__r   r   c                 C   s0   |  |}| |}| |}| |}|}|S r   )r   r   r#   ra  )r   r   rl  rl   rl   rm   r   W  s   



zAlbertMLMHead.forwardNc                 C   s,   | j jjjdkr| j| j _d S | j j| _d S )Nmeta)ra  r7   r   typer@  rl   rl   rm   _tie_weightsa  s   zAlbertMLMHead._tie_weights)r   N)
r   r   r   r   r|   rX   r   r   rx  r   rl   rl   r   rm   r+  L  s    

r+  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )r^  r\   c                    s.   t    t|j| _t|j|j| _	d S r   )
r{   r|   r   r   classifier_dropout_probr   r   r   
num_labelsr9   r   r   rl   rm   r|   k  s   
zAlbertSOPHead.__init__rY  r   c                 C   s   |  |}| |}|S r   )r   r9   )r   rY  dropout_pooled_outputlogitsrl   rl   rm   r   q  s   

zAlbertSOPHead.forward)	r   r   r   r   r|   rX   r   r   r   rl   rl   r   rm   r^  j  s    r^  c                       s   e Zd ZddgZ fddZdejfddZdejdd	fd
dZdej	fddZ
e																				ddeej deej deej deej deej deej deej dee dee dee deeef fddZ  ZS )AlbertForMaskedLMr\  r]  c                    s0   t  | t|dd| _t|| _|   d S NF)r8  )r{   r|   r7  r"  r+  r&   r=  r   r   rl   rm   r|   {  s   
zAlbertForMaskedLM.__init__r   c                 C   r>  r   r`  r@  rl   rl   rm   rb    rB  z'AlbertForMaskedLM.get_output_embeddingsrc  Nc                 C   s   || j _|j| j _d S r   )r&   ra  r7   rd  rl   rl   rm   re    s   z'AlbertForMaskedLM.set_output_embeddingsc                 C   rf  r   rg  r@  rl   rl   rm   rA    rh  z&AlbertForMaskedLM.get_input_embeddingsr   r   rx   rs   r   r   ri  r   r   r  c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur8t }||d| j j|d}|
sN|f|dd  }|durL|f| S |S t|||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        N	r   r   rx   rs   r   r   r   r   r  r   rt   r:   r4  r|  r   r  )
r\   rM  r"  r&   r   r   r~   r   r   r  )r   r   r   rx   rs   r   r   ri  r   r   r  r  sequence_outputsrl  rp  ro  rr  rl   rl   rm   r     s6   1
zAlbertForMaskedLM.forward
NNNNNNNNNN)r   r   r   rt  r|   r   r   rb  re  r}   rA  r   r   rX   r   r   r   r   r   r   r   r   rl   rl   r   rm   r}  w  sP    		

r}  z
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                          e Zd Zdef fddZe										ddeej deej	 deej deej d	eej	 d
eej	 deej dee
 dee
 dee
 deeef fddZ  ZS )AlbertForSequenceClassificationr\   c                    sR   t  | |j| _|| _t|| _t|j| _	t
|j| jj| _|   d S r   )r{   r|   rz  r\   r7  r"  r   r   ry  r   r   r   r9   r=  r   r   rl   rm   r|     s   
z(AlbertForSequenceClassification.__init__Nr   r   rx   rs   r   r   ri  r   r   r  r   c                 C   sr  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur| j jdu rV| jdkr<d| j _n| jdkrR|jtj	ksM|jtj
krRd| j _nd| j _| j jdkrtt }| jdkrn|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   
regressionsingle_label_classificationmulti_label_classificationrt   r:   r  )r\   rM  r"  r   r9   problem_typerz  rz   rX   r   rS   r   squeezer   r   r   r   r   r  )r   r   r   rx   rs   r   r   ri  r   r   r  r  rY  r|  r4  ro  rr  rl   rl   rm   r     sV   



"


z'AlbertForSequenceClassification.forwardr  )r   r   r   r   r|   r   r   rX   r   r   r   r   r   r   r   r   rl   rl   r   rm   r    sH    	

r  c                       r  )AlbertForTokenClassificationr\   c                    sd   t  | |j| _t|dd| _|jd ur|jn|j}t|| _	t
|j| jj| _|   d S r~  )r{   r|   rz  r7  r"  ry  r   r   r   r   r   r   r\   r9   r=  )r   r\   ry  r   rl   rm   r|   @  s   
z%AlbertForTokenClassification.__init__Nr   r   rx   rs   r   r   ri  r   r   r  r   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur<t }||d| j|d}|
sR|f|dd  }|durP|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nrk  r   rt   r:   r  )r\   rM  r"  r   r9   r   r   rz  r   r   r  )r   r   r   rx   rs   r   r   ri  r   r   r  r  rX  r|  r4  ro  rr  rl   rl   rm   r   P  s8   

z$AlbertForTokenClassification.forwardr  )r   r   r   r   r|   r   r   rX   r   r   r   r   r   r   r   r   rl   rl   r   rm   r  >  sH    	

r  c                       s   e Zd Zdef fddZe											ddeej deej	 deej deej d	eej	 d
eej	 deej deej dee
 dee
 dee
 deeef fddZ  ZS )AlbertForQuestionAnsweringr\   c                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r~  )
r{   r|   rz  r7  r"  r   r   r   
qa_outputsr=  r   r   rl   rm   r|     s
   z#AlbertForQuestionAnswering.__init__Nr   r   rx   rs   r   r   start_positionsend_positionsr   r   r  r   c                 C   sH  |d ur|n| j j}| j|||||||	|
|d	}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrO|d}t| dkr\|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr  r   r   rt   r   )ignore_indexr:   )r4  start_logits
end_logitsr   r  )r\   rM  r"  r  rN   r  
contiguousrM   r   clampr   r   r   r  )r   r   r   rx   rs   r   r   r  r  r   r   r  r  rX  r|  r  r  rn  ignored_indexro  
start_lossend_lossrr  rl   rl   rm   r     sP   






z"AlbertForQuestionAnswering.forwardrs  r   r   r   r   r|   r   r   rX   r   r   r   r   r3  r   r   r   rl   rl   r   rm   r    sN    
	

r  c                       r  )AlbertForMultipleChoicer\   c                    s@   t  | t|| _t|j| _t|j	d| _
|   d S )Nr   )r{   r|   r7  r"  r   r   ry  r   r   r   r9   r=  r   r   rl   rm   r|     s
   
z AlbertForMultipleChoice.__init__Nr   r   rx   rs   r   r   ri  r   r   r  r   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        Nr   rt   r   rk  r:   r  )r\   rM  rU   r   r   r"  r   r9   r   r   r   r  )r   r   r   rx   rs   r   r   ri  r   r   r  num_choicesr  rY  r|  reshaped_logitsr4  ro  rr  rl   rl   rm   r     sL   ,


zAlbertForMultipleChoice.forwardr  r  rl   rl   r   rm   r    sH    
	

r  )	rn   r!  r7  r[  r}  r  r  r  r  )>r   r   rB   dataclassesr   typingr   r   rX   r   torch.nnr   r   r   activationsr
   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_albertr   
get_loggerr   r@   rn   Modulero   r   r   r   r   r   r  r!  r3  r7  r[  r+  r^  r}  r  r  r  r  __all__rl   rl   rl   rm   <module>   sn   $	
~Dn:)#5 fiWGMf