o
    wi                     @   sX  d Z ddlZddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& e$'e(Z)dd Z*G dd de	j+Z,G dd de	j+Z-G dd de-Z.e-e.dZ/G dd de	j+Z0G dd de	j+Z1G dd de	j+Z2e#G dd  d eZ3ee#d!d"G d#d$ d$e"Z4e#G d%d& d&e3Z5e#d'd"G d(d) d)e3Z6G d*d+ d+e	j+Z7G d,d- d-e	j+Z8e#G d.d/ d/e3Z9e#d0d"G d1d2 d2e3Z:e#G d3d4 d4e3Z;e#G d5d6 d6e3Z<e#G d7d8 d8e3Z=g d9Z>dS ):zPyTorch ALBERT model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)#_prepare_4d_attention_mask_for_sdpa)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indices"is_torch_greater_or_equal_than_2_2prune_linear_layer)ModelOutputauto_docstringlogging   )AlbertConfigc                 C   s*  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6t||	D ]\}
}t|
 q\t||	D ]\}
}|
}|
dd}
|
d	d
}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
dd}
|
d d!}
|
d"d#}
|
d$d%}
t|
dd&krd'|
v sd(|
v rd)|
 }
d*|
v r|
d+d,}
|
d-d.}
|
d}
d/|
v s!d0|
v s!d1|
v s!d2|
v s!d3|
v r-t	d4d|
  qj| }|
D ]}|d5|rA|d6|}n|g}|d d7ksR|d d8krXt|d.}nN|d d'ksf|d d9krlt|d:}n:|d d(kryt|d.}n-|d d;krt|d<}n z	t||d }W n ty   t	d4d|
  Y q1w t|d=krt|d& }|| }q1|d>d d?krt|d.}n
|d7kr||}z|j|jkrtd@|j dA|j dBW n ty } z| j|j|jf7  _ d}~ww tdC|
 dD|  t||_qj| S )Ez'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape zmodule/ ffn_1ffnzbert/zalbert/attention_1	attentionz
transform/LayerNorm_1full_layer_layer_norm	LayerNormzattention/LayerNormztransformer/zintermediate/dense/zffn/intermediate/output/dense/zffn_output/z/output//z/self/zpooler/densepoolerzcls/predictionspredictionszpredictions/attentionzembeddings/attention
embeddingsinner_group_zalbert_layers/group_zalbert_layer_groups/r   output_biasoutput_weightszclassifier/seq_relationshipzseq_relationship/output_zsop_classifier/classifier/weightsweightadam_madam_vAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepz	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammabetabiassquad
classifier   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight z from )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipprintreplacelensplitjoin	fullmatchgetattrAttributeErrorint	transposeshape
ValueErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathr=   nptftf_path	init_varsnamesarraysnamerV   arrayoriginal_namepointerm_namescope_namesnume rm   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/albert/modeling_albert.pyload_tf_weights_in_albert3   s   

"








ro   c                       sn   e Zd ZdZdef fddZ					ddeej deej d	eej d
eej	 de
dejfddZ  ZS )AlbertEmbeddingszQ
    Construct the embeddings from word, position and token_type embeddings.
    r]   c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd t|dd| _| jd	tj| j tjd
dd d S )N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolutetoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsr$   layer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrY   arangeexpandrR   rw   zerosrt   sizelongselfr]   	__class__rm   rn   r}      s   

zAlbertEmbeddings.__init__Nr   	input_idsry   rt   inputs_embedspast_key_values_lengthreturnc                 C   s   |d ur	|  }n|  d d }|d }|d u r&| jd d ||| f }|d u rPt| drE| jd d d |f }||d |}	|	}ntj|tj| jjd}|d u rY| 	|}| 
|}
||
 }| jdkrp| |}||7 }| |}| |}|S )Nru   r   ry   r   r{   devicerx   )r   rt   hasattrry   r   rY   r   r   r   r   r   rw   r   r$   r   )r   r   ry   rt   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   r(   r   rm   rm   rn   forward   s,   







zAlbertEmbeddings.forward)NNNNr   )__name__
__module____qualname____doc__r   r}   r   rY   
LongTensorFloatTensorrT   Tensorr   __classcell__rm   rm   r   rn   rp      s*    rp   c                       s   e Zd Zdef fddZdejdejfddZdee	 dd	fd
dZ
					ddejdeej deej dedeeej eejejf f f
ddZ  ZS )AlbertAttentionr]   c                    s4  t    |j|j dkrt|dstd|j d|j |j| _|j| _|j|j | _| j| j | _t	|j| j| _
t	|j| j| _t	|j| j| _t|j| _t|j| _t	|j|j| _tj|j|jd| _t | _t|dd| _| jdks| jd	kr|j| _td
|j d | j| _d S d S )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads (rr   rw   rx   relative_keyrelative_key_queryr;   r   )r|   r}   hidden_sizenum_attention_headsr   rW   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattention_dropoutr   output_dropoutdenser$   r   setpruned_headsrR   rw   r   r~   distance_embeddingr   r   rm   rn   r}      s0   

zAlbertAttention.__init__xr   c                 C   s6   |  d d | j| jf }||}|ddddS )Nru   r   r;   r   r	   )r   r   r   viewpermute)r   r   new_x_shaperm   rm   rn   transpose_for_scores  s   
z$AlbertAttention.transpose_for_scoresheadsNc                 C   s   t |dkrd S t|| j| j| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt | | _| j| j | _
| j|| _d S )Nr   r   dim)rN   r   r   r   r   r   r   r   r   r   r   union)r   r   indexrm   rm   rn   prune_heads  s   zAlbertAttention.prune_headsFhidden_statesattention_mask	head_maskoutput_attentionsc                 C   s  |  |}| |}| |}| |}| |}	| |}
t||	dd}|t| j	 }|d ur8|| }| j
dksB| j
dkr| d }tj|tj|jddd}tj|tj|jddd}|| }| || j d }|j|jd}| j
dkrtd||}|| }n| j
dkrtd||}td	|	|}|| | }tjj|dd
}| |}|d ur|| }t||
}|ddd}| |}| |}| || }|r||fS |fS )Nru   r   r   r   r   rz   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   r;   )r   r   r   r   rY   matmulrU   mathsqrtr   rw   r   r   r   r   r   r   r   tor{   einsumr   
functionalsoftmaxr   flattenr   r   r$   )r   r   r   r   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresr   position_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layerprojected_context_layerprojected_context_layer_dropoutlayernormed_context_layerrm   rm   rn   r   *  sD   











zAlbertAttention.forwardNNF)r   r   r   r   r}   rY   r   r   listrT   r   r   r   boolr   tupler   r   rm   rm   r   rn   r      s$    r   c                       sn   e Zd Z fddZ			ddejdeej deej ded	e	e
ej e
ejejf f f
 fd
dZ  ZS )AlbertSdpaAttentionc                    s    t  | |j| _t | _d S N)r|   r}   r   dropout_probr   require_contiguous_qkvr   r   rm   rn   r}   f  s   zAlbertSdpaAttention.__init__NFr   r   r   r   r   c                    s  | j dks|rtd t j|||dS | \}}}| | |}| | |}	| | 	|}
| j
rM|jjdkrM|d urM| }|	 }	|
 }
tjjj||	|
|| jr[| jnddd}|dd	}|||| j}| |}| |}| || }|fS )
Nrx   a  AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` . Falling back to the eager attention implementation, but specifying the eager implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r   cuda        F)r   r   r   	attn_mask	dropout_p	is_causalr   r;   )rw   rA   warningr|   r   r   r   r   r   r   r   r   type
contiguousrY   r   r   scaled_dot_product_attentiontrainingr   rU   reshaper   r   r   r$   )r   r   r   r   r   
batch_sizeseq_len_r   r   r   attention_outputr   r   r   r   rm   rn   r   k  s6   	

zAlbertSdpaAttention.forwardr   )r   r   r   r}   rY   r   r   r   r   r   r   r   r   rm   rm   r   rn   r   e  s     r   )eagersdpac                       s~   e Zd Zdef fddZ				ddejdeej deej d	e	d
e	de
ejejf fddZdejdejfddZ  ZS )AlbertLayerr]   c                    s   t    || _|j| _d| _tj|j|jd| _	t
|j || _t|j|j| _t|j|j| _t|j | _t|j| _d S )Nr   rr   )r|   r}   r]   chunk_size_feed_forwardseq_len_dimr   r$   r   r   r#   ALBERT_ATTENTION_CLASSES_attn_implementationr!   r   intermediate_sizer   
ffn_outputr
   
hidden_act
activationr   r   r   r   r   rm   rn   r}     s   
zAlbertLayer.__init__NFr   r   r   r   output_hidden_statesr   c                 C   sL   |  ||||}t| j| j| j|d }| ||d  }|f|dd   S )Nr   r   )r!   r   ff_chunkr   r  r#   )r   r   r   r   r   r  r   r  rm   rm   rn   r     s   zAlbertLayer.forwardr   c                 C   s"   |  |}| |}| |}|S r   )r   r  r  )r   r   r  rm   rm   rn   r	    s   


zAlbertLayer.ff_chunkNNFF)r   r   r   r   r}   rY   r   r   r   r   r   r   r	  r   rm   rm   r   rn   r     s(    
r   c                       st   e Zd Zdef fddZ				ddejdeej deej d	e	d
e	de
eeje
ej f df fddZ  ZS )AlbertLayerGroupr]   c                    s.   t    t fddt jD | _d S )Nc                       g | ]}t  qS rm   )r   .0r   r]   rm   rn   
<listcomp>      z-AlbertLayerGroup.__init__.<locals>.<listcomp>)r|   r}   r   
ModuleListrangeinner_group_numalbert_layersr   r   r  rn   r}     s   
$zAlbertLayerGroup.__init__NFr   r   r   r   r  r   .c                 C   s|   d}d}t | jD ]!\}}	|	|||| |}
|
d }|r#||
d f }|r*||f }q	|f}|r5||f }|r<||f }|S )Nrm   r   r   )	enumerater  )r   r   r   r   r   r  layer_hidden_stateslayer_attentionslayer_indexalbert_layerlayer_outputoutputsrm   rm   rn   r     s    


zAlbertLayerGroup.forwardr
  )r   r   r   r   r}   rY   r   r   r   r   r   r   r   r   rm   rm   r   rn   r    s&    r  c                       sj   e Zd Zdef fddZ					ddejdeej d	eej d
e	de	de	de
eef fddZ  ZS )AlbertTransformerr]   c                    sF   t     | _t j j| _t fddt	 j
D | _d S )Nc                    r  rm   )r  r  r  rm   rn   r    r  z.AlbertTransformer.__init__.<locals>.<listcomp>)r|   r}   r]   r   r   r   r   embedding_hidden_mapping_inr  r  num_hidden_groupsalbert_layer_groupsr   r   r  rn   r}     s   
$zAlbertTransformer.__init__NFTr   r   r   r   r  return_dictr   c                 C   s   |  |}|r
|fnd }|rdnd }|d u rd g| jj n|}t| jjD ]@}	t| jj| jj }
t|	| jj| jj  }| j| |||||
 |d |
  ||}|d }|r^||d  }|re||f }q%|sttdd |||fD S t|||dS )Nrm   r   r   ru   c                 s   s    | ]	}|d ur|V  qd S r   rm   )r  vrm   rm   rn   	<genexpr>  s    z,AlbertTransformer.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)	r  r]   num_hidden_layersr  rT   r  r   r   r   )r   r   r   r   r   r  r!  all_hidden_statesall_attentionsilayers_per_group	group_idxlayer_group_outputrm   rm   rn   r     s2   
	
zAlbertTransformer.forward)NNFFT)r   r   r   r   r}   rY   r   r   r   r   r   r   r   r   r   rm   rm   r   rn   r    s,    

r  c                   @   s$   e Zd ZeZeZdZdZdd Z	dS )AlbertPreTrainedModelalbertTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS t |tre|jj	  dS dS )zInitialize the weights.r   )meanstdN      ?)
isinstancer   r   r/   r[   normal_r]   initializer_ranger8   zero_r~   rq   r$   fill_AlbertMLMHead)r   modulerm   rm   rn   _init_weights)  s    


z#AlbertPreTrainedModel._init_weightsN)
r   r   r   r   config_classro   load_tf_weightsbase_model_prefix_supports_sdpar9  rm   rm   rm   rn   r-  "  s    r-  z2
    Output type of [`AlbertForPreTraining`].
    )custom_introc                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )AlbertForPreTrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    Nlossprediction_logits
sop_logitsr   r%  )r   r   r   r   r@  r   rY   r   __annotations__rA  rB  r   r   r%  rm   rm   rm   rn   r?  <  s   
 r?  c                       s   e Zd ZeZdZddedef fddZdej	fdd	Z
d
ej	ddfddZdeeee f ddfddZe									ddeej deej deej deej deej deej dee dee dee deeef fddZ  ZS )AlbertModelr.  Tr]   add_pooling_layerc                    sp   t  | || _t|| _t|| _|r$t|j	|j	| _
t | _nd| _
d| _|j| _|j| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r|   r}   r]   rp   r(   r  encoderr   r   r   r&   Tanhpooler_activationr  attn_implementationrw   	post_init)r   r]   rE  r   rm   rn   r}   Z  s   

zAlbertModel.__init__r   c                 C      | j jS r   r(   r   r   rm   rm   rn   get_input_embeddingsq     z AlbertModel.get_input_embeddingsr   Nc                 C      || j _d S r   rL  )r   r   rm   rm   rn   set_input_embeddingst     z AlbertModel.set_input_embeddingsheads_to_prunec                 C   sT   |  D ]#\}}t|| jj }t||| jj  }| jj| j| j| qdS )a  
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has
        a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT
        model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.

        These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer,
        while [2,3] correspond to the two inner groups of the second hidden layer.

        Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more
        information about head pruning
        N)	itemsrT   r]   r  rF  r   r  r!   r   )r   rS  layerr   r+  inner_group_idxrm   rm   rn   _prune_headsw  s
   zAlbertModel._prune_headsr   r   ry   rt   r   r   r   r  r!  c
                 C   s  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|
\}}|d urU|jn|j}|d u rctj	|
|d}|d u rt
| jdr| jjd d d |f }|||}|}n	tj|
tj|d}| j||||d}| jdko| jd	ko|d u o| }|rt||j|d
}n|dd}|j| jd}d| t| jj }| || j j}| j||||||	d}|d }| jd ur| | |d d df nd }|	s||f|dd   S t|||j|jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timeru   z5You have to specify either input_ids or inputs_embeds)r   ry   r   )rt   ry   r   r   rx   )tgt_lenr   r;   rz   r1  )r   r   r  r!  r   )r$  pooler_outputr   r%  ) r]   r   r  use_return_dictrW   %warn_if_padding_and_no_attention_maskr   r   rY   onesr   r(   ry   r   r   r   rI  rw   r   r{   	unsqueezer   finfominget_head_maskr&  rF  r&   rH  r   r   r%  )r   r   r   ry   rt   r   r   r   r  r!  r   r   r   r   r   r   embedding_outputuse_sdpa_attention_maskextended_attention_maskencoder_outputssequence_outputpooled_outputrm   rm   rn   r     st   

	*zAlbertModel.forward)T)	NNNNNNNNN)r   r   r   r   r:  r<  r   r}   r   r~   rN  rQ  dictrT   r   rW  r   r   rY   r   r   r   r   r   r   r   rm   rm   r   rn   rD  U  sL    	

rD  z
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    c                       s   e Zd ZddgZdef fddZdejfddZd	ejdd
fddZ	dej
fddZe	
	
	
	
	
	
	
	
	
	
	
ddeej deej deej deej deej deej deej deej dee dee dee deeef fddZ  ZS )AlbertForPreTrainingpredictions.decoder.biaspredictions.decoder.weightr]   c                    s6   t  | t|| _t|| _t|| _|   d S r   )	r|   r}   rD  r.  r7  r'   AlbertSOPHeadsop_classifierrJ  r   r   rm   rn   r}     s
   


zAlbertForPreTraining.__init__r   c                 C   rK  r   r'   decoderrM  rm   rm   rn   get_output_embeddings  rO  z*AlbertForPreTraining.get_output_embeddingsnew_embeddingsNc                 C   rP  r   rm  r   rp  rm   rm   rn   set_output_embeddings  rR  z*AlbertForPreTraining.set_output_embeddingsc                 C   
   | j jjS r   r.  r(   r   rM  rm   rm   rn   rN       
z)AlbertForPreTraining.get_input_embeddingsr   r   ry   rt   r   r   labelssentence_order_labelr   r  r!  c                 C   s   |dur|n| j j}| j|||||||	|
|d	}|dd \}}| |}| |}d}|durU|durUt }||d| j j|d}||dd|d}|| }|sl||f|dd  }|durj|f| S |S t||||j	|j
dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```Nr   ry   rt   r   r   r   r  r!  r;   ru   )r@  rA  rB  r   r%  )r]   rZ  r.  r'   rl  r   r   r   r?  r   r%  )r   r   r   ry   rt   r   r   rv  rw  r   r  r!  r  re  rf  prediction_scores
sop_scores
total_lossloss_fctmasked_lm_losssentence_order_lossoutputrm   rm   rn   r     s>   )

zAlbertForPreTraining.forwardNNNNNNNNNNN)r   r   r   _tied_weights_keysr   r}   r   r   ro  rr  r~   rN  r   r   rY   r   r   r   r   r?  r   r   r   rm   rm   r   rn   rh    sV    
	

rh  c                       sB   e Zd Zdef fddZdejdejfddZdd	d
Z  Z	S )r7  r]   c                    sp   t    tj|j|jd| _tt|j	| _
t|j|j| _t|j|j	| _t|j | _| j
| j_
d S )Nrr   )r|   r}   r   r$   r   r   	ParameterrY   r   r   r8   r   r   r   rn  r
   r  r  r   r   rm   rn   r}   M  s   
zAlbertMLMHead.__init__r   r   c                 C   s0   |  |}| |}| |}| |}|}|S r   )r   r  r$   rn  )r   r   ry  rm   rm   rn   r   W  s   



zAlbertMLMHead.forwardNc                 C   s,   | j jjjdkr| j| j _d S | j j| _d S )Nmeta)rn  r8   r   r   rM  rm   rm   rn   _tie_weightsa  s   zAlbertMLMHead._tie_weights)r   N)
r   r   r   r   r}   rY   r   r   r  r   rm   rm   r   rn   r7  L  s    

r7  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )rk  r]   c                    s.   t    t|j| _t|j|j| _	d S r   )
r|   r}   r   r   classifier_dropout_probr   r   r   
num_labelsr:   r   r   rm   rn   r}   k  s   
zAlbertSOPHead.__init__rf  r   c                 C   s   |  |}| |}|S r   )r   r:   )r   rf  dropout_pooled_outputlogitsrm   rm   rn   r   q  s   

zAlbertSOPHead.forward)	r   r   r   r   r}   rY   r   r   r   rm   rm   r   rn   rk  j  s    rk  c                       s   e Zd ZddgZ fddZdejfddZdejdd	fd
dZdej	fddZ
e																				ddeej deej deej deej deej deej deej dee dee dee deeef fddZ  ZS )AlbertForMaskedLMri  rj  c                    s0   t  | t|dd| _t|| _|   d S NF)rE  )r|   r}   rD  r.  r7  r'   rJ  r   r   rm   rn   r}   {  s   
zAlbertForMaskedLM.__init__r   c                 C   rK  r   rm  rM  rm   rm   rn   ro    rO  z'AlbertForMaskedLM.get_output_embeddingsrp  Nc                 C   s   || j _|j| j _d S r   )r'   rn  r8   rq  rm   rm   rn   rr    s   z'AlbertForMaskedLM.set_output_embeddingsc                 C   rs  r   rt  rM  rm   rm   rn   rN    ru  z&AlbertForMaskedLM.get_input_embeddingsr   r   ry   rt   r   r   rv  r   r  r!  c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur8t }||d| j j|d}|
sN|f|dd  }|durL|f| S |S t|||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        N	r   r   ry   rt   r   r   r   r  r!  r   ru   r;   r@  r  r   r%  )
r]   rZ  r.  r'   r   r   r   r   r   r%  )r   r   r   ry   rt   r   r   rv  r   r  r!  r  sequence_outputsry  r}  r|  r  rm   rm   rn   r     s6   1
zAlbertForMaskedLM.forward
NNNNNNNNNN)r   r   r   r  r}   r   r   ro  rr  r~   rN  r   r   rY   r   r   r   r   r   r   r   r   rm   rm   r   rn   r  w  sP    		

r  z
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                          e Zd Zdef fddZe										ddeej deej	 deej deej d	eej	 d
eej	 deej dee
 dee
 dee
 deeef fddZ  ZS )AlbertForSequenceClassificationr]   c                    sR   t  | |j| _|| _t|| _t|j| _	t
|j| jj| _|   d S r   )r|   r}   r  r]   rD  r.  r   r   r  r   r   r   r:   rJ  r   r   rm   rn   r}     s   
z(AlbertForSequenceClassification.__init__Nr   r   ry   rt   r   r   rv  r   r  r!  r   c                 C   sr  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur| j jdu rV| jdkr<d| j _n| jdkrR|jtj	ksM|jtj
krRd| j _nd| j _| j jdkrtt }| jdkrn|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   
regressionsingle_label_classificationmulti_label_classificationru   r;   r  )r]   rZ  r.  r   r:   problem_typer  r{   rY   r   rT   r   squeezer   r   r   r   r   r%  )r   r   r   ry   rt   r   r   rv  r   r  r!  r  rf  r  r@  r|  r  rm   rm   rn   r     sV   



"


z'AlbertForSequenceClassification.forwardr  )r   r   r   r   r}   r   r   rY   r   r   r   r   r   r   r   r   rm   rm   r   rn   r    sH    	

r  c                       r  )AlbertForTokenClassificationr]   c                    sd   t  | |j| _t|dd| _|jd ur|jn|j}t|| _	t
|j| jj| _|   d S r  )r|   r}   r  rD  r.  r  r   r   r   r   r   r   r]   r:   rJ  )r   r]   r  r   rm   rn   r}   @  s   
z%AlbertForTokenClassification.__init__Nr   r   ry   rt   r   r   rv  r   r  r!  r   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur<t }||d| j|d}|
sR|f|dd  }|durP|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nrx  r   ru   r;   r  )r]   rZ  r.  r   r:   r   r   r  r   r   r%  )r   r   r   ry   rt   r   r   rv  r   r  r!  r  re  r  r@  r|  r  rm   rm   rn   r   P  s8   

z$AlbertForTokenClassification.forwardr  )r   r   r   r   r}   r   r   rY   r   r   r   r   r   r   r   r   rm   rm   r   rn   r  >  sH    	

r  c                       s   e Zd Zdef fddZe											ddeej deej	 deej deej d	eej	 d
eej	 deej deej dee
 dee
 dee
 deeef fddZ  ZS )AlbertForQuestionAnsweringr]   c                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r  )
r|   r}   r  rD  r.  r   r   r   
qa_outputsrJ  r   r   rm   rn   r}     s
   z#AlbertForQuestionAnswering.__init__Nr   r   ry   rt   r   r   start_positionsend_positionsr   r  r!  r   c                 C   sH  |d ur|n| j j}| j|||||||	|
|d	}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrO|d}t| dkr\|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nr  r   r   ru   r   )ignore_indexr;   )r@  start_logits
end_logitsr   r%  )r]   rZ  r.  r  rO   r  r   rN   r   clampr   r   r   r%  )r   r   r   ry   rt   r   r   r  r  r   r  r!  r  re  r  r  r  r{  ignored_indexr|  
start_lossend_lossr  rm   rm   rn   r     sP   






z"AlbertForQuestionAnswering.forwardr  r   r   r   r   r}   r   r   rY   r   r   r   r   r?  r   r   r   rm   rm   r   rn   r    sN    
	

r  c                       r  )AlbertForMultipleChoicer]   c                    s@   t  | t|| _t|j| _t|j	d| _
|   d S )Nr   )r|   r}   rD  r.  r   r   r  r   r   r   r:   rJ  r   r   rm   rn   r}     s
   
z AlbertForMultipleChoice.__init__Nr   r   ry   rt   r   r   rv  r   r  r!  r   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        Nr   ru   r   rx  r;   r  )r]   rZ  rV   r   r   r.  r   r:   r   r   r   r%  )r   r   r   ry   rt   r   r   rv  r   r  r!  num_choicesr  rf  r  reshaped_logitsr@  r|  r  rm   rm   rn   r     sL   ,


zAlbertForMultipleChoice.forwardr  r  rm   rm   r   rn   r    sH    
	

r  )	ro   r-  rD  rh  r  r  r  r  r  )?r   r   rC   dataclassesr   typingr   r   rY   r   torch.nnr   r   r   activationsr
   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   r   utilsr   r   r   configuration_albertr   
get_loggerr   rA   ro   Modulerp   r   r   r  r   r  r  r-  r?  rD  rh  r7  rk  r  r  r  r  r  __all__rm   rm   rm   rn   <module>   sn   $	
~Dp7)#5 fiWGMf