"""PyTorch ALBERT model."""

from collections.abc import Callable
from dataclasses import dataclass

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ... import initialization as init
from ...activations import ACT2FN
from ...masking_utils import create_bidirectional_mask
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, logging
from ...utils.generic import can_return_tuple, merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_albert import AlbertConfig


logger = logging.get_logger(__name__)


class AlbertEmbeddings(nn.Module):
    """
    Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config: AlbertConfig):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, max position embeddings) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        batch_size, seq_length = input_shape

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Use the registered all-zeros buffer when token_type_ids is not passed; this helps users
        # tracing the model without providing token_type_ids explicitly.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids.expand(input_shape[0], -1)
                buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        position_embeddings = self.position_embeddings(position_ids)
        embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    # Dot product between "query" and "key" gives the raw attention scores, which are then scaled.
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
class AlbertAttention(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.scaling = self.attention_head_size**-0.5

        self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.output_dropout = nn.Dropout(config.hidden_dropout_prob)

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout.p,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.dense(attn_output)
        attn_output = self.output_dropout(attn_output)
        attn_output = self.LayerNorm(hidden_states + attn_output)
        return attn_output, attn_weights


class AlbertLayer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = AlbertAttention(config)
        self.ffn = nn.Linear(config.hidden_size, config.intermediate_size)
        self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
        self.activation = ACT2FN[config.hidden_act]
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        attention_output, _ = self.attention(hidden_states, attention_mask, **kwargs)
        ffn_output = apply_chunking_to_forward(
            self.ff_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output,
        )
        hidden_states = self.full_layer_layer_norm(ffn_output + attention_output)
        return hidden_states

    def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor:
        ffn_output = self.ffn(attention_output)
        ffn_output = self.activation(ffn_output)
        ffn_output = self.ffn_output(ffn_output)
        return ffn_output


class AlbertLayerGroup(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        for albert_layer in self.albert_layers:
            hidden_states = albert_layer(hidden_states, attention_mask, **kwargs)
        return hidden_states


class AlbertTransformer(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.config = config
        self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
        self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = self.embedding_hidden_mapping_in(hidden_states)

        for i in range(self.config.num_hidden_layers):
            # Index of the hidden group for the i-th layer: ALBERT shares parameters across layers,
            # so `num_hidden_layers` forward passes are routed through only `num_hidden_groups`
            # groups of weights (e.g. 12 layers with 1 group all map to group 0).
            group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups))
            hidden_states = self.albert_layer_groups[group_idx](hidden_states, attention_mask, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


@auto_docstring
class AlbertPreTrainedModel(PreTrainedModel):
    config_class = AlbertConfig
    base_model_prefix = "albert"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": AlbertLayer,
        "attentions": AlbertAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, nn.Linear):
            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                init.zeros_(module.weight[module.padding_idx])
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, AlbertMLMHead):
            init.zeros_(module.bias)
        elif isinstance(module, AlbertEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
            init.zeros_(module.token_type_ids)
@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`AlbertForPreTraining`].
    """
)
class AlbertForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    """

    loss: torch.FloatTensor | None = None
    prediction_logits: torch.FloatTensor | None = None
    sop_logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor] | None = None
    attentions: tuple[torch.FloatTensor] | None = None


@auto_docstring
class AlbertModel(AlbertPreTrainedModel):
    config_class = AlbertConfig
    base_model_prefix = "albert"

    def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)

        self.config = config
        self.embeddings = AlbertEmbeddings(config)
        self.encoder = AlbertTransformer(config)
        if add_pooling_layer:
            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
            self.pooler_activation = nn.Tanh()
        else:
            self.pooler = None
            self.pooler_activation = None

        self.attn_implementation = config._attn_implementation

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.embeddings.word_embeddings = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        attention_mask = create_bidirectional_mask(
            config=self.config,
            inputs_embeds=embedding_output,
            attention_mask=attention_mask,
        )

        encoder_outputs = self.encoder(embedding_output, attention_mask, **kwargs)

        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
        )
@auto_docstring(
    custom_intro="""
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    """
)
class AlbertForPreTraining(AlbertPreTrainedModel):
    _tied_weights_keys = {
        "predictions.decoder.weight": "albert.embeddings.word_embeddings.weight",
        "predictions.decoder.bias": "predictions.bias",
    }

    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.predictions = AlbertMLMHead(config)
        self.sop_classifier = AlbertSOPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        sentence_order_label: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | AlbertForPreTrainingOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then
            sequence B), `1` indicates switched order (sequence B, then sequence A).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, AlbertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> sop_logits = outputs.sop_logits
        ```
        """
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )

        sequence_output, pooled_output = outputs[:2]

        prediction_scores = self.predictions(sequence_output)
        sop_scores = self.sop_classifier(pooled_output)

        total_loss = None
        if labels is not None and sentence_order_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
            total_loss = masked_lm_loss + sentence_order_loss

        return AlbertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            sop_logits=sop_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AlbertMLMHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.decoder(hidden_states)

        prediction_scores = hidden_states

        return prediction_scores
class AlbertSOPHead(nn.Module):
    def __init__(self, config: AlbertConfig):
        super().__init__()

        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
        dropout_pooled_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_pooled_output)
        return logits


@auto_docstring
class AlbertForMaskedLM(AlbertPreTrainedModel):
    _tied_weights_keys = {
        "predictions.decoder.weight": "albert.embeddings.word_embeddings.weight",
        "predictions.decoder.bias": "predictions.bias",
    }

    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.predictions = AlbertMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.predictions.decoder

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.predictions.decoder = new_embeddings
        self.predictions.bias = new_embeddings.bias

    def get_input_embeddings(self) -> nn.Embedding:
        return self.albert.embeddings.word_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | MaskedLMOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlbertForMaskedLM

        >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        """
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )
        sequence_outputs = outputs[0]

        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
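

# An equivalent high-level route to `AlbertForMaskedLM` (illustrative only, not part of the
# upstream module): the `fill-mask` pipeline bundles the tokenizer with the mask-token argmax
# logic from the docstring example above.
#
#   >>> from transformers import pipeline
#   >>> unmasker = pipeline("fill-mask", model="albert/albert-base-v2")
#   >>> unmasker("The capital of [MASK] is Paris.")[0]["token_str"]
#   'france'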
@auto_docstring(
    custom_intro="""
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | SequenceClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class AlbertForTokenClassification(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        classifier_dropout_prob = (
            config.classifier_dropout_prob
            if config.classifier_dropout_prob is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | TokenClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        start_positions: torch.LongTensor | None = None,
        end_positions: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | QuestionAnsweringModelOutput:
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class AlbertForMultipleChoice(AlbertPreTrainedModel):
    def __init__(self, config: AlbertConfig):
        super().__init__(config)

        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | MultipleChoiceModelOutput:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see
            *input_ids* above)
        """
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Flatten (batch, num_choices, seq_len) inputs into (batch * num_choices, seq_len) so each
        # choice runs through the encoder as its own sequence.
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "AlbertPreTrainedModel",
    "AlbertModel",
    "AlbertForPreTraining",
    "AlbertForMaskedLM",
    "AlbertForSequenceClassification",
    "AlbertForTokenClassification",
    "AlbertForQuestionAnswering",
    "AlbertForMultipleChoice",
]
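

# A self-contained smoke test (an illustrative sketch, not part of the upstream module): build a
# tiny randomly initialized ALBERT and run one forward pass with no downloaded weights. Every
# config value below is an arbitrary small number chosen only to keep the check fast.
if __name__ == "__main__":
    _tiny_config = AlbertConfig(
        vocab_size=128,
        embedding_size=16,
        hidden_size=32,
        num_hidden_layers=2,
        num_hidden_groups=1,
        num_attention_heads=4,
        intermediate_size=64,
        max_position_embeddings=64,
    )
    _tiny_model = AlbertModel(_tiny_config)
    _tiny_ids = torch.randint(0, _tiny_config.vocab_size, (2, 10))
    _tiny_out = _tiny_model(input_ids=_tiny_ids)
    print(_tiny_out.last_hidden_state.shape)  # expected: torch.Size([2, 10, 32])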