o
    i                    @   s  d Z ddlZddlZddlmZ ddlmZmZmZ ddl	Z	ddl	m
Z
 ddlmZmZmZ ddlmZmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 e,1e2Z3dKddZ4G dd de
j5Z6G dd de
j5Z7G dd de
j5Z8de7iZ9G dd de
j5Z:G dd  d e
j5Z;G d!d" d"e
j5Z<G d#d$ d$eZ=G d%d& d&e
j5Z>G d'd( d(e
j5Z?G d)d* d*e
j5Z@e+G d+d, d,e$ZAee+d-d.G d/d0 d0e*ZBe+G d1d2 d2eAZCG d3d4 d4e
j5ZDG d5d6 d6e
j5ZEe+d7d.G d8d9 d9eAZFe+d:d.G d;d< d<eAZGe+d=d.G d>d? d?eAZHe+d@d.G dAdB dBeAZIe+G dCdD dDeAZJe+G dEdF dFeAZKe+dGd.G dHdI dIeAeZLg dJZMdS )LzPyTorch ELECTRA model.    N)	dataclass)CallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging)deprecate_kwarg   )ElectraConfigdiscriminatorc                 C   s  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }	g }
|D ] \}}t	d| d|  |j
||}|	| |
| q6t|	|
D ]#\}}|}z t| trp|dd}|d	kr|d
d}|dd
}|dd}|dd}|d}tdd |D rt	d|  W q\| }|D ]f}|d|r|d|}n|g}|d dks|d dkrt|d}n1|d dks|d dkrt|d}n|d dkrt|d}n|d dkrt|d}nt||d }t|d krt|d! }|| }q|d"rt|d}n
|dkr%||}z|j|jkr:td#|j d$|j d%W n tyT } z| j|j|jf7  _ d}~ww td&| | t||_ W q\ t!y } ztd| || W Y d}~q\d}~ww | S )'z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape zelectra/embeddings/zgenerator/embeddings/	generatorzelectra/zdiscriminator/z
generator/dense_1dense_predictionz!generator_predictions/output_biaszgenerator_lm_head/bias/c                 s   s    | ]}|d v V  qdS ))global_steptemperatureN ).0nr+   r+   `/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/electra/modeling_electra.py	<genexpr>\   s    z-load_tf_weights_in_electra.<locals>.<genexpr>z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r"   _embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )"renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzip
isinstanceElectraForMaskedLMreplacesplitany	fullmatchgetattrlenintendswith	transposeshape
ValueErrorargsprinttorch
from_numpydataAttributeError)modelconfigtf_checkpoint_pathdiscriminator_or_generatorr;   nptftf_path	init_varsnamesarraysnamerU   arrayoriginal_namepointerm_namescope_namesnumer+   r+   r.   load_tf_weights_in_electra2   s   




ro   c                       sh   e Zd ZdZ fddZ					ddeej deej deej d	eej d
e	dej
fddZ  ZS )ElectraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd t|dd| _| jd	tj| j tjd
dd d S )N)padding_idxepsposition_ids)r"   F)
persistentposition_embedding_typeabsolutetoken_type_idsdtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrY   arangeexpandrP   rw   zerosrt   sizelongselfr^   	__class__r+   r.   r}      s   

zElectraEmbeddings.__init__Nr   	input_idsry   rt   inputs_embedspast_key_values_lengthreturnc                 C   s   |d ur	|  }n|  d d }|d }|d u r&| jd d ||| f }|d u rPt| drE| jd d d |f }||d |}	|	}ntj|tj| jjd}|d u rY| 	|}| 
|}
||
 }| jdkrp| |}||7 }| |}| |}|S )Nru   r"   ry   r   r{   devicerx   )r   rt   hasattrry   r   rY   r   r   r   r   r   rw   r   r   r   )r   r   ry   rt   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   r+   r+   r.   forward   s,   







zElectraEmbeddings.forward)NNNNr   )__name__
__module____qualname____doc__r}   r   rY   
LongTensorFloatTensorrR   Tensorr   __classcell__r+   r+   r   r.   rp      s*    rp   c                       s   e Zd Zd fdd	Zedddd						dd	ejd
eej deej deej dee	 dee
 deej deej fddZ  ZS )ElectraSelfAttentionNc                    s  t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|p\t|dd| _| jdksh| jd	kry|j| _t	d
|j d | j| _|j| _|| _d S )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads ()rw   rx   relative_keyrelative_key_queryr9   r"   )r|   r}   hidden_sizenum_attention_headsr   rV   rR   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   rP   rw   r   r~   distance_embedding
is_decoder	layer_idxr   r^   rw   r   r   r+   r.   r}      s,   


zElectraSelfAttention.__init__past_key_valuepast_key_values4.58new_nameversionFhidden_statesattention_mask	head_maskencoder_hidden_statesoutput_attentionscache_positionr   c                 C   s   |j \}}	}
| |}||d| j| jdd}d}|d u}|d ur;t|tr9|j	| j
}|r5|j}n|j}n|}|r?|n|}|rX|d urX|rX|j| j
 j}|j| j
 j}nJ| |}||d| j| jdd}| |}||d| j| jdd}|d ur|s|nd }|||| j
d|i\}}|rt|trd|j| j
< t||dd}| jdks| jd	kr4|j d |j d }}|d urtj|d tj|jd
dd}ntj|tj|jd
dd}tj|tj|jd
dd}|| }| || j d }|j|jd}| jdkrtd||}|| }n| jd	kr4td||}td||}|| | }|t | j }|d urE|| }t!j"j#|dd}| $|}|d ur[|| }t||}|%dddd& }|' d d | j(f }||}||fS )Nru   r"   r9   Fr   Tr   r   r   rz   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r
   ))rU   r   viewr   r   rT   rJ   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updaterY   matmulrw   tensorr   r   r   r   r   tor{   einsummathsqrtr   
functionalsoftmaxr   permute
contiguousr   r   )r   r   r   r   r   r   r   r   
batch_sizer   _query_layerr   is_cross_attentioncurr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shaper+   r+   r.   r      s   








zElectraSelfAttention.forwardNNNNNNFN)r   r   r   r}   r!   rY   r   r   r   r   booltupler   r   r+   r+   r   r.   r      s4    	r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )ElectraSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nrr   )r|   r}   r   r   r   denser   r   r   r   r   r   r   r+   r.   r}   N     
zElectraSelfOutput.__init__r   input_tensorr   c                 C   &   |  |}| |}| || }|S Nr   r   r   r   r   r   r+   r+   r.   r   T     

zElectraSelfOutput.forwardr   r   r   r}   rY   r   r   r   r+   r+   r   r.   r   M      $r   eagerc                       s   e Zd Zd fdd	Zdd Zedddd						
	ddejdeej	 deej	 deej	 dee
 dee deej deej fddZ  ZS )ElectraAttentionNc                    s6   t    t|j |||d| _t|| _t | _d S )Nrw   r   )	r|   r}   ELECTRA_SELF_ATTENTION_CLASSES_attn_implementationr   r   outputsetpruned_headsr   r   r+   r.   r}   b  s   

zElectraAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r"   r   )rQ   r   r   r   r   r  r   r   r   r   r	  r   r   union)r   headsindexr+   r+   r.   prune_headsl  s   zElectraAttention.prune_headsr   r   r   r   Fr   r   r   r   r   r   r   c              	   C   s>   | j |||||||d}| |d |}	|	f|dd   }
|
S )Nr   r   r   r   r   r   r   r"   )r   r	  )r   r   r   r   r   r   r   r   self_outputsattention_outputoutputsr+   r+   r.   r   ~  s   	zElectraAttention.forwardr   r   )r   r   r   r}   r  r!   rY   r   r   r   r   r   r   r   r   r+   r+   r   r.   r  a  s6    
	r  c                       s2   e Zd Z fddZdejdejfddZ  ZS )ElectraIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r|   r}   r   r   r   intermediate_sizer   rJ   
hidden_actstrr   intermediate_act_fnr   r   r+   r.   r}     s
   
zElectraIntermediate.__init__r   r   c                 C   s   |  |}| |}|S r   )r   r  )r   r   r+   r+   r.   r     s   

zElectraIntermediate.forwardr  r+   r+   r   r.   r    s    r  c                       r   )ElectraOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r|   r}   r   r   r  r   r   r   r   r   r   r   r   r   r+   r.   r}     r   zElectraOutput.__init__r   r   r   c                 C   r   r   r   r   r+   r+   r.   r     r  zElectraOutput.forwardr  r+   r+   r   r.   r    r  r  c                       s   e Zd Zd fdd	Zedddd							dd	ejd
eej deej deej deej dee	 dee
 deej deej fddZdd Z  ZS )ElectraLayerNc                    sx   t    |j| _d| _t||d| _|j| _|j| _| jr0| js(t|  dt|d|d| _	t
|| _t|| _d S )Nr"   r   z> should be used as a decoder model if cross attention is addedrx   r  )r|   r}   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionrV   crossattentionr  intermediater  r	  r   r^   r   r   r+   r.   r}     s   

zElectraLayer.__init__r   r   r   r   Fr   r   r   r   encoder_attention_maskr   r   r   c	              	   C   s   | j ||||||d}	|	d }
|	dd  }| jrA|d urAt| ds)td|  d| j|
||||||d}|d }
||dd   }t| j| j| j|
}|f| }|S )N)r   r   r   r   r   r   r"   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r  )	r  r   r   rV   r   r   feed_forward_chunkr  r  )r   r   r   r   r   r#  r   r   r   self_attention_outputsr  r  cross_attention_outputslayer_outputr+   r+   r.   r     s>   

	
zElectraLayer.forwardc                 C   s   |  |}| ||}|S r   )r!  r	  )r   r  intermediate_outputr'  r+   r+   r.   r$    s   
zElectraLayer.feed_forward_chunkr   )NNNNNFN)r   r   r   r}   r!   rY   r   r   r   r   r   r   r   r$  r   r+   r+   r   r.   r    s<    	
0r  c                       s   e Zd Zd fdd	Z										ddejdeej deej d	eej d
eej dee dee	 dee	 dee	 dee	 deej de
eej ef fddZ  ZS )ElectraEncoderNc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  |d qS )r  )r  )r,   ir^   r+   r.   
<listcomp>  s    z+ElectraEncoder.__init__.<locals>.<listcomp>F)	r|   r}   r^   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr"  r   r+  r.   r}     s   
 
zElectraEncoder.__init__FTr   r   r   r   r#  r   	use_cacher   output_hidden_statesreturn_dictr   r   c                 C   s`  |	rdnd }|r
dnd }|r| j jrdnd }| jr%| jr%|r%td d}|r<| j jr<|d u r<tt| j dt| j d}|rQ| j jrQt	|t
rQtd t|}t| jD ]9\}}|	ra||f }|d uri|| nd }|||||||||d}|d }|r||d f }| j jr||d	 f }qV|	r||f }|
st
d
d |||||fD S t|||||dS )Nr+   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr+  zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)r#  r   r   r   r   r"   r9   c                 s   s    | ]	}|d ur|V  qd S r   r+   )r,   vr+   r+   r.   r/   D  s    z)ElectraEncoder.forward.<locals>.<genexpr>)last_hidden_stater   r   
attentionscross_attentions)r^   r  r1  trainingr?   warning_oncer   r   r   rJ   r   from_legacy_cache	enumerater0  r   )r   r   r   r   r   r#  r   r2  r   r3  r4  r   all_hidden_statesall_self_attentionsall_cross_attentionsr*  layer_modulelayer_head_masklayer_outputsr+   r+   r.   r     sl   


zElectraEncoder.forwardr   )
NNNNNNFFTN)r   r   r   r}   rY   r   r   r   r   r   r   r   r   r   r   r+   r+   r   r.   r)    sJ    		
r)  c                       (   e Zd ZdZ fddZdd Z  ZS )ElectraDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.c                    sB   t    t|j|j| _t|j| _t|jd| _	|| _
d S Nr"   )r|   r}   r   r   r   r   r   r  
activationr'   r^   r   r   r+   r.   r}   [  s
   

z(ElectraDiscriminatorPredictions.__init__c                 C   s(   |  |}| |}| |d}|S )Nru   )r   rF  r'   squeeze)r   discriminator_hidden_statesr   logitsr+   r+   r.   r   c  s   

z'ElectraDiscriminatorPredictions.forwardr   r   r   r   r}   r   r   r+   r+   r   r.   rD  X  s    rD  c                       rC  )ElectraGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                    s>   t    td| _tj|j|jd| _t|j	|j| _
d S )Ngelurr   )r|   r}   r   rF  r   r   r   r   r   r   r   r   r   r+   r.   r}   n  s   

z$ElectraGeneratorPredictions.__init__c                 C   s"   |  |}| |}| |}|S r   )r   rF  r   )r   generator_hidden_statesr   r+   r+   r.   r   u  s   


z#ElectraGeneratorPredictions.forwardrJ  r+   r+   r   r.   rK  k  s    rK  c                   @   s*   e Zd ZU eed< eZdZdZdd Z	dS )ElectraPreTrainedModelr^   electraTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS dS )zInitialize the weightsg        )meanstdNg      ?)rJ   r   r   r2   r[   normal_r^   initializer_ranger5   zero_r~   rq   r   fill_)r   moduler+   r+   r.   _init_weights  s   

z$ElectraPreTrainedModel._init_weightsN)
r   r   r   r#   __annotations__ro   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingrW  r+   r+   r+   r.   rN  }  s   
 rN  z3
    Output type of [`ElectraForPreTraining`].
    )custom_introc                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )ElectraForPreTrainingOutputa+  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss of the ELECTRA objective.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    NlossrI  r   r7  )r   r   r   r   r^  r   rY   r   rX  rI  r   r   r7  r+   r+   r+   r.   r]    s   
 r]  c                        s   e Zd Z fddZdd Zdd Zdd Ze																										dd
ee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee dee dee dee deee	j
 ef fddZ  ZS )ElectraModelc                    sP   t  | t|| _|j|jkrt|j|j| _t	|| _
|| _|   d S r   )r|   r}   rp   r   r   r   r   r   embeddings_projectr)  encoderr^   	post_initr   r   r+   r.   r}     s   

zElectraModel.__init__c                 C   s   | j jS r   r   r   r   r+   r+   r.   get_input_embeddings  s   z!ElectraModel.get_input_embeddingsc                 C   s   || j _d S r   rc  )r   r   r+   r+   r.   set_input_embeddings  s   z!ElectraModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsra  r0  r  r  )r   heads_to_pruner0  r  r+   r+   r.   _prune_heads  s   zElectraModel._prune_headsNr   r   ry   rt   r   r   r   r#  r   r2  r   r3  r4  r   c                 C   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }n|d urF| d d }ntd|\}}|d urU|jn|j}d}|	d urpt|	t	sl|	d d j
d n|	 }|d u r{tj||d}|d u rt| jdr| jjd d d |f }|||}|}n	tj|tj|d}| ||}| j jr|d ur| \}}}||f}|d u rtj||d}| |}nd }| || j j}| j|||||d	}t| d
r| |}| j||||||	|
|||d
}|S )NzDYou cannot specify both input_ids and inputs_embeds at the same timeru   z5You have to specify either input_ids or inputs_embedsr   r   )r   ry   r   )r   rt   ry   r   r   r`  )	r   r   r   r#  r   r2  r   r3  r4  )r^   r   r3  use_return_dictrV   %warn_if_padding_and_no_attention_maskr   r   rJ   r   rU   get_seq_lengthrY   onesr   r   ry   r   r   r   get_extended_attention_maskr   invert_attention_maskget_head_maskr/  r`  ra  )r   r   r   ry   rt   r   r   r   r#  r   r2  r   r3  r4  r   r   r   r   r   r   r   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   r+   r+   r.   r     sv   


zElectraModel.forward)NNNNNNNNNNNNN)r   r   r   r}   re  rf  ri  r   r   rY   r   r   r   r   r   r   r   r   r+   r+   r   r.   r_    s`    	
r_  c                       rC  )ElectraClassificationHeadz-Head for sentence-level classification tasks.c                    s^   t    t|j|j| _|jd ur|jn|j}td| _	t
|| _t|j|j| _d S )NrL  )r|   r}   r   r   r   r   classifier_dropoutr   r   rF  r   r   
num_labelsout_projr   r^   rw  r   r+   r.   r}   (  s   

z"ElectraClassificationHead.__init__c                 K   sL   |d d dd d f }|  |}| |}| |}|  |}| |}|S )Nr   )r   r   rF  ry  )r   featureskwargsxr+   r+   r.   r   2  s   




z!ElectraClassificationHead.forwardrJ  r+   r+   r   r.   rv  %  s    
rv  c                       sJ   e Zd ZdZdef fddZ	ddejdeej	 dejfd	d
Z
  ZS )ElectraSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`ElectraConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r^   c                    s   t    t|dd| _| jdkrtt | _t|dr<|j	r<t|dr1|j
r1|jdkr1|j}n|j}t|j|| _t|dd }|rHt|nt | _t | _t|drc|jdkrct|j| _t | _t|d	r{|jdkr}t|j| _d S d S d S )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r|   r}   rP   r  NotImplementedErrorr   Identitysummaryr   r  r  rx  r   r   r   rF  first_dropoutr  r   last_dropoutr  )r   r^   num_classesactivation_stringr   r+   r.   r}   W  s&   




zElectraSequenceSummary.__init__Nr   	cls_indexr   c                 C   s  | j dkr|dddf }ne| j dkr|dddf }nW| j dkr(|jdd}nK| j d	krl|du rItj|d
ddddf |jd d tjd}n|dd}|d| d  |	df }|
d|d}n| j dkrst| |}| |}| |}| |}|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r  Nru   firstr   rP  r"   r   r  .r   rz   )ru   r  )r  rP  rY   	full_likerU   r   	unsqueezer   r   r   gatherrG  r  r  r  rF  r  )r   r   r  r	  r+   r+   r.   r   t  s.   



"




zElectraSequenceSummary.forwardr   )r   r   r   r   r#   r}   rY   r   r   r   r   r   r+   r+   r   r.   r~  =  s    r~  z
    ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee dee dee de	e
ej ef fddZ  ZS ) ElectraForSequenceClassificationc                    s:   t  | |j| _|| _t|| _t|| _|   d S r   )	r|   r}   rx  r^   r_  rO  rv  r8   rb  r   r   r+   r.   r}     s   

z)ElectraForSequenceClassification.__init__Nr   r   ry   rt   r   r   labelsr   r3  r4  r   c                 C   sh  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur| j jdu rQ| jdkr7d| j _n| jdkrM|jtjksH|jtj	krMd| j _nd| j _| j jdkrot
 }| jdkri|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   ry   rt   r   r   r   r3  r4  r   r"   
regressionsingle_label_classificationmulti_label_classificationru   r^  rI  r   r7  )r^   rj  rO  r8   problem_typerx  r{   rY   r   rR   r	   rG  r   r   r   r   r   r7  )r   r   r   ry   rt   r   r   r  r   r3  r4  rH  sequence_outputrI  r^  loss_fctr	  r+   r+   r.   r     sT   


"


z(ElectraForSequenceClassification.forward
NNNNNNNNNN)r   r   r   r}   r   r   rY   r   r   r   r   r   r   r   r+   r+   r   r.   r    sH    
	
r  z
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    c                       r  )ElectraForPreTrainingc                    s,   t  | t|| _t|| _|   d S r   )r|   r}   r_  rO  rD  discriminator_predictionsrb  r   r   r+   r.   r}     s   

zElectraForPreTraining.__init__Nr   r   ry   rt   r   r   r  r   r3  r4  r   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur_t }|durQ|d|jd dk}|d|jd | }|| }||| }n||d|jd | }|
su|f|dd  }|durs|f| S |S t	|||j
|jdS )am  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring)
            Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Examples:

        ```python
        >>> from transformers import ElectraForPreTraining, AutoTokenizer
        >>> import torch

        >>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

        >>> sentence = "The quick brown fox jumps over the lazy dog"
        >>> fake_sentence = "The quick brown fox fake over the lazy dog"

        >>> fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True)
        >>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt")
        >>> discriminator_outputs = discriminator(fake_inputs)
        >>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)

        >>> fake_tokens
        ['[CLS]', 'the', 'quick', 'brown', 'fox', 'fake', 'over', 'the', 'lazy', 'dog', '[SEP]']

        >>> predictions.squeeze().tolist()
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        ```Nr  r   ru   r"   r  )r^   rj  rO  r  r   r   r   rU   floatr]  r   r7  )r   r   r   ry   rt   r   r   r  r   r3  r4  rH  discriminator_sequence_outputrI  r^  r  active_lossactive_logitsactive_labelsr	  r+   r+   r.   r   	  s@   -
zElectraForPreTraining.forwardr  )r   r   r   r}   r   r   rY   r   r   r   r   r]  r   r   r+   r+   r   r.   r    sH    	
r  z
    Electra model with a language modeling head on top.

    Even though both the discriminator and generator may be loaded into this model, the generator is the only model of
    the two to have been trained for the masked language modeling task.
    c                       s   e Zd ZdgZ fddZdd Zdd Ze										dd	ee	j
 d
ee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee dee deee	j
 ef fddZ  ZS )rK   generator_lm_head.weightc                    s>   t  | t|| _t|| _t|j|j	| _
|   d S r   )r|   r}   r_  rO  rK  generator_predictionsr   r   r   r   generator_lm_headrb  r   r   r+   r.   r}   i  s
   

zElectraForMaskedLM.__init__c                 C      | j S r   r  rd  r+   r+   r.   get_output_embeddingss     z(ElectraForMaskedLM.get_output_embeddingsc                 C   
   || _ d S r   r  )r   r   r+   r+   r.   set_output_embeddingsv     
z(ElectraForMaskedLM.set_output_embeddingsNr   r   ry   rt   r   r   r  r   r3  r4  r   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur>t }||d| j j|d}|
sT|f|dd  }|durR|f| S |S t	|||j
|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr  r   ru   r"   r  )r^   rj  rO  r  r  r   r   r   r   r   r   r7  )r   r   r   ry   rt   r   r   r  r   r3  r4  rM  generator_sequence_outputprediction_scoresr^  r  r	  r+   r+   r.   r   y  s8   

zElectraForMaskedLM.forwardr  )r   r   r   _tied_weights_keysr}   r  r  r   r   rY   r   r   r   r   r   r   r   r+   r+   r   r.   rK   ^  sN    	
	
rK   z
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    c                       r  )ElectraForTokenClassificationc                    s^   t  | |j| _t|| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S r   )r|   r}   rx  r_  rO  rw  r   r   r   r   r   r   r8   rb  rz  r   r+   r.   r}     s   
z&ElectraForTokenClassification.__init__Nr   r   ry   rt   r   r   r  r   r3  r4  r   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|dur<t }||d| j|d}|
sR|f|dd  }|durP|f| S |S t|||j	|j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   ru   r"   r  )r^   rj  rO  r   r8   r   r   rx  r   r   r7  )r   r   r   ry   rt   r   r   r  r   r3  r4  rH  r  rI  r^  r  r	  r+   r+   r.   r     s8   

z%ElectraForTokenClassification.forwardr  )r   r   r   r}   r   r   rY   r   r   r   r   r   r   r   r+   r+   r   r.   r    sH    	
r  c                       s   e Zd ZU eed< dZ fddZe											ddee	j
 dee	j
 dee	j
 d	ee	j
 d
ee	j
 dee	j
 dee	j
 dee	j
 dee dee dee deee	j
 ef fddZ  ZS )ElectraForQuestionAnsweringr^   rO  c                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r   )
r|   r}   rx  r_  rO  r   r   r   
qa_outputsrb  r   r   r+   r.   r}      s
   
z$ElectraForQuestionAnswering.__init__Nr   r   ry   rt   r   r   start_positionsend_positionsr   r3  r4  r   c              
   C   sF  |d ur|n| j j}| j|||||||	|
d}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrN|d}t| dkr[|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|dd   }|d ur|f| S |S t||||j|jdS )	N)r   ry   rt   r   r   r   r3  r   r"   ru   r   )ignore_indexr9   )r^  start_logits
end_logitsr   r7  )r^   rj  rO  r  rM   rG  r   rQ   r   clampr   r   r   r7  )r   r   r   ry   rt   r   r   r  r  r   r3  r4  rH  r  rI  r  r  
total_lossignored_indexr  
start_lossend_lossr	  r+   r+   r.   r   
  sV   







z#ElectraForQuestionAnswering.forward)NNNNNNNNNNN)r   r   r   r#   rX  rZ  r}   r   r   rY   r   r   r   r   r   r   r   r+   r+   r   r.   r    sR   
 
	
r  c                       r  )ElectraForMultipleChoicec                    s<   t  | t|| _t|| _t|jd| _	| 
  d S rE  )r|   r}   r_  rO  r~  sequence_summaryr   r   r   r8   rb  r   r   r+   r.   r}   P  s
   

z!ElectraForMultipleChoice.__init__Nr   r   ry   rt   r   r   r  r   r3  r4  r   c                 C   sn  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|durt }|||}|
s|f|dd  }|dur|f| S |S t	|||j
|jdS )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr"   ru   r   r  r   r  )r^   rj  rU   r   r   rO  r  r8   r   r   r   r7  )r   r   r   ry   rt   r   r   r  r   r3  r4  num_choicesrH  r  pooled_outputrI  reshaped_logitsr^  r  r	  r+   r+   r.   r   Z  sL   ,


z ElectraForMultipleChoice.forwardr  )r   r   r   r}   r   r   rY   r   r   r   r   r   r   r   r+   r+   r   r.   r  N  sH    
	
r  zS
    ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.
    c                "       s   e Zd ZdgZ fddZdd Zdd Ze														dd	ee	j
 d
ee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee	j
 dee dee dee dee dee deee	j
 ef fddZ  ZS )ElectraForCausalLMr  c                    sN   t  | |jstd t|| _t|| _t	
|j|j| _|   d S )NzOIf you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`)r|   r}   r   r?   warningr_  rO  rK  r  r   r   r   r   r  init_weightsr   r   r+   r.   r}     s   


zElectraForCausalLM.__init__c                 C   r  r   r  rd  r+   r+   r.   r    r  z(ElectraForCausalLM.get_output_embeddingsc                 C   r  r   r  )r   new_embeddingsr+   r+   r.   r    r  z(ElectraForCausalLM.set_output_embeddingsNr   r   ry   rt   r   r   r   r#  r  r   r2  r   r3  r4  r   c                 K   s   |dur|n| j j}|	durd}| j|||||||||
||||d}|d }| | |}d}|	durB| j||	fd| j ji|}|sX|f|dd  }|durV|f| S |S t|||j|j	|j
|jdS )a3  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ElectraForCausalLM, ElectraConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator")
        >>> config = ElectraConfig.from_pretrained("google/electra-base-generator")
        >>> config.is_decoder = True
        >>> model = ElectraForCausalLM.from_pretrained("google/electra-base-generator", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NF)r   ry   rt   r   r   r   r#  r   r2  r   r3  r4  r   r   r"   )r^  rI  r   r   r7  r8  )r^   rj  rO  r  r  loss_functionr   r   r   r   r7  r8  )r   r   r   ry   rt   r   r   r   r#  r  r   r2  r   r3  r4  r|  r  r  r  lm_lossr	  r+   r+   r.   r     sR   )zElectraForCausalLM.forward)NNNNNNNNNNNNNN)r   r   r   r  r}   r  r  r   r   rY   r   r   r   r   r   r   r   r   r+   r+   r   r.   r    sf    	
r  )
r  rK   r  r  r  r  r  r_  rN  ro   )r$   )Nr   r   rA   dataclassesr   typingr   r   r   rY   r   torch.nnr   r   r	   activationsr   r   cache_utilsr   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r    utils.deprecationr!   configuration_electrar#   
get_loggerr   r?   ro   Modulerp   r   r   r  r  r  r  r  r)  rD  rK  rN  r]  r_  rv  r~  r  r  rK   r  r  r  r  __all__r+   r+   r+   r.   <module>   s   (


RC 7GZ{cS^KCRgl