o
    i&                    @   s  d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z. e*/e0Z1G dd dej2Z3G dd dej2Z4G dd de4Z5G dd dej2Z6e4e5dZ7G dd dej2Z8G dd dej2Z9G d d! d!ej2Z:G d"d# d#eZ;G d$d% d%ej2Z<G d&d' d'ej2Z=e)G d(d) d)e#Z>G d*d+ d+ej2Z?G d,d- d-ej2Z@e)G d.d/ d/e>ZAe)G d0d1 d1e>ZBe)d2d3G d4d5 d5e>ZCe)G d6d7 d7e>ZDe)G d8d9 d9e>ZEe)G d:d; d;e>ZFe)d<d3G d=d> d>e>eZGdBd?d@ZHg dAZIdS )CzPyTorch CamemBERT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)#_prepare_4d_attention_mask_for_sdpa*_prepare_4d_causal_attention_mask_for_sdpa)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging)deprecate_kwarg   )CamembertConfigc                       s4   e Zd ZdZ fddZ	d
ddZdd	 Z  ZS )CamembertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _t|dd| _| jdt|jddd | jd	tj| j tjd
dd |j| _tj|j|j| jd| _	d S )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r!   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr'   register_buffertorcharangeexpandzerosr)   sizelongr$   selfconfig	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/camembert/modeling_camembert.pyr0   9   s"   
zCamembertEmbeddings.__init__Nr   c                 C   s   |d u r|d urt || j|}n| |}|d ur| }n| d d }|d }|d u rTt| drI| jd d d |f }||d |}	|	}ntj|tj	| j
jd}|d u r]| |}| |}
||
 }| jdkrt| |}||7 }| |}| |}|S )Nr*   r!   r,   r   r.   devicer(   )"create_position_ids_from_input_idsr$   &create_position_ids_from_inputs_embedsrE   hasattrr,   rC   rA   rD   rF   r)   rO   r5   r9   r'   r7   r:   r>   )rH   	input_idsr,   r)   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr9   
embeddingsr7   rL   rL   rM   forwardR   s0   








zCamembertEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr*   r!   rN   r   )rE   rA   rB   r$   rF   rO   	unsqueezerC   )rH   rT   rV   sequence_lengthr)   rL   rL   rM   rQ   z   s   	z:CamembertEmbeddings.create_position_ids_from_inputs_embeds)NNNNr   )__name__
__module____qualname____doc__r0   r[   rQ   __classcell__rL   rL   rJ   rM   r#   3   s    
(r#   c                       s   e Zd Zd fdd	Zedddd						dd	ejd
eej deej deej dee	 dee
 deej deej fddZ  ZS )CamembertSelfAttentionNc                    s  t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|p\t|dd| _| jdksh| jd	kry|j| _t	d
|j d | j| _|j| _|| _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r'   r(   relative_keyrelative_key_query   r!   )r/   r0   r3   num_attention_headsrR   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer<   attention_probs_dropout_probr>   r?   r'   r6   r1   distance_embedding
is_decoder	layer_idxrH   rI   r'   ru   rJ   rL   rM   r0      s,   


zCamembertSelfAttention.__init__past_key_valuepast_key_values4.58new_nameversionFhidden_statesattention_mask	head_maskencoder_hidden_statesoutput_attentionscache_positionreturnc                 C   s   |j \}}	}
| |}||d| j| jdd}d}|d u}|d ur;t|tr9|j	| j
}|r5|j}n|j}n|}|r?|n|}|rX|d urX|rX|j| j
 j}|j| j
 j}nJ| |}||d| j| jdd}| |}||d| j| jdd}|d ur|s|nd }|||| j
d|i\}}|rt|trd|j| j
< t||dd}| jdks| jd	kr4|j d |j d }}|d urtj|d tj|jd
dd}ntj|tj|jd
dd}tj|tj|jd
dd}|| }| || j d }|j|jd}| jdkrtd||}|| }n| jd	kr4td||}td||}|| | }|t | j }|d urE|| }t!j"j#|dd}| $|}|d ur[|| }t||}|%dddd& }|' d d | j(f }||}||fS )Nr*   r!   rh   Fr   Trf   rg   rN   r-   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r   ))shapero   viewri   rl   	transpose
isinstancer   
is_updatedgetru   cross_attention_cacheself_attention_cachelayerskeysvaluesrp   rq   updaterA   matmulr'   tensorrF   rO   rB   rs   r6   tor.   einsummathsqrtr   
functionalsoftmaxr>   permute
contiguousrE   rm   )rH   r}   r~   r   r   rx   r   r   
batch_sizerW   _query_layerr   is_cross_attentioncurr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shaperL   rL   rM   r[      s   








zCamembertSelfAttention.forwardNNNNNNFNr^   r_   r`   r0   r    rA   Tensorr   FloatTensorr   booltupler[   rb   rL   rL   rJ   rM   rc      s4    	rc   c                       s   e Zd Zd fdd	Zedddd						dd	ejd
eej deej deej dee	 dee
 deej deej f fddZ  ZS )CamembertSdpaSelfAttentionNc                    s   t  j|||d |j| _d S Nr'   ru   )r/   r0   rr   dropout_probrv   rJ   rL   rM   r0     s   z#CamembertSdpaSelfAttention.__init__rw   rx   ry   rz   Fr}   r~   r   r   r   r   r   c              	      s  | j dks|s|d urtd t |||||||S | \}}	}
| ||d| j| j	
dd}d}|d u}|r>|n|}|d ur\t|trZ|j| j}|rV|j}n|j}n|}|r`|n|}|ry|d ury|ry|j| j j}|j| j j}nF| ||d| j| j	
dd}| ||d| j| j	
dd}|d ur|s|nd }|||| jd|i\}}|rt|trd|j| j< | jo| o|d u o|	dk}tjjj||||| jr| jnd	|d
}|
dd}|||	| j }|d fS )Nr(   a  CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r*   r!   rh   Fr   T        )	attn_mask	dropout_p	is_causal)!r'   loggerwarning_oncer/   r[   rE   ro   r   ri   rl   r   r   r   r   r   ru   r   r   r   r   r   rp   rq   r   rt   rA   r   r   scaled_dot_product_attentiontrainingr   reshaperm   )rH   r}   r~   r   r   rx   r   r   bsztgt_lenr   r   r   r   r   r   r   r   r   attn_outputrJ   rL   rM   r[     sr   
 
	z"CamembertSdpaSelfAttention.forwardr   r   r   rL   rL   rJ   rM   r     s4    	r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )CamembertSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr%   )r/   r0   r   rn   r3   denser:   r;   r<   r=   r>   rG   rJ   rL   rM   r0   }     
zCamembertSelfOutput.__init__r}   input_tensorr   c                 C   &   |  |}| |}| || }|S Nr   r>   r:   rH   r}   r   rL   rL   rM   r[        

zCamembertSelfOutput.forwardr^   r_   r`   r0   rA   r   r[   rb   rL   rL   rJ   rM   r   |      $r   )eagersdpac                       s   e Zd Zd fdd	Zdd Zedddd						
	ddejdeej	 deej	 deej	 dee
 dee deej deej fddZ  ZS )CamembertAttentionNc                    s6   t    t|j |||d| _t|| _t | _d S r   )	r/   r0    CAMEMBERT_SELF_ATTENTION_CLASSES_attn_implementationrH   r   outputsetpruned_headsrv   rJ   rL   rM   r0     s   

zCamembertAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r!   r   )lenr   rH   ri   rl   r   r   ro   rp   rq   r   r   rm   union)rH   headsindexrL   rL   rM   prune_heads  s   zCamembertAttention.prune_headsrw   rx   ry   rz   Fr}   r~   r   r   r   r   r   c              	   C   s>   | j |||||||d}| |d |}	|	f|dd   }
|
S )Nr~   r   r   rx   r   r   r   r!   )rH   r   )rH   r}   r~   r   r   rx   r   r   self_outputsattention_outputoutputsrL   rL   rM   r[     s   	zCamembertAttention.forwardr   r   )r^   r_   r`   r0   r   r    rA   r   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   r     s6    
	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )CamembertIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r/   r0   r   rn   r3   intermediate_sizer   r   
hidden_actstrr	   intermediate_act_fnrG   rJ   rL   rM   r0     s
   
zCamembertIntermediate.__init__r}   r   c                 C   s   |  |}| |}|S r   )r   r   )rH   r}   rL   rL   rM   r[     s   

zCamembertIntermediate.forwardr   rL   rL   rJ   rM   r     s    r   c                       r   )CamembertOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r/   r0   r   rn   r   r3   r   r:   r;   r<   r=   r>   rG   rJ   rL   rM   r0     r   zCamembertOutput.__init__r}   r   r   c                 C   r   r   r   r   rL   rL   rM   r[     r   zCamembertOutput.forwardr   rL   rL   rJ   rM   r     r   r   c                       s   e Zd Zd fdd	Zedddd							dd	ejd
eej deej deej deej dee	 dee
 deej deej fddZdd Z  ZS )CamembertLayerNc                    sx   t    |j| _d| _t||d| _|j| _|j| _| jr0| js(t|  dt|d|d| _	t
|| _t|| _d S )Nr!   ru   z> should be used as a decoder model if cross attention is addedr(   r   )r/   r0   chunk_size_feed_forwardseq_len_dimr   	attentionrt   add_cross_attentionrj   crossattentionr   intermediater   r   rH   rI   ru   rJ   rL   rM   r0     s   

zCamembertLayer.__init__rw   rx   ry   rz   Fr}   r~   r   r   encoder_attention_maskr   r   r   c	              	   C   s   | j ||||||d}	|	d }
|	dd  }| jrA|d urAt| ds)td|  d| j|
||||||d}|d }
||dd   }t| j| j| j|
}|f| }|S )N)r~   r   r   rx   r   r   r!   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   rt   rR   rj   r   r   feed_forward_chunkr   r   )rH   r}   r~   r   r   r   rx   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputrL   rL   rM   r[     s>   

	
zCamembertLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )rH   r   intermediate_outputr   rL   rL   rM   r   '  s   
z!CamembertLayer.feed_forward_chunkr   )NNNNNFN)r^   r_   r`   r0   r    rA   r   r   r   r   r   r   r[   r   rb   rL   rL   rJ   rM   r     s<    	
0r   c                       s   e Zd Zd fdd	Z										ddejdeej deej d	eej d
eej dee dee	 dee	 dee	 dee	 deej de
eej ef fddZ  ZS )CamembertEncoderNc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  |d qS )r   )r   ).0irI   rL   rM   
<listcomp>2  s    z-CamembertEncoder.__init__.<locals>.<listcomp>F)	r/   r0   rI   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   rJ   r   rM   r0   /  s   
 
zCamembertEncoder.__init__FTr}   r~   r   r   r   rx   	use_cacher   output_hidden_statesreturn_dictr   r   c                 C   s`  |	rdnd }|r
dnd }|r| j jrdnd }| jr%| jr%|r%td d}|r<| j jr<|d u r<tt| j dt| j d}|rQ| j jrQt	|t
rQtd t|}t| jD ]9\}}|	ra||f }|d uri|| nd }|||||||||d}|d }|r||d f }| j jr||d	 f }qV|	r||f }|
st
d
d |||||fD S t|||||dS )NrL   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)r   rx   r   r   r   r!   rh   c                 s   s    | ]	}|d ur|V  qd S r   rL   )r   vrL   rL   rM   	<genexpr>t  s    z+CamembertEncoder.forward.<locals>.<genexpr>)last_hidden_staterx   r}   
attentionscross_attentions)rI   r   r  r   r   r   rt   r   r   r   r   from_legacy_cache	enumerater  r   )rH   r}   r~   r   r   r   rx   r  r   r  r	  r   all_hidden_statesall_self_attentionsall_cross_attentionsr   layer_modulelayer_head_masklayer_outputsrL   rL   rM   r[   5  sl   


zCamembertEncoder.forwardr   )
NNNNNNFFTN)r^   r_   r`   r0   rA   r   r   r   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   r   .  sJ    		
r   c                       r   )CamembertPoolerc                    s*   t    t|j|j| _t | _d S r   )r/   r0   r   rn   r3   r   Tanh
activationrG   rJ   rL   rM   r0     s   
zCamembertPooler.__init__r}   r   c                 C   s(   |d d df }|  |}| |}|S Nr   )r   r  )rH   r}   first_token_tensorpooled_outputrL   rL   rM   r[     s   

zCamembertPooler.forwardr   rL   rL   rJ   rM   r    s    r  c                   @   s*   e Zd ZU eed< dZdZdZdd ZdS )CamembertPreTrainedModelrI   robertaTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS t |tre|jj	  dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   rn   weightdatanormal_rI   initializer_rangebiaszero_r1   r$   r:   fill_CamembertLMHead)rH   modulerL   rL   rM   _init_weights  s    


z&CamembertPreTrainedModel._init_weightsN)	r^   r_   r`   r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_sdpar*  rL   rL   rL   rM   r    s   
 r  c                       s(   e Zd ZdZ fddZdd Z  ZS )CamembertClassificationHeadz-Head for sentence-level classification tasks.c                    sT   t    t|j|j| _|jd ur|jn|j}t|| _	t|j|j
| _d S r   )r/   r0   r   rn   r3   r   classifier_dropoutr=   r<   r>   
num_labelsout_projrH   rI   r0  rJ   rL   rM   r0     s   
z$CamembertClassificationHead.__init__c                 K   sL   |d d dd d f }|  |}| |}t|}|  |}| |}|S r  )r>   r   rA   tanhr2  rH   featureskwargsxrL   rL   rM   r[     s   




z#CamembertClassificationHead.forward)r^   r_   r`   ra   r0   r[   rb   rL   rL   rJ   rM   r/    s    	r/  c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r(  z,Camembert Head for masked language modeling.c                    sd   t    t|j|j| _tj|j|jd| _t|j|j	| _
tt|j	| _| j| j
_d S r   )r/   r0   r   rn   r3   r   r:   r;   
layer_normr2   decoder	ParameterrA   rD   r%  rG   rJ   rL   rM   r0     s   
zCamembertLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r   )r   r
   r9  r:  r5  rL   rL   rM   r[     s
   


zCamembertLMHead.forwardc                 C   s,   | j jjjdkr| j| j _d S | j j| _d S )Nmeta)r:  r%  rO   typerH   rL   rL   rM   _tie_weights  s   zCamembertLMHead._tie_weights)r^   r_   r`   ra   r0   r[   r?  rb   rL   rL   rJ   rM   r(    s
    	
r(  c                "       s   e Zd ZdZg Zd fdd	Zdd Zdd Zd	d
 Ze															dde
ej de
ej de
ej de
ej de
ej de
ej de
ej de
ej de
e de
e de
e de
e de
e de
ej deeej ef fddZ  ZS )CamembertModela1  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
    `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762

    Tc                    sT   t  | || _t|| _t|| _|rt|nd| _|j	| _
|j| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r/   r0   rI   r#   rZ   r   encoderr  poolerr   attn_implementationr'   	post_init)rH   rI   add_pooling_layerrJ   rL   rM   r0     s   

zCamembertModel.__init__c                 C      | j jS r   rZ   r5   r>  rL   rL   rM   get_input_embeddings     z#CamembertModel.get_input_embeddingsc                 C      || j _d S r   rG  )rH   rq   rL   rL   rM   set_input_embeddings     z#CamembertModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrA  r  r   r   )rH   heads_to_pruner  r   rL   rL   rM   _prune_heads  s   zCamembertModel._prune_headsNrS   r~   r,   r)   r   rT   r   r   rx   r  r   r  r	  r   r   c           !      C   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j jr-|
d ur(|
n| j j}
nd}
|d ur;|d ur;td|d urJ| || | }n|d urW| d d }ntd|\}}|d urf|j	n|j	}d}|	d urt
|	ts}|	d d jd n|	 }|d u rt| jdr| jjd d d |f }|||}|}n	tj|tj|d}| j|||||d	}|d u rtj||| f|d
}| jdko| jdko|d u o| }|r| dkr| j jrt||||}nt||j|d}n| ||}| j jr0|d ur0| \}}}||f}|d u rtj||d
}|r*| dkr*t||j|d}n| |}nd }| || j j}| j ||||||	|
||||d}|d }| j!d urY| !|nd } |sh|| f|dd   S t"|| |j#|j$|j%|j&dS )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer*   z5You have to specify either input_ids or inputs_embedsr   r   r,   rN   )rS   r)   r,   rT   rU   )rO   r   r(   rh   )r   )
r~   r   r   r   rx   r  r   r  r	  r   r!   )r  pooler_outputrx   r}   r  r  )'rI   r   r  use_return_dictrt   r  rj   %warn_if_padding_and_no_attention_maskrE   rO   r   r   r   get_seq_lengthrR   rZ   r,   rC   rA   rD   rF   onesrC  r'   r   r   r   r.   get_extended_attention_maskinvert_attention_maskget_head_maskr  rA  rB  r   rx   r}   r  r  )!rH   rS   r~   r,   r)   r   rT   r   r   rx   r  r   r  r	  r   rV   r   rW   rO   rU   rX   rY   embedding_outputuse_sdpa_attention_masksextended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputssequence_outputr  rL   rL   rM   r[     s   


zCamembertModel.forward)TNNNNNNNNNNNNNN)r^   r_   r`   ra   _no_split_modulesr0   rH  rK  rO  r   r   rA   r   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   r@    sj    	
r@  c                       s   e Zd ZddgZ fddZdd Zdd Ze																								dd
ee	j
 dee	j dee	j
 dee	j
 dee	j dee	j dee	j dee	j dee	j
 dee dee dee deee	j ef fddZ  ZS )CamembertForMaskedLMlm_head.decoder.weightlm_head.decoder.biasc                    s@   t  | |jrtd t|dd| _t|| _| 	  d S )NzpIf you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.FrE  
r/   r0   rt   r   warningr@  r  r(  lm_headrD  rG   rJ   rL   rM   r0     s   
zCamembertForMaskedLM.__init__c                 C   rF  r   ri  r:  r>  rL   rL   rM   get_output_embeddings  rI  z*CamembertForMaskedLM.get_output_embeddingsc                 C   rJ  r   rj  rH   new_embeddingsrL   rL   rM   set_output_embeddings  rL  z*CamembertForMaskedLM.set_output_embeddingsNrS   r~   r,   r)   r   rT   r   r   labelsr   r  r	  r   c                 C   s   |dur|n| j j}| j|||||||||
||d}|d }| |}d}|	dur@|	|j}	t }||d| j j|	d}|sV|f|dd  }|durT|f| S |S t	|||j
|jdS )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        N)
r~   r,   r)   r   rT   r   r   r   r  r	  r   r*   rh   losslogitsr}   r  )rI   rQ  r  ri  r   rO   r   r   r2   r   r}   r  )rH   rS   r~   r,   r)   r   rT   r   r   ro  r   r  r	  r   r`  prediction_scoresmasked_lm_lossloss_fctr   rL   rL   rM   r[     s<   
zCamembertForMaskedLM.forward)NNNNNNNNNNNN)r^   r_   r`   _tied_weights_keysr0   rk  rn  r   r   rA   
LongTensorr   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   rc    sZ    	
rc  z
    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eej ef fddZ  ZS )"CamembertForSequenceClassificationc                    s>   t  | |j| _|| _t|dd| _t|| _|   d S NFrf  )	r/   r0   r1  rI   r@  r  r/  
classifierrD  rG   rJ   rL   rM   r0     s   
z+CamembertForSequenceClassification.__init__NrS   r~   r,   r)   r   rT   ro  r   r  r	  r   c                 C   st  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur||j}| j jdu rW| jdkr=d| j _n| jdkrS|jt	j
ksN|jt	jkrSd| j _nd| j _| j jdkrut }| jdkro|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|d	d  }|dur|f| S |S t|||j|jd
S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr~   r,   r)   r   rT   r   r  r	  r   r!   
regressionsingle_label_classificationmulti_label_classificationr*   rh   rp  )rI   rQ  r  r|  r   rO   problem_typer1  r.   rA   rF   rk   r   squeezer   r   r   r   r}   r  rH   rS   r~   r,   r)   r   rT   ro  r   r  r	  r   r`  rr  rq  ru  r   rL   rL   rM   r[   (  sV   


"


z*CamembertForSequenceClassification.forward
NNNNNNNNNN)r^   r_   r`   r0   r   r   rA   rw  r   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   rz    sH    	
rz  c                       s   e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eej ef fddZ  ZS )CamembertForMultipleChoicec                    s@   t  | t|| _t|j| _t|j	d| _
|   d S )Nr!   )r/   r0   r@  r  r   r<   r=   r>   rn   r3   r|  rD  rG   rJ   rL   rM   r0   }  s
   
z#CamembertForMultipleChoice.__init__NrS   r,   r~   ro  r)   r   rT   r   r  r	  r   c                 C   sz  |
dur|
n| j j}
|dur|jd n|jd }|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durR|d|dnd}|dure|d|d|dnd}| j||||||||	|
d	}|d }| |}| |}|d|}d}|dur||j	}t
 }|||}|
s|f|dd  }|dur|f| S |S t|||j|jdS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr!   r*   r   )r)   r,   r~   r   rT   r   r  r	  rh   rp  )rI   rQ  r   r   rE   r  r>   r|  r   rO   r   r   r}   r  )rH   rS   r,   r~   ro  r)   r   rT   r   r  r	  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   r  rr  reshaped_logitsrq  ru  r   rL   rL   rM   r[     sN   -


z"CamembertForMultipleChoice.forwardr  )r^   r_   r`   r0   r   r   rA   rw  r   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   r  z  sH    
	
r  c                       ry  )CamembertForTokenClassificationc                    sb   t  | |j| _t|dd| _|jd ur|jn|j}t|| _	t
|j|j| _|   d S r{  )r/   r0   r1  r@  r  r0  r=   r   r<   r>   rn   r3   r|  rD  r3  rJ   rL   rM   r0     s   z(CamembertForTokenClassification.__init__NrS   r~   r,   r)   r   rT   ro  r   r  r	  r   c                 C   s   |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}| |}d}|durB||j}t }||d| j	|d}|
sX|f|dd  }|durV|f| S |S t
|||j|jdS )a-  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr}  r   r*   rh   rp  )rI   rQ  r  r>   r|  r   rO   r   r   r1  r   r}   r  r  rL   rL   rM   r[     s:   

z'CamembertForTokenClassification.forwardr  )r^   r_   r`   r0   r   r   rA   rw  r   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   r    sH    	
r  c                       s   e Zd Z fddZe											ddeej deej deej deej deej d	eej d
eej deej dee	 dee	 dee	 de
eej ef fddZ  ZS )CamembertForQuestionAnsweringc                    s@   t  | |j| _t|dd| _t|j|j| _| 	  d S r{  )
r/   r0   r1  r@  r  r   rn   r3   
qa_outputsrD  rG   rJ   rL   rM   r0   :  s
   z&CamembertForQuestionAnswering.__init__NrS   r~   r,   r)   r   rT   start_positionsend_positionsr   r  r	  r   c                 C   sH  |dur|n| j j}| j|||||||	|
|d	}|d }| |}|jddd\}}|d }|d }d}|dur|durt| dkrO|d}t| dkr\|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|dd  }|dur|f| S |S t||||j|jd	S )
a[  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        Nr}  r   r!   r*   r   )ignore_indexrh   )rq  start_logits
end_logitsr}   r  )rI   rQ  r  r  splitr  r   r   rE   clampr   r   r}   r  )rH   rS   r~   r,   r)   r   rT   r  r  r   r  r	  r   r`  rr  r  r  
total_lossignored_indexru  
start_lossend_lossr   rL   rL   rM   r[   D  sP   






z%CamembertForQuestionAnswering.forward)NNNNNNNNNNN)r^   r_   r`   r0   r   r   rA   rw  r   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   r  7  sN    
	
r  zU
    CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    c                "       s   e Zd ZddgZ fddZdd Zdd Ze																												dd
ee	j
 dee	j dee	j
 dee	j
 dee	j dee	j dee	j dee	j dee	j
 dee dee dee dee dee deee	j ef fddZ  ZS )CamembertForCausalLMrd  re  c                    s@   t  | |jstd t|dd| _t|| _| 	  d S )NzQIf you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`Frf  rg  rG   rJ   rL   rM   r0     s   

zCamembertForCausalLM.__init__c                 C   rF  r   rj  r>  rL   rL   rM   rk    rI  z*CamembertForCausalLM.get_output_embeddingsc                 C   rJ  r   rj  rl  rL   rL   rM   rn    rL  z*CamembertForCausalLM.set_output_embeddingsNrS   r~   r,   r)   r   rT   r   r   ro  rx   r  r   r  r	  r   c                 K   s   |dur|n| j j}|	durd}| j|||||||||
||||d}|d }| |}d}|	durE|	|j}	| j||	fd| j ji|}|s[|f|dd  }|durY|f| S |S t|||j	|j
|j|jdS )aq  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the value in this tensor should be always < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
        >>> config.is_decoder = True
        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NF)r~   r,   r)   r   rT   r   r   rx   r  r   r  r	  r   r2   rh   )rq  rr  rx   r}   r  r  )rI   rQ  r  ri  r   rO   loss_functionr2   r   rx   r}   r  r  )rH   rS   r~   r,   r)   r   rT   r   r   ro  rx   r  r   r  r	  r7  r   r`  rs  lm_lossr   rL   rL   rM   r[     sT   2
zCamembertForCausalLM.forwardra  )r^   r_   r`   rv  r0   rk  rn  r   r   rA   rw  r   r   r   r   r   r   r   r[   rb   rL   rL   rJ   rM   r    sf    	
r  c                 C   s6   |  | }tj|dd|| | }| | S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r!   r   )nerk   rA   cumsumtype_asrF   )rS   r$   rU   maskincremental_indicesrL   rL   rM   rP     s   rP   )r  rc  r  r  rz  r  r@  r  )r   )Jra   r   typingr   r   rA   r   torch.nnr   r   r   activationsr	   r
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   utils.deprecationr    configuration_camembertr"   
get_loggerr^   r   Moduler#   rc   r   r   r   r   r   r   r   r   r  r  r/  r(  r@  rc  rz  r  r  r  r  rP   __all__rL   rL   rL   rM   <module>   sr   (

Z i7G[ M\^iPX
x