"""PyTorch CamemBERT model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, get_torch_version, logging
from .configuration_camembert import CamembertConfig


logger = logging.get_logger(__name__)


class CamembertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with the TensorFlow variable name, so that
        # TensorFlow checkpoints can still be loaded
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Default token_type_ids to the registered all-zeros buffer. This helps users trace the
        # model without passing token_type_ids explicitly.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
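
        Example (illustrative; assumes a `config` in scope with `pad_token_id=1`, the value used by
        the released CamemBERT checkpoints, so positions start at 2):

        ```python
        >>> embeddings = CamembertEmbeddings(config)
        >>> embeddings.create_position_ids_from_inputs_embeds(torch.zeros(1, 4, config.hidden_size))
        tensor([[2, 3, 4, 5]])
        ```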
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
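

# Shape convention for the attention classes below (an illustrative note, not part of the upstream
# module): `transpose_for_scores` reshapes the projected hidden states into per-head views so that
# the attention matmuls run batched over heads. For example, with hidden_size=768 and
# num_attention_heads=12 (head size 64):
#
#     [batch, seq, 768] --view--> [batch, seq, 12, 64] --permute--> [batch, 12, seq, 64]
#
# The raw scores `query @ key.transpose(-1, -2) / sqrt(64)` then have shape [batch, 12, seq, seq].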
class CamembertSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys and values come from an
        # encoder; the attention mask needs to be such that the encoder's padding tokens are not
        # attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k, v from the cross-attention cache
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        use_cache = past_key_value is not None
        if self.is_decoder:
            # Save the cross-attention or uni-directional self-attention key/value states so that
            # later decoding steps can reuse them.
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the CamembertModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might seem a bit unusual,
        # but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if requested
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
eeeej   dee	 deej f fddZ
  ZS )CamembertSdpaSelfAttentionNc                    s4   t  j||d |j| _tt tdk | _d S )Nr%   z2.2.0)r-   r.   rp   dropout_probr   parser   require_contiguous_qkvrs   rH   rJ   rK   r.     s   z#CamembertSdpaSelfAttention.__init__Frz   r{   r|   r}   r~   r   r   ru   c              	      s  | j dks|s|d urtd t |||||||S | \}}	}
| | |}|d u}|r3|n|}|r9|n|}|rP|rP|d jd |jd krP|\}}n,| | 	|}| | 
|}|d ur||s|tj|d |gdd}tj|d |gdd}| jr||f}| jr|jjdkr|d ur| }| }| }| jr|s|d u r|	dkrdnd	}tjjj||||| jr| jnd
|d}|dd}|||	| j}|f}| jr||f }|S )Nr&   a  CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   rf   r   r   cudaTF        )	attn_mask	dropout_p	is_causal)r%   loggerwarning_oncer-   rY   rC   ry   rm   r   rn   ro   r?   r   rr   r   rM   typer   r   r   scaled_dot_product_attentiontrainingr   r   reshaperk   )rF   rz   r{   r|   r}   r~   r   r   bsztgt_len_r   r   current_statesr   r   r   attn_outputr   rH   rJ   rK   rY     s^   

 
 	
z"CamembertSdpaSelfAttention.forwardr   r   )r\   r]   r^   r.   r?   r   r   r   r   r   rY   r`   rJ   rJ   rH   rK   r     s2    		r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )CamembertSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr#   )r-   r.   r   rl   r1   denser8   r9   r:   r;   r<   rE   rH   rJ   rK   r.   {     
zCamembertSelfOutput.__init__rz   input_tensorru   c                 C   &   |  |}| |}| || }|S r   r   r<   r8   rF   rz   r   rJ   rJ   rK   rY        

zCamembertSelfOutput.forwardr\   r]   r^   r.   r?   r   rY   r`   rJ   rJ   rH   rK   r   z      $r   )eagersdpac                       s   e Zd Zd fdd	Zdd Z						ddejdeej d	eej d
eej deej dee	e	ej   dee
 de	ej fddZ  ZS )CamembertAttentionNc                    s4   t    t|j ||d| _t|| _t | _d S )Nr   )	r-   r.    CAMEMBERT_SELF_ATTENTION_CLASSES_attn_implementationrF   r   outputsetpruned_headsrs   rH   rJ   rK   r.     s   

zCamembertAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )lenr   rF   rg   rj   r   r   rm   rn   ro   r   r   rk   union)rF   headsindexrJ   rJ   rK   prune_heads  s   zCamembertAttention.prune_headsFrz   r{   r|   r}   r~   r   r   ru   c              	   C   s<   |  |||||||}| |d |}	|	f|dd   }
|
S )Nr   r   )rF   r   )rF   rz   r{   r|   r}   r~   r   r   self_outputsattention_outputr   rJ   rJ   rK   rY     s   
	zCamembertAttention.forwardr   r   )r\   r]   r^   r.   r   r?   r   r   r   r   r   rY   r`   rJ   rJ   rH   rK   r     s4    	r   c                       2   e Zd Z fddZdejdejfddZ  ZS )CamembertIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r-   r.   r   rl   r1   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnrE   rH   rJ   rK   r.     s
   
zCamembertIntermediate.__init__rz   ru   c                 C   s   |  |}| |}|S r   )r   r   )rF   rz   rJ   rJ   rK   rY     s   

zCamembertIntermediate.forwardr   rJ   rJ   rH   rK   r     s    r   c                       r   )CamembertOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r-   r.   r   rl   r   r1   r   r8   r9   r:   r;   r<   rE   rH   rJ   rK   r.     r   zCamembertOutput.__init__rz   r   ru   c                 C   r   r   r   r   rJ   rJ   rK   rY     r   zCamembertOutput.forwardr   rJ   rJ   rH   rK   r     r   r   c                       s   e Zd Z fddZ						ddejdeej deej deej d	eej d
eeeej   dee	 deej fddZ
dd Z  ZS )CamembertLayerc                    sr   t    |j| _d| _t|| _|j| _|j| _| jr-| js&t|  dt|dd| _	t
|| _t|| _d S )Nr   z> should be used as a decoder model if cross attention is addedr&   r   )r-   r.   chunk_size_feed_forwardseq_len_dimr   	attentionrr   add_cross_attentionrh   crossattentionr   intermediater   r   rE   rH   rJ   rK   r.     s   


zCamembertLayer.__init__NFrz   r{   r|   r}   r~   r   r   ru   c              	   C   s  |d ur
|d d nd }| j |||||d}	|	d }
| jr(|	dd }|	d }n|	dd  }d }| jro|d urot| dsDtd|  d|d urN|d	d  nd }| |
||||||}|d }
||dd  }|d }|| }t| j| j| j|
}|f| }| jr||f }|S )
Nrf   )r   r   r   r   r(   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   rr   rP   rh   r   r   feed_forward_chunkr   r   )rF   rz   r{   r|   r}   r~   r   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputrJ   rJ   rK   rY     sP   


	

zCamembertLayer.forwardc                 C   s   |  |}| ||}|S r   )r   r   )rF   r   intermediate_outputr   rJ   rJ   rK   r   2  s   
z!CamembertLayer.feed_forward_chunkr   )r\   r]   r^   r.   r?   r   r   r   r   r   rY   r   r`   rJ   rJ   rH   rK   r     s4    	
Ar   c                       s   e Zd Z fddZ									ddejdeej deej d	eej d
eej deeeej   dee	 dee	 dee	 dee	 de
eej ef fddZ  ZS )CamembertEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS rJ   )r   ).0r   rG   rJ   rK   
<listcomp>=  s    z-CamembertEncoder.__init__.<locals>.<listcomp>F)	r-   r.   rG   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrE   rH   r   rK   r.   :  s   
 
zCamembertEncoder.__init__NFTrz   r{   r|   r}   r~   past_key_valuesr   r   output_hidden_statesreturn_dictru   c              
   C   s8  |	rdnd }|r
dnd }|r| j jrdnd }| jr%| jr%|r%td d}|r)dnd }t| jD ]K\}}|	r;||f }|d urC|| nd }|d urM|| nd }||||||||d}|d }|rg||d f7 }|r{||d f }| j jr{||d f }q0|	r||f }|
std	d
 |||||fD S t	|||||dS )NrJ   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r~   r   r   r   r(   r   rf   c                 s   s    | ]	}|d ur|V  qd S r   rJ   )r   vrJ   rJ   rK   	<genexpr>v  s    z+CamembertEncoder.forward.<locals>.<genexpr>)last_hidden_stater   rz   
attentionscross_attentions)
rG   r   r   r   r   r   	enumerater   r   r   )rF   rz   r{   r|   r}   r~   r   r   r   r   r   all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputsrJ   rJ   rK   rY   @  sd   


zCamembertEncoder.forward)	NNNNNNFFT)r\   r]   r^   r.   r?   r   r   r   r   r   r   r   rY   r`   rJ   rJ   rH   rK   r   9  sD    		
r   c                       r   )CamembertPoolerc                    s*   t    t|j|j| _t | _d S r   )r-   r.   r   rl   r1   r   Tanh
activationrE   rH   rJ   rK   r.     s   
zCamembertPooler.__init__rz   ru   c                 C   s(   |d d df }|  |}| |}|S Nr   )r   r  )rF   rz   first_token_tensorpooled_outputrJ   rJ   rK   rY     s   

zCamembertPooler.forwardr   rJ   rJ   rH   rK   r    s    r  c                   @   s$   e Zd ZeZdZdZdZdd ZdS )CamembertPreTrainedModelrobertaTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS t |tre|jj	  dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   rl   weightdatanormal_rG   initializer_rangebiaszero_r/   r"   r8   fill_CamembertLMHead)rF   modulerJ   rJ   rK   _init_weights  s    


z&CamembertPreTrainedModel._init_weightsN)	r\   r]   r^   r    config_classbase_model_prefixsupports_gradient_checkpointing_supports_sdpar  rJ   rJ   rJ   rK   r    s    r  c                       s(   e Zd ZdZ fddZdd Z  ZS )CamembertClassificationHeadz-Head for sentence-level classification tasks.c                    sT   t    t|j|j| _|jd ur|jn|j}t|| _	t|j|j
| _d S r   )r-   r.   r   rl   r1   r   classifier_dropoutr;   r:   r<   
num_labelsout_projrF   rG   r%  rH   rJ   rK   r.     s   
z$CamembertClassificationHead.__init__c                 K   sL   |d d dd d f }|  |}| |}t|}|  |}| |}|S r  )r<   r   r?   tanhr'  rF   featureskwargsrt   rJ   rJ   rK   rY     s   




z#CamembertClassificationHead.forward)r\   r]   r^   r_   r.   rY   r`   rJ   rJ   rH   rK   r$    s    	r$  c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )r  z,Camembert Head for masked language modeling.c                    sd   t    t|j|j| _tj|j|jd| _t|j|j	| _
tt|j	| _| j| j
_d S r   )r-   r.   r   rl   r1   r   r8   r9   
layer_normr0   decoder	Parameterr?   rB   r  rE   rH   rJ   rK   r.     s   
zCamembertLMHead.__init__c                 K   s*   |  |}t|}| |}| |}|S r   )r   r   r-  r.  r*  rJ   rJ   rK   rY     s
   


zCamembertLMHead.forwardc                 C   s,   | j jjjdkr| j| j _d S | j j| _d S )Nmeta)r.  r  rM   r   rF   rJ   rJ   rK   _tie_weights  s   zCamembertLMHead._tie_weights)r\   r]   r^   r_   r.   rY   r2  r`   rJ   rJ   rH   rK   r    s
    	
r  c                        s   e Zd ZdZg Zd fdd	Zdd Zdd Zd	d
 Ze														dde
ej de
ej de
ej de
ej de
ej de
ej de
ej de
ej de
eej  de
e de
e de
e de
e deeej ef fddZ  ZS )CamembertModela1  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
    `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
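
    Example (illustrative usage; the checkpoint name follows the `CamembertForCausalLM` example
    further down, and the printed size assumes the base 768-hidden configuration):

    ```python
    >>> from transformers import AutoTokenizer, CamembertModel

    >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
    >>> model = CamembertModel.from_pretrained("almanach/camembert-base")
    >>> # passing attn_implementation="eager" to from_pretrained would force the manual
    >>> # attention path instead of SDPA
    >>> inputs = tokenizer("J'aime le camembert !", return_tensors="pt")
    >>> outputs = model(**inputs)
    >>> outputs.last_hidden_state.shape[-1]
    768
    ```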
    """

    _no_split_modules = []

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)r-   r.   rG   r!   rX   r   encoderr  poolerr   attn_implementationr%   	post_init)rF   rG   add_pooling_layerrH   rJ   rK   r.      s   

zCamembertModel.__init__c                 C      | j jS r   rX   r3   r1  rJ   rJ   rK   get_input_embeddings     z#CamembertModel.get_input_embeddingsc                 C      || j _d S r   r:  )rF   ro   rJ   rJ   rK   set_input_embeddings     z#CamembertModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
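
        Example (illustrative; layer and head indices are arbitrary — calls normally go through the
        public `PreTrainedModel.prune_heads`, which dispatches here):

        ```python
        >>> model.prune_heads({0: [0, 2], 5: [7]})  # drop heads 0 and 2 of layer 0, head 7 of layer 5
        ```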
        N)itemsr4  r   r   r   )rF   heads_to_pruner   r   rJ   rJ   rK   _prune_heads  s   zCamembertModel._prune_headsNrQ   r{   r*   r'   r|   rR   r}   r~   r   r   r   r   r   ru   c                  C   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j jr-|
d ur(|
n| j j}
nd}
|d ur;|d ur;td|d urJ| || | }n|d urW| d d }ntd|\}}|d urf|j	n|j	}|	d urv|	d d j
d nd}|d u rt| jdr| jjd d d |f }|||}|}n	tj|tj|d}| j|||||d	}|d u rtj||| f|d
}| jdko| jdko|d u o| }|r| dkr| j jrt||||}nt||j|d}n| ||}| j jr'|d ur'| \}}}||f}|d u rtj||d
}|r!| dkr!t||j|d}n| |}nd }| || j j}| j||||||	|
|||d
}|d }| jd urO| |nd }|s^||f|dd   S t|||j |j!|j"|j#dS )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer(   z5You have to specify either input_ids or inputs_embedsr   rf   r*   rL   )rQ   r'   r*   rR   rS   )rM   r   r&   )r   )	r{   r|   r}   r~   r   r   r   r   r   r   )r   pooler_outputr   rz   r  r  )$rG   r   r   use_return_dictrr   r   rh   %warn_if_padding_and_no_attention_maskrC   rM   r   rP   rX   r*   rA   r?   rB   rD   onesr6  r%   r   r   r   r,   get_extended_attention_maskinvert_attention_maskget_head_maskr   r4  r5  r   r   rz   r  r  ) rF   rQ   r{   r*   r'   r|   rR   r}   r~   r   r   r   r   r   rT   
batch_sizerU   rM   rS   rV   rW   embedding_outputuse_sdpa_attention_masksextended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputssequence_outputr  rJ   rJ   rK   rY   !  s   


zCamembertModel.forward)T)NNNNNNNNNNNNN)r\   r]   r^   r_   _no_split_modulesr.   r;  r>  rB  r   r   r?   r   listr   r   r   r   r   rY   r`   rJ   rJ   rH   rK   r3    sd    	
r3  c                       s   e Zd ZddgZ fddZdd Zdd Ze																								dd
ee	j
 dee	j dee	j
 dee	j
 dee	j dee	j dee	j dee	j dee	j
 dee dee dee deee	j ef fddZ  ZS )CamembertForMaskedLMlm_head.decoder.weightlm_head.decoder.biasc                    s@   t  | |jrtd t|dd| _t|| _| 	  d S )NzpIf you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr8  
r-   r.   rr   r   warningr3  r  r  lm_headr7  rE   rH   rJ   rK   r.     s   
zCamembertForMaskedLM.__init__c                 C   r9  r   r\  r.  r1  rJ   rJ   rK   get_output_embeddings  r<  z*CamembertForMaskedLM.get_output_embeddingsc                 C   r=  r   r]  rF   new_embeddingsrJ   rJ   rK   set_output_embeddings  r?  z*CamembertForMaskedLM.set_output_embeddingsNrQ   r{   r*   r'   r|   rR   r}   r~   labelsr   r   r   ru   c                 C   s   |dur|n| j j}| j|||||||||
||d}|d }| |}d}|	dur@|	|j}	t }||d| j j|	d}|sV|f|dd  }|durT|f| S |S t	|||j
|jdS )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
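
        Example (illustrative; the checkpoint and mask token follow the standard CamemBERT release):

        ```python
        >>> from transformers import AutoTokenizer, CamembertForMaskedLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> model = CamembertForMaskedLM.from_pretrained("almanach/camembert-base")

        >>> inputs = tokenizer("Le camembert est <mask> !", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits
        >>> mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_id = logits[0, mask_index].argmax(-1)  # most likely token for the masked slot
        ```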
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(prediction_scores.device)
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    pooled output) e.g. for GLUE tasks.
    )custom_introc                          e Zd Z fddZe										ddeej deej deej deej deej d	eej d
eej dee	 dee	 dee	 de
eej ef fddZ  ZS )"CamembertForSequenceClassificationc                    s>   t  | |j| _|| _t|dd| _t|| _|   d S NFrY  )	r-   r.   r&  rG   r3  r  r$  
classifierr7  rE   rH   rJ   rK   r.     s   
z+CamembertForSequenceClassification.__init__NrQ   r{   r*   r'   r|   rR   rb  r   r   r   ru   c                 C   st  |
dur|
n| j j}
| j||||||||	|
d	}|d }| |}d}|dur||j}| j jdu rW| jdkr=d| j _n| jdkrS|jt	j
ksN|jt	jkrSd| j _nd| j _| j jdkrut }| jdkro|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|d	d  }|dur|f| S |S t|||j|jd
S )a  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
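
        Example (an illustrative sketch of the multi-label branch; label values are assumptions):

        ```python
        >>> from transformers import AutoTokenizer, CamembertForSequenceClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> model = CamembertForSequenceClassification.from_pretrained(
        ...     "almanach/camembert-base", num_labels=3, problem_type="multi_label_classification"
        ... )
        >>> inputs = tokenizer("J'aime le camembert !", return_tensors="pt")
        >>> labels = torch.tensor([[1.0, 0.0, 1.0]])  # float multi-hot targets for BCEWithLogitsLoss
        >>> loss = model(**inputs, labels=labels).loss
        ```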
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class CamembertForMultipleChoice(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.roberta = CamembertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
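
        Example (illustrative; the prompt/choice pairing and checkpoint are assumptions — choices
        are flattened to `[batch * num_choices, seq_len]` internally and scored per choice):

        ```python
        >>> from transformers import AutoTokenizer, CamembertForMultipleChoice

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> model = CamembertForMultipleChoice.from_pretrained("almanach/camembert-base")

        >>> prompt = "Le camembert est"
        >>> choices = ["délicieux.", "terrible."]
        >>> enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
        >>> inputs = {k: v.unsqueeze(0) for k, v in enc.items()}  # add the num_choices batch dim
        >>> logits = model(**inputs).logits  # shape [1, 2], one score per choice
        ```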
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(reshaped_logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class CamembertForTokenClassification(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
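
        Example (illustrative; `num_labels` is an assumption for an untuned head):

        ```python
        >>> from transformers import AutoTokenizer, CamembertForTokenClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> model = CamembertForTokenClassification.from_pretrained("almanach/camembert-base", num_labels=5)

        >>> inputs = tokenizer("Jean habite à Paris.", return_tensors="pt")
        >>> predictions = model(**inputs).logits.argmax(-1)  # one class id per token
        ```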
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class CamembertForQuestionAnswering(CamembertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = CamembertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
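
        Example (illustrative greedy span decoding; a production pipeline searches the top
        start/end pairs instead of taking a single argmax each):

        ```python
        >>> from transformers import AutoTokenizer, CamembertForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> model = CamembertForQuestionAnswering.from_pretrained("almanach/camembert-base")

        >>> inputs = tokenizer("Où vit Jean ?", "Jean habite à Paris.", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> start = int(outputs.start_logits.argmax())
        >>> end = int(outputs.end_logits.argmax())
        >>> answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
        ```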
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    c                "       s  e Zd ZddgZ fddZdd Zdd Ze																												dd
ee	j
 dee	j dee	j
 dee	j
 dee	j dee	j dee	j dee	j dee	j
 deeee	j   dee dee dee dee deee	j ef fddZdd Z  ZS )CamembertForCausalLMrW  rX  c                    s@   t  | |jstd t|dd| _t|| _| 	  d S )NzQIf you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`FrY  rZ  rE   rH   rJ   rK   r.     s   

zCamembertForCausalLM.__init__c                 C   r9  r   r]  r1  rJ   rJ   rK   r^    r<  z*CamembertForCausalLM.get_output_embeddingsc                 C   r=  r   r]  r_  rJ   rJ   rK   ra    r?  z*CamembertForCausalLM.set_output_embeddingsNrQ   r{   r*   r'   r|   rR   r}   r~   rb  r   r   r   r   r   ru   c                 K   s   |dur|n| j j}|	durd}| j|||||||||
||||d}|d }| |}d}|	durE|	|j}	| j||	fd| j ji|}|s[|f|dd  }|durY|f| S |S t|||j	|j
|j|jdS )aq  
        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
            >= 2. All the values in this tensor should always be < type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
        >>> config.is_decoder = True
        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        lm_loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(prediction_scores.device)
            lm_loss = self.loss_function(
                prediction_scores,
                labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor

    Returns: torch.Tensor
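
    Example (illustrative; pad token id 1 matches the released CamemBERT checkpoints — padded
    slots keep `padding_idx` while real tokens count up from `padding_idx + 1`):

    ```python
    >>> ids = torch.tensor([[5, 120, 37, 1, 1]])  # 1 is the pad token
    >>> create_position_ids_from_input_ids(ids, padding_idx=1)
    tensor([[2, 3, 4, 1, 1]])
    ```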
    """
    # The casts and type-conversions here are carefully balanced to work with both ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = [
    "CamembertForCausalLM",
    "CamembertForMaskedLM",
    "CamembertForMultipleChoice",
    "CamembertForQuestionAnswering",
    "CamembertForSequenceClassification",
    "CamembertForTokenClassification",
    "CamembertModel",
    "CamembertPreTrainedModel",
]