o
    i_                     @   s  d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' G dd dej(Z)G dd dej(Z*G dd dej(Z+dej,de-dej,fddZ.	d9d ej(d!ej,d"ej,d#ej,d$eej, d%e/d&e/d'ee fd(d)Z0d*d+ Z1d:d,d-Z2G d.d/ d/ej(Z3G d0d1 d1eZ4e G d2d3 d3eZ5e G d4d5 d5e5Z6e G d6d7 d7e5eZ7g d8Z8dS );    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )CohereConfigc                       s&   e Zd Zd fdd	Zdd Z  ZS )	CohereLayerNormNh㈵>Fc                    s&   t    tt|| _|| _dS )zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeepsbias	__class__ g/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/cohere/modeling_cohere.pyr   4   s   

zCohereLayerNorm.__init__c                 C   sl   |j }|tj}|jddd}|| djddd}|| t|| j  }| jtj| }||S )NT)keepdim   )	dtypetor!   float32meanpowrsqrtr$   r#   )r%   hidden_statesinput_dtyper3   variancer+   r+   r,   forward:   s   
zCohereLayerNorm.forward)Nr   F__name__
__module____qualname__r   r9   __classcell__r+   r+   r)   r,   r   3   s    r   c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	CohereRotaryEmbeddinginv_freqNconfigc                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr@   F)
persistent)r   r   hasattr
isinstancerB   dictgetrC   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrA   r   rope_init_fnattention_scalingregister_bufferr@   original_inv_freq)r%   rA   devicer@   r)   r+   r,   r   G   s   
zCohereRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtr2|jjdkr2|jjnd}tj	|dd* | |  
dd}tj|ddd	}| | j }| | j }	W d    n1 shw   Y  |j|jd
|	j|jd
fS )Nr   r-   r   mpscpuF)device_typeenabledr/   dimr0   )r@   floatexpandshaperH   rR   rD   strr!   autocast	transposerepeat_interleavecosrO   sinr1   r0   )
r%   xposition_idsinv_freq_expandedposition_ids_expandedrU   freqsembra   rb   r+   r+   r,   r9   X   s   (&zCohereRotaryEmbedding.forwardN)r;   r<   r=   r!   Tensor__annotations__r   r   no_gradr   r9   r>   r+   r+   r)   r,   r?   D   s   
 
r?   c                       s$   e Zd Z fddZdd Z  ZS )	CohereMLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFr(   )r   r   rA   r&   intermediate_sizer   Linear	gate_projup_proj	down_projr   
hidden_actact_fnr%   rA   r)   r+   r,   r   i   s   
zCohereMLP.__init__c                 C   s$   |  | | || | }|S ri   )rt   rv   rr   rs   )r%   rc   rt   r+   r+   r,   r9   s   s    zCohereMLP.forwardr:   r+   r+   r)   r,   rm   h   s    
rm   r6   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r\   r[   reshape)r6   rx   batchnum_key_value_headsslenhead_dimr+   r+   r,   	repeat_kvx   s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr/   r   r-   )rX   r0   )ptrainingr   )r   num_key_value_groupsr!   matmulr_   r\   r   
functionalsoftmaxr2   r1   r0   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputr+   r+   r,   eager_attention_forward   s   
&r   c                 C   sB   | dd d df }| ddd df }t j| |gddd}|S )N.r/   r   r-   rW   r   )r!   stackflatten)rc   x1x2rot_xr+   r+   r,   rotate_half   s   r   c           	      C   sj   | j }|  } | }||}||}| | t| |  }|| t||  }|j|d|j|dfS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rY   )r0   rZ   	unsqueezer   r1   )	qkra   rb   rd   unsqueeze_dimr0   q_embedk_embedr+   r+   r,   apply_rotary_pos_emb   s   

r   c                       s   e Zd ZdZddedee f fddZeddd	d
		dde	j
dee	j
e	j
f dee	j
 dee dee	j dee dee	j
ee	j
 f fddZ  ZS )CohereAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrA   	layer_idxc                    s  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _|j| _| jrt|j| jf|jd| _t|j| jf|jd| _d S d S )Nr~   g      Tro   r&   r'   )r   r   rA   r   getattrr&   num_attention_headsr~   r|   r   r   attention_dropout	is_causalr   rq   attention_biasq_projk_projv_projo_projuse_qk_normr   layer_norm_epsq_normk_normr%   rA   r   r)   r+   r,   r      s:   
zCohereAttention.__init__past_key_valuepast_key_values4.58new_nameversionr6   position_embeddingsr   cache_positionr   ry   c                 K   sJ  |j d d }g |d| jR }| ||}	| ||}
| ||}| jr6| |	}	| |
}
|		dd}	|
	dd}
|	dd}|\}}t
|	|
||\}	}
|d urj|||d}||
|| j|\}
}t}| jjdkrxt| jj }|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )Nr-   r   r/   )rb   ra   r   eagerr   )r   r   )r\   r~   r   viewr   r   r   r   r   r_   r   updater   r   rA   _attn_implementationr   r   r   r   rz   r   r   )r%   r6   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   ra   rb   cache_kwargsattention_interfacer   r   r+   r+   r,   r9      sD   




zCohereAttention.forwardri   )NN)r;   r<   r=   __doc__r   r   intr   r   r!   rj   tupler   
LongTensorr   r   r9   r>   r+   r+   r)   r,   r      s*     r   c                       s   e Zd Zdedef fddZedddd							
				ddejde	ej de	ej
 de	e de	e de	ej
 de	eejejf  dee deeje	eejejf  f fddZ  ZS )CohereDecoderLayerrA   r   c                    s@   t    |j| _t||d| _t|| _t|j|jd| _	d S )N)rA   r   r   )
r   r   r&   r   	self_attnrm   mlpr   r   input_layernormr   r)   r+   r,   r     s
   

zCohereDecoderLayer.__init__r   r   r   r   NFr6   r   rd   	use_cacher   r   r   ry   c              
   K   sL   |}	|  |}| jd|||||||d|\}
}| |}|	|
 | }|S )ar  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r6   r   rd   r   r   r   r   Nr+   )r   r   r   )r%   r6   r   rd   r   r   r   r   r   residualhidden_states_attention_hidden_states_mlpr+   r+   r,   r9   $  s    


zCohereDecoderLayer.forward)NNNFNN)r;   r<   r=   r   r   r   r   r!   rj   r   r   r   boolr   r   r   FloatTensorr9   r>   r+   r+   r)   r,   r     s8    	
r   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )CoherePreTrainedModelrA   modelTr   r   )r6   
attentionsN)r;   r<   r=   r   rk   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr+   r+   r+   r,   r   V  s   
 
r   c                       s   e Zd Zdef fddZee							ddeej	 deej
 deej	 dee d	eej d
eej	 dee dee defddZ  ZS )CohereModelrA   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r+   )r   ).0r   rA   r+   r,   
<listcomp>r  s    z(CohereModel.__init__.<locals>.<listcomp>r   r   F)r   r   pad_token_idpadding_idx
vocab_sizer   	Embeddingr&   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   normr?   
rotary_embgradient_checkpointing	post_initrw   r)   r   r,   r   k  s   zCohereModel.__init__N	input_idsr   rd   r   inputs_embedsr   r   r   ry   c              	   K   s   |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|}| ||}| jd | jj D ]}||f|
||||d|}qb| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )rR   )rA   input_embedsr   r   r   rd   )r   rd   r   r   r   )last_hidden_stater   )
ValueErrorr   r	   rA   get_seq_lengthr!   aranger\   rR   r   r   r   r   r   r   r   )r%   r   r   rd   r   r   r   r   r   past_seen_tokensr   r6   r   decoder_layerr+   r+   r,   r9   {  sP   

	

zCohereModel.forward)NNNNNNN)r;   r<   r=   r   r   r   r   r   r!   r   rj   r   r   r   r   r   r   r9   r>   r+   r+   r)   r,   r   i  s<    	
r   c                       s   e Zd ZdgZddiZddgdgfiZ fddZee												dd
e	e
j de	e
j de	e
j de	eeee
j f  de	e
j de	e
j de	e de	e de	e de	e
j deee
jf dee defddZ  ZS )CohereForCausalLMzlm_head.weightlm_headcolwise_repr6   logitsc                    sP   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _
|   d S rn   )r   r   r   r   r   r   rq   r&   r   logit_scaletie_word_embeddingsr   rw   r)   r+   r,   r     s   
zCohereForCausalLM.__init__Nr   r   r   rd   r   r   labelsr   output_attentionsoutput_hidden_statesr   logits_to_keepr   ry   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }|| j	 }d}|dur]| j
d||| j jd|}t|||j|j|jdS )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, CohereForCausalLM

        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r   r   rd   r   r   r   r  r  r   )r   r  r   )lossr   r   r6   r   r+   )rA   r  r  r   r   rH   r   slicer   r   loss_functionr   r   r   r6   r   )r%   r   r   rd   r   r   r  r   r  r  r   r  r   outputsr6   slice_indicesr   r  r+   r+   r,   r9     s<   '

zCohereForCausalLM.forward)NNNNNNNNNNr   )r;   r<   r=   _tied_weights_keys_tp_plan_pp_planr   r   r   r   r!   r   rj   r   r   listr   r   r   r   r   r   r9   r>   r+   r+   r)   r,   r     sZ    	
r   )r   r   r   )r   )Nr   )9typingr   r   r   r!   r   activationsr   cache_utilsr   r	   
generationr
   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   configuration_coherer   Moduler   r?   rm   rj   r   r   rZ   r   r   r   r   r   r   r   r   __all__r+   r+   r+   r,   <module>   sd   $

X:N]