o
    	۷i=?                     @   sf  d Z ddlmZmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z# e$e%Z&G dd dej'Z(G dd de Z)dd Z*d&ddZ+G dd deZ,G dd deZ-G dd  d eZ.G d!d" d"eZ/G d#d$ d$eZ0g d%Z1dS )'zPyTorch Cohere model.    )CallableOptionalUnionN)nn   )Cache)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)dynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )LlamaAttentionLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forward   )CohereConfigc                       s&   e Zd Zd fdd	Zdd Z  ZS )	CohereLayerNormNh㈵>Fc                    s&   t    tt|| _|| _dS )zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeepsbias	__class__ _/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/cohere/modular_cohere.pyr   6   s   

zCohereLayerNorm.__init__c                 C   sl   |j }|tj}|jddd}|| djddd}|| t|| j  }| jtj| }||S )NT)keepdimr   )	dtypetor    float32meanpowrsqrtr#   r"   )r$   hidden_statesinput_dtyper1   variancer*   r*   r+   forward<   s   
zCohereLayerNorm.forward)Nr   F)__name__
__module____qualname__r   r7   __classcell__r*   r*   r(   r+   r   5   s    r   c                   @   s    e Zd Ze edd ZdS )CohereRotaryEmbeddingc           
      C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtr2|jjdkr2|jjnd}tj	|dd* | |  
dd}tj|ddd	}| | j }| | j }	W d    n1 shw   Y  |j|jd
|	j|jd
fS )Nr   r,   r   mpscpuF)device_typeenabledr   dimr.   )inv_freqfloatexpandshape
isinstancedevicetypestrr    autocast	transposerepeat_interleavecosattention_scalingsinr/   r.   )
r$   xposition_idsinv_freq_expandedposition_ids_expandedr?   freqsembrO   rQ   r*   r*   r+   r7   G   s   (&zCohereRotaryEmbedding.forwardN)r8   r9   r:   r    no_gradr   r7   r*   r*   r*   r+   r<   F   s    r<   c                 C   sB   | dd d df }| ddd df }t j| |gddd}|S )N.r   r   r,   rA   )r    stackflatten)rR   x1x2rot_xr*   r*   r+   rotate_halfW   s   r_   c           	      C   sj   | j }|  } | }||}||}| | t| |  }|| t||  }|j|d|j|dfS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rC   )r.   rE   	unsqueezer_   r/   )	qkrO   rQ   rS   unsqueeze_dimr.   q_embedk_embedr*   r*   r+   apply_rotary_pos_emb_   s   

rf   c                       s   e Zd Z fddZ  ZS )	CohereMLPc                    sR   t  | tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NF)r'   )	r   r   r   Linearr%   intermediate_size	gate_projup_proj	down_projr$   configr(   r*   r+   r   ~   s   zCohereMLP.__init__)r8   r9   r:   r   r;   r*   r*   r(   r+   rg   }   s    rg   c                       s   e Zd ZdZddedee f fddZeddd	d
		dde	j
dee	j
e	j
f dee	j
 dee dee	j dee dee	j
ee	j
 f fddZ  ZS )CohereAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrn   	layer_idxc                    sT   t  || |j| _| jr(t|j| jf|jd| _t|j| jf|jd| _	d S d S )Nr%   r&   )
r   r   use_qk_normr   num_attention_headshead_dimlayer_norm_epsq_normnum_key_value_headsk_normr$   rn   rp   r(   r*   r+   r      s   zCohereAttention.__init__past_key_valuepast_key_values4.58new_nameversionr4   position_embeddingsattention_maskcache_positionkwargsreturnc                 K   sJ  |j d d }g |d| jR }| ||}	| ||}
| ||}| jr6| |	}	| |
}
|		dd}	|
	dd}
|	dd}|\}}t
|	|
||\}	}
|d urj|||d}||
|| j|\}
}t}| jjdkrxt| jj }|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )Nr,   r   r   )rQ   rO   r   eagerg        )dropoutscaling)rG   rt   q_projviewk_projv_projrr   rv   rx   rM   rf   updaterp   r   rn   _attn_implementationr   trainingattention_dropoutr   reshape
contiguouso_proj)r$   r4   r   r   r{   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesrO   rQ   cache_kwargsattention_interfaceattn_outputattn_weightsr*   r*   r+   r7      sD   




zCohereAttention.forwardN)NN)r8   r9   r:   __doc__r   r   intr   r   r    Tensortupler   
LongTensorr   r   r7   r;   r*   r*   r(   r+   ro      s*    ro   c                       s   e Zd Zdedef fddZedddd							
				ddejde	ej de	ej
 de	e de	e de	ej
 de	eejejf  dee deeje	eejejf  f fddZ  ZS )CohereDecoderLayerrn   rp   c                    s@   t    |j| _t||d| _t|| _t|j|jd| _	d S )N)rn   rp   rq   )
r   r   r%   ro   	self_attnrg   mlpr   ru   input_layernormry   r(   r*   r+   r      s
   

zCohereDecoderLayer.__init__rz   r{   r|   r}   NFr4   r   rS   	use_cacher   r   r   r   c              
   K   sL   |}	|  |}| jd|||||||d|\}
}| |}|	|
 | }|S )ar  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r4   r   rS   r{   r   r   r   Nr*   )r   r   r   )r$   r4   r   rS   r{   r   r   r   r   residualhidden_states_attention_hidden_states_mlpr*   r*   r+   r7      s    


zCohereDecoderLayer.forward)NNNFNN)r8   r9   r:   r   r   r   r   r    r   r   r   r   boolr   r   r   FloatTensorr7   r;   r*   r*   r(   r+   r      s8    	
r   c                       s"   e Zd Zdef fddZ  ZS )CohereModelrn   c                    sN   t    t fddt jD | _t d| _t	 j
 jd| _d S )Nc                    s   g | ]}t  |qS r*   )r   ).0rp   rn   r*   r+   
<listcomp>  s    z(CohereModel.__init__.<locals>.<listcomp>r   rq   )r   r   r   
ModuleListrangenum_hidden_layerslayersr<   
rotary_embr   r%   ru   normrm   r(   r   r+   r     s   zCohereModel.__init__)r8   r9   r:   r   r   r;   r*   r*   r(   r+   r     s    r   c                       s   e Zd Z fddZ											ddeej deej deej deee	e
ej f  d	eej d
eej dee dee dee deej deeejf dee defddZ  ZS )CohereForCausalLMc                    s*   t  | t|| _|j| _|j| _d S r   )r   r   r   modellogit_scaletie_word_embeddingsrm   r(   r*   r+   r     s   
zCohereForCausalLM.__init__Nr   	input_idsr   rS   r{   inputs_embedslabelsr   output_attentionsoutput_hidden_statesr   logits_to_keepr   r   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }|| j	 }d}|dur]| j
d||| j jd|}t|||j|j|jdS )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, CohereForCausalLM

        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r   r   rS   r{   r   r   r   r   r   )logitsr   
vocab_size)lossr   r{   r4   
attentionsr*   )rn   r   r   r   last_hidden_staterH   r   slicelm_headr   loss_functionr   r   r{   r4   r   )r$   r   r   rS   r{   r   r   r   r   r   r   r   r   outputsr4   slice_indicesr   r   r*   r*   r+   r7     s<   %

zCohereForCausalLM.forward)NNNNNNNNNNr   )r8   r9   r:   r   r   r    r   r   r   r   listr   r   r   r   r   r   r7   r;   r*   r*   r(   r+   r     sP    	
r   )r   r   CoherePreTrainedModel)Nr   )2r   typingr   r   r   r    r   cache_utilsr   modeling_flash_attention_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   utils.deprecationr   llama.modeling_llamar   r   r   r   r   r   configuration_coherer   
get_loggerr8   loggerModuler   r<   r_   rf   rg   ro   r   r   r   __all__r*   r*   r*   r+   <module>   s4    

D:
R