o
    wiB                     @   s  d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZmZmZmZm Z m!Z! ddl"m#Z# e$e%Z&G dd dej'Z(G dd de Z)dd Z*d)ddZ+G dd deZ,G dd deZ-G dd deZ.G d d! d!eZ/G d"d# d#eZ0G d$d% d%eeZ1G d&d' d'eZ2g d(Z3dS )*zPyTorch Cohere model.    )CallableOptionalUnionN)nn   )Cache)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)dynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)
LossKwargslogging   )LlamaAttentionLlamaForCausalLMLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbeddingeager_attention_forward   )CohereConfigc                       s&   e Zd Zd fdd	Zdd Z  ZS )	CohereLayerNormNh㈵>Fc                    s&   t    tt|| _|| _dS )zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeepsbias	__class__ f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/cohere/modular_cohere.pyr   7   s   

zCohereLayerNorm.__init__c                 C   sl   |j }|tj}|jddd}|| djddd}|| t|| j  }| jtj| }||S )NT)keepdimr   )	dtypetor    float32meanpowrsqrtr#   r"   )r$   hidden_statesinput_dtyper1   variancer*   r*   r+   forward=   s   
zCohereLayerNorm.forward)Nr   F)__name__
__module____qualname__r   r7   __classcell__r*   r*   r(   r+   r   6   s    r   c                   @   s    e Zd Ze edd ZdS )CohereRotaryEmbeddingc           
      C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtr2|jjdkr2|jjnd}tj	|dd* | |  
dd}tj|ddd	}| | j }| | j }	W d    n1 shw   Y  |j|jd
|	j|jd
fS )Nr   r,   r   mpscpuF)device_typeenabledr   dimr.   )inv_freqfloatexpandshape
isinstancedevicetypestrr    autocast	transposerepeat_interleavecosattention_scalingsinr/   r.   )
r$   xposition_idsinv_freq_expandedposition_ids_expandedr?   freqsembrO   rQ   r*   r*   r+   r7   H   s   (&zCohereRotaryEmbedding.forwardN)r8   r9   r:   r    no_gradr   r7   r*   r*   r*   r+   r<   G   s    r<   c                 C   sB   | dd d df }| ddd df }t j| |gddd}|S )N.r   r   r,   rA   )r    stackflatten)rR   x1x2rot_xr*   r*   r+   rotate_halfX   s   r_   c           	      C   sj   | j }|  } | }||}||}| | t| |  }|| t||  }|j|d|j|dfS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rC   )r.   rE   	unsqueezer_   r/   )	qkrO   rQ   rS   unsqueeze_dimr.   q_embedk_embedr*   r*   r+   apply_rotary_pos_emb`   s   

rf   c                       s   e Zd Z fddZ  ZS )	CohereMLPc                    sR   t  | tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NF)r'   )	r   r   r   Linearr%   intermediate_size	gate_projup_proj	down_projr$   configr(   r*   r+   r      s   zCohereMLP.__init__)r8   r9   r:   r   r;   r*   r*   r(   r+   rg   ~   s    rg   c                       s   e Zd ZdZddedee f fddZ		ddej	de
ej	ej	f d	eej	 d
ee deej dee de
ej	eej	 ee
ej	  f fddZ  ZS )CohereAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrn   	layer_idxc                    sT   t  || |j| _| jr(t|j| jf|jd| _t|j| jf|jd| _	d S d S )Nr%   r&   )
r   r   use_qk_normr   num_attention_headshead_dimlayer_norm_epsq_normnum_key_value_headsk_normr$   rn   rp   r(   r*   r+   r      s   zCohereAttention.__init__r4   position_embeddingsattention_maskpast_key_valuecache_positionkwargsreturnc                 K   sJ  |j d d }g |d| jR }| ||}	| ||}
| ||}| jr6| |	}	| |
}
|		dd}	|
	dd}
|	dd}|\}}t
|	|
||\}	}
|d urj|||d}||
|| j|\}
}t}| jjdkrxt| jj }|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )Nr,   r   r   )rQ   rO   r}   eager        )dropoutscaling)rG   rt   q_projviewk_projv_projrr   rv   rx   rM   rf   updaterp   r   rn   _attn_implementationr   trainingattention_dropoutr   reshape
contiguouso_proj)r$   r4   rz   r{   r|   r}   r~   input_shapehidden_shapequery_states
key_statesvalue_statesrO   rQ   cache_kwargsattention_interfaceattn_outputattn_weightsr*   r*   r+   r7      sD   	



zCohereAttention.forwardN)NN)r8   r9   r:   __doc__r   r   intr   r    Tensortupler   
LongTensorr   r   r7   r;   r*   r*   r(   r+   ro      s(    ro   c                       s   e Zd Zdedef fddZ							ddejdeej d	eej	 d
ee
 dee dee deej	 deeejejf  dee deejeeejejf  f fddZ  ZS )CohereDecoderLayerrn   rp   c                    s@   t    |j| _t||d| _t|| _t|j|jd| _	d S )N)rn   rp   rq   )
r   r   r%   ro   	self_attnrg   mlpr   ru   input_layernormry   r(   r*   r+   r      s
   

zCohereDecoderLayer.__init__NFr4   r{   rS   r|   output_attentions	use_cacher}   rz   r~   r   c	                 K   sb   |}
|  |}| jd||||||||d|	\}}| |}|
| | }|f}|r/||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r4   r{   rS   r|   r   r   r}   rz   Nr*   )r   r   r   )r$   r4   r{   rS   r|   r   r   r}   rz   r~   residualhidden_states_attentionself_attn_weightshidden_states_mlpoutputsr*   r*   r+   r7      s(   
	


zCohereDecoderLayer.forward)NNNFFNN)r8   r9   r:   r   r   r   r    r   r   r   r   boolr   r   r   FloatTensorr7   r;   r*   r*   r(   r+   r      s<    
	
r   c                   @   s   e Zd Zdd ZdS )CoherePreTrainedModelc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|trQ|jjd d S d S )Nr   )r1   stdg      ?)rn   initializer_rangerH   r   rh   r"   datanormal_r'   zero_	Embeddingpadding_idxr   fill_)r$   moduler   r*   r*   r+   _init_weights  s   


z#CoherePreTrainedModel._init_weightsN)r8   r9   r:   r   r*   r*   r*   r+   r     s    r   c                       s"   e Zd Zdef fddZ  ZS )CohereModelrn   c                    sN   t    t fddt jD | _t d| _t	 j
 jd| _d S )Nc                    s   g | ]}t  |qS r*   )r   ).0rp   rn   r*   r+   
<listcomp>!  s    z(CohereModel.__init__.<locals>.<listcomp>r   rq   )r   r   r   
ModuleListrangenum_hidden_layerslayersr<   
rotary_embr   r%   ru   normrm   r(   r   r+   r     s   zCohereModel.__init__)r8   r9   r:   r   r   r;   r*   r*   r(   r+   r     s    r   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r8   r9   r:   r*   r*   r*   r+   r   '  s    r   c                       s   e Zd Z fddZ											ddeej deej deej deee	e
ej f  d	eej d
eej dee dee dee deej deeejf dee defddZ  ZS )CohereForCausalLMc                    s*   t  | t|| _|j| _|j| _d S r   )r   r   r   modellogit_scaletie_word_embeddingsrm   r(   r*   r+   r   +  s   
zCohereForCausalLM.__init__Nr   	input_idsr{   rS   past_key_valuesinputs_embedslabelsr   r   output_hidden_statesr}   logits_to_keepr~   r   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }|| j	 }d}|dur]| j
d||| j jd|}t|||j|j|jdS )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, CohereForCausalLM

        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r   r{   rS   r   r   r   r   r   r}   )logitsr   
vocab_size)lossr   r   r4   
attentionsr*   )rn   r   r   r   last_hidden_staterH   r   slicelm_headr   loss_functionr   r   r   r4   r   )r$   r   r{   rS   r   r   r   r   r   r   r}   r   r~   r   r4   slice_indicesr   r   r*   r*   r+   r7   1  s<   %

zCohereForCausalLM.forward)NNNNNNNNNNr   )r8   r9   r:   r   r   r    r   r   r   r   listr   r   r   r   r   r   r7   r;   r*   r*   r(   r+   r   *  sP    	
r   )r   r   r   )Nr   )4r   typingr   r   r   r    torch.utils.checkpointr   cache_utilsr   modeling_flash_attention_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   r   r   configuration_coherer   
get_loggerr8   loggerModuler   r<   r_   rf   rg   ro   r   r   r   r   r   __all__r*   r*   r*   r+   <module>   s8   $	

CE
R