o
    ei=                     @   s^  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZmZ ddl m!Z! e"e#Z$G dd dej%Z&G dd deZ'dd Z(d&ddZ)G dd deZ*G dd deZ+G dd  d e
Z,G d!d" d"eZ-G d#d$ d$eZ.g d%Z/dS )'zPyTorch Cohere model.    )CallableN)nn   )Cache)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)dynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)maybe_autocast   )LlamaAttentionLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forward   )CohereConfigc                       s&   e Zd Zd fdd	Zdd Z  ZS )	CohereLayerNormNh㈵>Fc                    s&   t    tt|| _|| _dS )zcThe hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dimN)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeepsbias	__class__ g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/cohere/modular_cohere.pyr   5   s   

zCohereLayerNorm.__init__c                 C   sl   |j }|tj}|jddd}|| djddd}|| t|| j  }| jtj| }||S )NT)keepdimr   )	dtypetor   float32meanpowrsqrtr!   r    )r"   hidden_statesinput_dtyper/   variancer(   r(   r)   forward;   s   
zCohereLayerNorm.forward)Nr   F)__name__
__module____qualname__r   r5   __classcell__r(   r(   r&   r)   r   4   s    r   c                   @   s    e Zd Ze edd ZdS )CohereRotaryEmbeddingc           
      C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtr2|jjdkr2|jjnd}t|dd* | |  	dd}t
j|ddd	}| | j }| | j }	W d    n1 sgw   Y  |j|jd
|	j|jd
fS )Nr   r*   r   mpscpuF)device_typeenabledr   dimr,   )inv_freqfloatexpandshape
isinstancedevicetypestrr   	transposer   repeat_interleavecosattention_scalingsinr-   r,   )
r"   xposition_idsinv_freq_expandedposition_ids_expandedr=   freqsembrL   rN   r(   r(   r)   r5   F   s   (&zCohereRotaryEmbedding.forwardN)r6   r7   r8   r   no_gradr
   r5   r(   r(   r(   r)   r:   E   s    r:   c                 C   sB   | dd d df }| ddd df }t j| |gddd}|S )N.r   r   r*   r?   )r   stackflatten)rO   x1x2rot_xr(   r(   r)   rotate_halfV   s   r\   c                 C   sj   | j }|  } | }||}||}| | t| |  }|| t||  }|j|d|j|dfS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rA   )r,   rC   	unsqueezer\   r-   )qkrL   rN   unsqueeze_dimr,   q_embedk_embedr(   r(   r)   apply_rotary_pos_emb^   s   

rc   c                       s   e Zd Z fddZ  ZS )	CohereMLPc                    sR   t  | tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NF)r%   )	r   r   r   Linearr#   intermediate_size	gate_projup_proj	down_projr"   configr&   r(   r)   r   {   s   zCohereMLP.__init__)r6   r7   r8   r   r9   r(   r(   r&   r)   rd   z   s    rd   c                       s   e Zd ZdZddededB f fddZ		ddejde	ejejf d	ejdB d
e
dB dejdB dee de	ejejdB f fddZ  ZS )CohereAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrk   	layer_idxc                    sT   t  || |j| _| jr(t|j| jf|jd| _t|j| jf|jd| _	d S d S )Nr#   r$   )
r   r   use_qk_normr   num_attention_headshead_dimlayer_norm_epsq_normnum_key_value_headsk_normr"   rk   rm   r&   r(   r)   r      s   zCohereAttention.__init__r2   position_embeddingsattention_maskpast_key_valuescache_positionkwargsreturnc                 K   s>  |j d d }g |d| jR }| ||}	| ||}
| ||}| jr6| |	}	| |
}
|		dd}	|
	dd}
|	dd}|\}}t
|	|
||\}	}
|d urj|||d}||
|| j|\}
}t| jjt}|| |	|
||f| js~dn| j| jd|\}}|jg |dR   }| |}||fS )Nr*   r   r   )rN   rL   rz   g        )dropoutscaling)rE   rq   q_projviewk_projv_projro   rs   ru   rJ   rc   updaterm   r   get_interfacerk   _attn_implementationr   trainingattention_dropoutr~   reshape
contiguouso_proj)r"   r2   rw   rx   ry   rz   r{   input_shapehidden_shapequery_states
key_statesvalue_statesrL   rN   cache_kwargsattention_interfaceattn_outputattn_weightsr(   r(   r)   r5      sD   	



zCohereAttention.forwardN)NN)r6   r7   r8   __doc__r   intr   r   Tensortupler   
LongTensorr   r   r5   r9   r(   r(   r&   r)   rl      s(    rl   c                       s   e Zd Zdedef fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee deejeejejf dB f fddZ  ZS )CohereDecoderLayerrk   rm   c                    s@   t    |j| _t||d| _t|| _t|j|jd| _	d S )N)rk   rm   rn   )
r   r   r#   rl   	self_attnrd   mlpr   rr   input_layernormrv   r&   r(   r)   r      s
   

zCohereDecoderLayer.__init__NFr2   rx   rP   ry   	use_cacherz   rw   r{   r|   c              
   K   sL   |}	|  |}| jd|||||||d|\}
}| |}|	|
 | }|S )ar  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r2   rx   rP   ry   r   rz   rw   Nr(   )r   r   r   )r"   r2   rx   rP   ry   r   rz   rw   r{   residualhidden_states_attention_hidden_states_mlpr(   r(   r)   r5      s    


zCohereDecoderLayer.forward)NNNFNN)r6   r7   r8   r   r   r   r   r   r   r   boolr   r   r   FloatTensorr5   r9   r(   r(   r&   r)   r      s6    
	
r   c                       s"   e Zd Zdef fddZ  ZS )CohereModelrk   c                    sB   t    t fddt jD | _t j j	d| _
d S )Nc                    s   g | ]}t  |qS r(   )r   ).0rm   rk   r(   r)   
<listcomp>  s    z(CohereModel.__init__.<locals>.<listcomp>rn   )r   r   r   
ModuleListrangenum_hidden_layerslayersr   r#   rr   normrj   r&   r   r)   r      s
   zCohereModel.__init__)r6   r7   r8   r   r   r9   r(   r(   r&   r)   r      s    r   c                       s   e Zd Z fddZ											ddejdB dejdB dejdB dedB d	ejdB d
ejdB de	dB de	dB de	dB dejdB de
ejB dee defddZ  ZS )CohereForCausalLMc                    s*   t  | t|| _|j| _|j| _d S r   )r   r   r   modellogit_scaletie_word_embeddingsrj   r&   r(   r)   r     s   
zCohereForCausalLM.__init__Nr   	input_idsrx   rP   ry   inputs_embedslabelsr   output_attentionsoutput_hidden_statesrz   logits_to_keepr{   r|   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }|| j	 }d}|dur]| j
d||| j jd|}t|||j|j|jdS )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, CohereForCausalLM

        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r   rx   rP   ry   r   r   r   r   rz   )logitsr   
vocab_size)lossr   ry   r2   
attentionsr(   )rk   r   r   r   last_hidden_staterF   r   slicelm_headr   loss_functionr   r	   ry   r2   r   )r"   r   rx   rP   ry   r   r   r   r   r   rz   r   r{   outputsr2   slice_indicesr   r   r(   r(   r)   r5     s<   %

zCohereForCausalLM.forward)NNNNNNNNNNr   )r6   r7   r8   r   r   r   r   r   r   r   r   r   r   r	   r5   r9   r(   r(   r&   r)   r     sP    	
r   )r   r   CoherePreTrainedModel)r   )0r   collections.abcr   r   r   cache_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r	   modeling_rope_utilsr
   modeling_utilsr   processing_utilsr   utilsr   r   utils.genericr   llama.modeling_llamar   r   r   r   r   r   configuration_coherer   
get_loggerr6   loggerModuler   r:   r\   rc   rd   rl   r   r   r   __all__r(   r(   r(   r)   <module>   s4    

C9	R