o
    eik                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( e$)e*Z+G dd dej,Z-dd Z.d6ddZ/G dd dej,Z0	d7dej,dej1d ej1d!ej1d"ej1dB d#e2d$e2fd%d&Z3G d'd( d(ej,Z4G d)d* d*eZ5e"G d+d, d,eZ6e"G d-d. d.e6Z7G d/d0 d0e6eZ8G d1d2 d2ee6Z9G d3d4 d4ee6Z:g d5Z;dS )8zPyTorch Persimmon model.    )Callable)OptionalN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)FlashAttentionKwargs) GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)maybe_autocast   )PersimmonConfigc                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )PersimmonRotaryEmbeddinginv_freqNconfigc                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr   devicerope_init_fnr   	__class__ n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/persimmon/modeling_persimmon.pyr$   ;   s   


z!PersimmonRotaryEmbedding.__init__r.   ztorch.deviceseq_lenreturnztorch.Tensorc           	      C   st   | j d }| j dd}t| ddp| j| j }t|| }d}d|tjd|dtjdj	|tj
d	|   }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr      dtype)r.   r;   )r(   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r   r.   r4   baser7   r8   dimattention_factorr   r2   r2   r3   r)   K   s   
&z8PersimmonRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r   mpscpuF)device_typeenabledr9   rG   r:   )r   rE   expandshaperD   r.   
isinstancetypestrr   	transposerA   catcosr*   sinr;   )
r-   xposition_idsinv_freq_expandedposition_ids_expandedrL   freqsembrV   rW   r2   r2   r3   forwardl   s   0&z PersimmonRotaryEmbedding.forwardN)NNN)__name__
__module____qualname__rA   Tensor__annotations__r   r$   staticmethodr   r@   tuplerE   r)   no_gradr   r^   __classcell__r2   r2   r0   r3   r   8   s&   
 

r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrI   r9   rN   )rP   rA   rU   )rX   x1x2r2   r2   r3   rotate_half}   s   rk   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerk   )qkrV   rW   unsqueeze_dimq_embedk_embedr2   r2   r3   apply_rotary_pos_emb   s
   

rr   c                       s$   e Zd Z fddZdd Z  ZS )PersimmonMLPc                    s>   t    t|j|j| _t|j|j| _t|j	 | _
d S r_   )r#   r$   r   Linearr>   intermediate_sizedense_h_to_4hdense_4h_to_hr   
hidden_actactr-   r   r0   r2   r3   r$      s   
zPersimmonMLP.__init__c                 C   s"   |  |}| |}| |}|S r_   )rv   ry   rw   )r-   hidden_statesr2   r2   r3   r^      s   


zPersimmonMLP.forward)r`   ra   rb   r$   r^   rh   r2   r2   r0   r3   rs      s    rs           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr9   r   rI   )rG   r;   )ptrainingr   )rA   matmulrT   r   
functionalsoftmaxfloat32rD   r;   r   r   
contiguous)
r}   r~   r   r   r   r   r   kwargsattn_weightsattn_outputr2   r2   r3   eager_attention_forward   s   
r   c                       s   e Zd ZdZddededB f fddZdejde	ejejejf fd	d
Z
							ddejdejdB dejdB dedB dededejdB de	ejejf dB dee de	ejejdB e	ej dB f fddZ  ZS )PersimmonAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr   	layer_idxc                    s4  t    || _|| _|d u rtd| jj d |j| _|j	| _
| j| j
 | _t| j|jd  | _d| _| j| j
 | jkrMtd| j d| j
 dtj| jd| j dd	| _tj| j
| j | jdd	| _|j| _| jd
 | _| jrtj|j| j
 |jdd| _tj|j| j
 |jdd| _t|j| _d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r7   Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r   biasg      )epselementwise_affine)r#   r$   r   r   loggerwarning_oncer1   r`   r>   r?   	num_headsr8   r@   r(   rotary_ndims	is_causal
ValueErrorr   rt   query_key_valuedenseqk_layernormr   	LayerNormlayer_norm_epsq_layernormk_layernormDropoutattention_dropoutr-   r   r   r0   r2   r3   r$      s<   

zPersimmonAttention.__init__	fused_qkvr5   c                 C   sV   |j \}}}|||| jd| j}|ddddf |ddddf |ddddf fS )a  
        Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
        storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
            value: [batch_size, seq_length, num_heads, head_dim]
        r   .r   Nr   r9   )rP   viewr   r8   )r-   r   
batch_size
seq_lengththree_times_hidden_sizer2   r2   r3   _split_heads   s   4zPersimmonAttention._split_headsFr{   r   rY   past_key_valuesoutput_attentions	use_cachecache_positionposition_embeddingsr   c	                 K   s  |  \}
}}| |}| |\}}}| jr!| |}| |}|dd}|dd}|dd}|\}}|dd | jf |d| jd f }}|dd | jf |d| jd f }}t||||\}}t	j
||fdd}t	j
||fdd}|d ur||| j|d}|||| j|\}}t| jjt}|| ||||f| jsdn| jj| jd|	\}}||
|d}| |}|sd }||fS )	Nr   r9   .rI   rN   )rW   rV   partial_rotation_sizer   r|   )r   r   )sizer   r   r   r   r   rT   r   rr   rA   rU   updater   r   get_interfacer   _attn_implementationr   r   r   r   reshaper   )r-   r{   r   rY   r   r   r   r   r   r   bszq_len_r   query_states
key_statesvalue_statesrV   rW   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfacer   r   r2   r2   r3   r^      s\   




zPersimmonAttention.forwardr_   NNNFFNN)r`   ra   rb   __doc__r   r@   r$   rA   rc   rf   r   
LongTensorr   boolr   r   r^   rh   r2   r2   r0   r3   r      s@    $%	
r   c                       s   e Zd Zdedef fddZ							ddejdejdB d	ejdB d
e	dB de
dB de
dB dejdB deejejf dB dee deejeejejf dB f fddZ  ZS )PersimmonDecoderLayerr   r   c                    sd   t    |j| _t||d| _t|| _tj|j|j	d| _
tj|j|j	d| _t|j| _d S )N)r   r   r   )r#   r$   r>   r   	self_attnrs   mlpr   r   r   input_layernormpost_attention_layernormr   hidden_dropoutr   r   r0   r2   r3   r$   K  s   

zPersimmonDecoderLayer.__init__NFr{   r   rY   r   r   r   r   r   r   r5   c	                 K   s~   |}
|  |}| jd||||||||d|	\}}|
| }|}
| |}| |}| |}||
 }|f}|r=||f7 }|S )an  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`.
                [What are position IDs?](../glossary#position-ids)
            past_key_values (`Cache`, *optional*):
                cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r{   r   rY   r   r   r   r   r   Nr2   )r   r   r   r   r   )r-   r{   r   rY   r   r   r   r   r   r   residualself_attn_weightsoutputsr2   r2   r3   r^   T  s0   $
	




zPersimmonDecoderLayer.forwardr   )r`   ra   rb   r   r@   r$   rA   rc   r   r   r   rf   r   r   FloatTensorr^   rh   r2   r2   r0   r3   r   J  s<    	
r   c                   @   s8   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZdZdS )PersimmonPreTrainedModelr   modelTr   r   N)r`   ra   rb   r   rd   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_sdpa_supports_flash_attn_supports_attention_backendr2   r2   r2   r3   r     s   
 r   c                       s   e Zd ZdZdef fddZee									ddej	dB dej
dB dej	dB d	edB d
ejdB dedB dedB dedB dej	dB dee defddZ  ZS )PersimmonModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PersimmonDecoderLayer`]

    Args:
        config: PersimmonConfig
    r   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _tj j jd| _t| jd| _d| _|   d S )Nc                    s   g | ]}t  |qS r2   )r   ).0r   r   r2   r3   
<listcomp>  s    z+PersimmonModel.__init__.<locals>.<listcomp>r   r   F)r#   r$   pad_token_idpadding_idx
vocab_sizer   	Embeddingr>   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   final_layernormr   r   
rotary_embgradient_checkpointing	post_initrz   r0   r   r3   r$     s   zPersimmonModel.__init__N	input_idsr   rY   r   inputs_embedsr   r   output_hidden_statesr   r   r5   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|rE|d u rEt	| j d}|d u rN| 
|}|	d u rj|d urZ| nd}tj|||jd  |jd}	|d u rs|	d}t| j |||	||d}|}| j||d	}|rd
nd }|rd
nd }| jD ]&}|r||f7 }||f||||||	|d|
}|d }|r||d f7 }q| |}|r||f7 }t||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   )r.   )r   r   r   r   r   rY   )rY   r2   )r   rY   r   r   r   r   r   )last_hidden_stater   r{   
attentions)r   r   r   r   r   r   r   r   r   r   r   get_seq_lengthrA   rB   rP   r.   rl   r
   r   r   r   r   )r-   r   r   rY   r   r   r   r   r   r   r   past_seen_tokenscausal_maskr{   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr2   r2   r3   r^     s   

	

	

zPersimmonModel.forward)	NNNNNNNNN)r`   ra   rb   r   r   r$   r   r   rA   r   rc   r   r   r   r   r   r   r^   rh   r2   r2   r0   r3   r     sJ    	
r   c                       s   e Zd ZddiZ fddZee											ddejdB dej	dB d	ejdB d
e
dB dejdB dejdB dedB dedB dedB dejdB deej	B defddZ  ZS )PersimmonForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S )NFr   )
r#   r$   r   r   r   r   rt   r>   lm_headr   rz   r0   r2   r3   r$   #  s
   
zPersimmonForCausalLM.__init__Nr   r   r   rY   r   r   labelsr   r   r   r   logits_to_keepr5   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }d}|durX| j	||fd| j j
i|}t|||j|j|jdS )uk  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PersimmonForCausalLM

        >>> model = PersimmonForCausalLM.from_pretrained("adept/persimmon-8b-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-base")

        >>> prompt = "human: Hey, what should I eat for dinner?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'human: Hey, what should I eat for dinner?\n\ncat: 🐱\n\nhuman: 😐\n\n'
        ```N)	r   r   rY   r   r   r   r   r   r   r   )losslogitsr   r{   r   r2   )r   r   r   r   r   rQ   r@   slicer   loss_functionr   r   r   r{   r   )r-   r   r   rY   r   r   r   r   r   r   r   r   r   r   r{   slice_indicesr   r   r2   r2   r3   r^   ,  sH   (
zPersimmonForCausalLM.forward)NNNNNNNNNNr   )r`   ra   rb   _tied_weights_keysr$   r   r   rA   r   rc   r   r   r   r@   r   r^   rh   r2   r2   r0   r3   r     sR    		
r   c                   @      e Zd ZdS )"PersimmonForSequenceClassificationNr`   ra   rb   r2   r2   r2   r3   r  ~      r  c                   @   r  )PersimmonForTokenClassificationNr  r2   r2   r2   r3   r    r  r  )r   r   r   r  r  )r   )r|   )<r   collections.abcr   typingr   rA   r   activationsr   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_layersr   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   configuration_persimmonr   
get_loggerr`   r   Moduler   rk   rr   rs   rc   rE   r   r   r   r   r   r   r  r  __all__r2   r2   r2   r3   <module>   sd   
E

 Pw_