o
    i                     @   s  d Z ddlmZmZmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) e$ rddl*m+Z+ ddl,m-Z- e%.e/Z0G dd dej1Z2dd Z3d7ddZ4G dd dej1Z5	d8dej1d ej6d!ej6d"ej6d#eej6 d$e7d%e7fd&d'Z8G d(d) d)ej1Z9G d*d+ d+eZ:e"G d,d- d-eZ;e"G d.d/ d/e;Z<G d0d1 d1e;eZ=G d2d3 d3ee;Z>G d4d5 d5ee;Z?g d6Z@dS )9zPyTorch Persimmon model.    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs) GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availablelogging)deprecate_kwarg   )PersimmonConfig)	BlockMask)make_flex_block_causal_maskc                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	PersimmonRotaryEmbeddinginv_freqNconfigc                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr!   F)
persistent)super__init__hasattr
isinstancer#   dictgetr$   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr"   r   rope_init_fnattention_scalingregister_bufferr!   original_inv_freq)selfr"   devicer!   	__class__ m/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/persimmon/modeling_persimmon.pyr)   >   s   
z!PersimmonRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r   mpscpuF)device_typeenabled   dim)dtype)r!   floatexpandshapetor6   r+   r%   strtorchautocast	transposecatcosr2   sinrC   )
r5   xposition_idsinv_freq_expandedposition_ids_expandedr>   freqsembrM   rN   r9   r9   r:   forwardO   s   0&z PersimmonRotaryEmbedding.forwardN)__name__
__module____qualname__rI   Tensor__annotations__r   r)   no_gradr   rU   __classcell__r9   r9   r7   r:   r    ;   s   
 
r    c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr;   r@   rA   )rF   rI   rL   )rO   x1x2r9   r9   r:   rotate_half`   s   r`   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer`   )qkrM   rN   rP   unsqueeze_dimq_embedk_embedr9   r9   r:   apply_rotary_pos_embh   s
   

rg   c                       s$   e Zd Z fddZdd Z  ZS )PersimmonMLPc                    s>   t    t|j|j| _t|j|j| _t|j	 | _
d S rV   )r(   r)   r   Linearhidden_sizeintermediate_sizedense_h_to_4hdense_4h_to_hr   
hidden_actactr5   r"   r7   r9   r:   r)      s   
zPersimmonMLP.__init__c                 C   s"   |  |}| |}| |}|S rV   )rl   ro   rm   )r5   hidden_statesr9   r9   r:   rU      s   


zPersimmonMLP.forward)rW   rX   rY   r)   rU   r]   r9   r9   r7   r:   rh      s    rh           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t ||dd| }|d ur'|d d d d d d d |jd f }	||	 }tjj|dt jd|j	}tjj
||| jd}t ||}
|
dd }
|
|fS )Nr@   r   r;   )rB   rC   )ptrainingr   )rI   matmulrK   rF   r   
functionalsoftmaxfloat32rG   rC   ry   r|   
contiguous)rs   rt   ru   rv   rw   rx   ry   kwargsattn_weightscausal_maskattn_outputr9   r9   r:   eager_attention_forward   s   
&r   c                       s   e Zd ZdZddedee f fddZdej	de
ej	ej	ej	f fd	d
Zedddd							ddej	deej	 deej dee dededeej dee
ej	ej	f  dee de
ej	eej	 ee
ej	  f fddZ  ZS )PersimmonAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr"   	layer_idxc                    sF  t    || _|| _|d u rtd| jj d |j| _|j	| _
| j| j
 | _|j| _t| j|j | _d| _| j| j
 | jkrOtd| j d| j
 dtj| jd| j dd| _tj| j
| j | jdd| _|j| _| jd	 | _| jrtj|j| j
 |jdd
| _tj|j| j
 |jdd
| _t|j| _t| jd| _d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).r   biasg      )epselementwise_affiner"   ) r(   r)   r"   r   loggerwarning_oncer8   rW   rj   num_attention_heads	num_headshead_dim
rope_thetaintpartial_rotary_factorrotary_ndims	is_causal
ValueErrorr   ri   query_key_valuedenseqk_layernormrx   	LayerNormlayer_norm_epsq_layernormk_layernormDropoutattention_dropoutr    
rotary_embr5   r"   r   r7   r9   r:   r)      s@   

zPersimmonAttention.__init__	fused_qkvreturnc                 C   sV   |j \}}}|||| jd| j}|ddddf |ddddf |ddddf fS )a  
        Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
        storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
            value: [batch_size, seq_length, num_heads, head_dim]
        r   .r   Nr   r@   )rF   viewr   r   )r5   r   
batch_size
seq_lengththree_times_hidden_sizer9   r9   r:   _split_heads   s   4zPersimmonAttention._split_headspast_key_valuepast_key_values4.58new_nameversionFrq   rw   rP   output_attentions	use_cachecache_positionposition_embeddingsr   c	                 K   s  |  \}
}}| |}| |\}}}| jr!| |}| |}|dd}|dd}|dd}|\}}|dd | jf |d| jd f }}|dd | jf |d| jd f }}t||||\}}t	j
||fdd}t	j
||fdd}|d ur||| j|d}|||| j|\}}t}| jjdkrt| jj }|| ||||f| jsdn| jj| jd	|	\}}||
|d}| |}|sd }||fS )
Nr   r@   .r;   rA   )rN   rM   partial_rotation_sizer   eagerrr   )ry   rx   )sizer   r   r   r   r   rK   r   rg   rI   rL   updater   r   r"   _attn_implementationr   r|   r   rx   reshaper   )r5   rq   rw   rP   r   r   r   r   r   r   bszq_len_r   query_states
key_statesvalue_statesrM   rN   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfacer   r   r9   r9   r:   rU      s\   




zPersimmonAttention.forwardrV   NNNFFNN)rW   rX   rY   __doc__r   r   r   r)   rI   rZ   tupler   r   
LongTensorr   boolr   r   rU   r]   r9   r9   r7   r:   r      sB    $&	
r   c                       s   e Zd Zdedef fddZedddd							
	
				ddejde	ej de	ej
 de	e de	e de	e de	ej
 de	eejejf  dee deeje	eejejf  f fddZ  ZS )PersimmonDecoderLayerr"   r   c                    sd   t    |j| _t||d| _t|| _tj|j|j	d| _
tj|j|j	d| _t|j| _d S )N)r"   r   r   )r(   r)   rj   r   	self_attnrh   mlpr   r   r   input_layernormpost_attention_layernormr   hidden_dropoutry   r   r7   r9   r:   r)   5  s   

zPersimmonDecoderLayer.__init__r   r   r   r   NFrq   rw   rP   r   r   r   r   r   r   c	                 K   s~   |}
|  |}| jd||||||||d|	\}}|
| }|}
| |}| |}| |}||
 }|f}|r=||f7 }|S )an  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`.
                [What are position IDs?](../glossary#position-ids)
            past_key_values (`Cache`, *optional*):
                cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )rq   rw   rP   r   r   r   r   r   Nr9   )r   r   r   r   ry   )r5   rq   rw   rP   r   r   r   r   r   r   residualself_attn_weightsoutputsr9   r9   r:   rU   >  s0   %
	




zPersimmonDecoderLayer.forwardr   )rW   rX   rY   r   r   r)   r   rI   rZ   r   r   r   r   r   r   r   FloatTensorrU   r]   r9   r9   r7   r:   r   4  s>    		
r   c                   @   s@   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZdZdd ZdS )	PersimmonPreTrainedModelr"   modelTr   r   c                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|tjrX|jjd |jj	  d S d S )Nrr   )meanstdg      ?)r"   initializer_ranger+   r   ri   weightdatanormal_r   zero_	Embeddingpadding_idxr   fill_)r5   rs   r   r9   r9   r:   _init_weights  s   

z&PersimmonPreTrainedModel._init_weightsN)rW   rX   rY   r   r[   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_sdpa_supports_flash_attn_supports_attention_backendr   r9   r9   r9   r:   r     s   
 r   c                       s  e Zd ZdZdef fddZee									ddee	j
 dee	j dee	j
 d	ee d
ee	j dee dee dee dee	j
 dee defddZ	ddee	jdf de	jde	jd	edef
ddZede	jdedede	jde	jdefddZ  ZS ) PersimmonModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PersimmonDecoderLayer`]

    Args:
        config: PersimmonConfig
    r"   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _tj j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r9   )r   ).0r   r   r9   r:   
<listcomp>  s    z+PersimmonModel.__init__.<locals>.<listcomp>r   r   F)r(   r)   pad_token_idr   
vocab_sizer   r   rj   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   final_layernormr    r   gradient_checkpointing	post_initrp   r7   r   r:   r)     s   zPersimmonModel.__init__N	input_idsrw   rP   r   inputs_embedsr   r   output_hidden_statesr   r   r   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|rE|d u rEt	| j d}|d u rN| 
|}|	d u rj|d urZ| nd}tj|||jd  |jd}	|d u rs|	d}| |||	||}|}| ||}|rdnd }|rdnd }| jD ]&}|r||f7 }||f||||||	|d	|
}|d }|r||d f7 }q| |}|r||f7 }t||||d
S )Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   r6   r9   )rw   rP   r   r   r   r   r   )last_hidden_stater   rq   
attentions)r"   r   r   r   r   r   r|   r   r   r	   r   get_seq_lengthrI   arangerF   r6   ra   _update_causal_maskr   r   r   r   )r5   r   rw   rP   r   r   r   r   r   r   r   past_seen_tokensr   rq   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr9   r9   r:   rU     sv   




	

zPersimmonModel.forwardFr   input_tensorc                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2rr   flex_attentionr   Fsdpa)r   past_key_values_lengthis_trainingr   r;   )sequence_lengthtarget_lengthrC   r   r   )cudaxpunpu)r"   r   anyr+   rI   rZ   r   r   is_compileabler   _ignore_causal_mask_sdpar|   rC   rF   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr6   r%   finfomin_unmask_unattended)r5   rw   r  r   r   r   r  using_compilable_cacherC   r  r  r   	min_dtyper9   r9   r:   r    sT   




z"PersimmonModel._update_causal_maskr  r  rC   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerC   r6   r   )diagonalr   r;   r   )rB   rI   r  r  fullr6   triur   r   rE   clonerF   rG   masked_fill)rw   r  r  rC   r   r   r   r   r  mask_lengthpadding_maskr9   r9   r:   r  \  s,    $
6  zDPersimmonModel._prepare_4d_causal_attention_mask_with_cache_position)	NNNNNNNNN)F)rW   rX   rY   r   r   r)   r   r   r   rI   r   rZ   r   r   r   r   r   r   rU   r   r  staticmethodr   rC   r  r]   r9   r9   r7   r:   r     s~    	
a
Dr   c                       s   e Zd ZdgZ fddZee											ddeej	 deej
 deej	 d	ee d
eej deej	 dee dee dee deej	 deeej
f defddZ  ZS )PersimmonForCausalLMzlm_head.weightc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S )NFr   )
r(   r)   r   r   r   r   ri   rj   lm_headr   rp   r7   r9   r:   r)     s
   
zPersimmonForCausalLM.__init__Nr   r   rw   rP   r   r   labelsr   r   r   r   logits_to_keepr   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| jd||||||||	|
d	|}|j}t|tr4t| dn|}| |dd|ddf }d}|durX| j	||fd| j j
i|}t|||j|j|jdS )uk  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PersimmonForCausalLM

        >>> model = PersimmonForCausalLM.from_pretrained("adept/persimmon-8b-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-base")

        >>> prompt = "human: Hey, what should I eat for dinner?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'human: Hey, what should I eat for dinner?\n\ncat: 🐱\n\nhuman: 😐\n\n'
        ```N)	r   rw   rP   r   r   r   r   r   r   r   )losslogitsr   rq   r   r9   )r"   r   r   r   r   r+   r   slicer'  loss_functionr   r   r   rq   r   )r5   r   rw   rP   r   r   r(  r   r   r   r   r)  r   r   rq   slice_indicesr+  r*  r9   r9   r:   rU     sH   (
zPersimmonForCausalLM.forward)NNNNNNNNNNr   )rW   rX   rY   _tied_weights_keysr)   r   r   r   rI   r   rZ   r   r   r   r   r   r   rU   r]   r9   r9   r7   r:   r&    sR    		
r&  c                   @      e Zd ZdS )"PersimmonForSequenceClassificationNrW   rX   rY   r9   r9   r9   r:   r1        r1  c                   @   r0  )PersimmonForTokenClassificationNr2  r9   r9   r9   r:   r4    r3  r4  )r&  r   r   r1  r4  )Nr   )rr   )Ar   typingr   r   r   rI   r   activationsr   cache_utilsr   r	   
generationr
   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   configuration_persimmonr   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerrW   r   Moduler    r`   rg   rh   rZ   rD   r   r   r   r   r   r&  r1  r4  __all__r9   r9   r9   r:   <module>   sj   
%

 Q t_