from typing import Any, Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from ...utils.import_utils import is_causal_conv1d_available
from .configuration_lfm2 import Lfm2Config


if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_fn, causal_conv1d_update = None, None


@use_kernel_forward_from_hub("RMSNorm")
class Lfm2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Lfm2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class Lfm2RotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor

    def __init__(self, config: Lfm2Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class Lfm2MLP(nn.Module):
    def __init__(self, config: Lfm2Config):
        super().__init__()
        intermediate_size = config.intermediate_size
        if config.block_auto_adjust_ff_dim:
            intermediate_size = int(2 * intermediate_size / 3)
            # custom dim factor multiplier
            if config.block_ffn_dim_multiplier is not None:
                intermediate_size = int(config.block_ffn_dim_multiplier * intermediate_size)
            intermediate_size = config.block_multiple_of * (
                (intermediate_size + config.block_multiple_of - 1) // config.block_multiple_of
            )
        self.w1 = nn.Linear(config.hidden_size, intermediate_size, bias=False)
        self.w3 = nn.Linear(config.hidden_size, intermediate_size, bias=False)
        self.w2 = nn.Linear(intermediate_size, config.hidden_size, bias=False)

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))


class Lfm2HybridConvCache:
    """
    Attention and conv cache for Lfm2.

    It stores the Key and Value states as a list of tensors, one for each layer.
    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
    """

    # Defaults; the per-instance values are populated in `__init__`.
    key_cache = None
    value_cache = None
    max_batch_size = None
    is_compileable = False

    def __init__(
        self,
        config: Lfm2Config,
        max_batch_size: int,
        dtype: torch.dtype = torch.float32,
        device: Union[torch.device, str, None] = None,
    ):
        self.key_cache = []
        self.value_cache = []
        self.max_batch_size = max_batch_size
        self.layer_types = config.layer_types
        self.first_attention_layer = self.layer_types.index("full_attention")
        self.conv_L_cache = config.conv_L_cache
        self._dtype = dtype

        self.conv_cache: list[torch.Tensor] = []
        device = torch.device(device) if device is not None else None

        for _ in range(config.num_hidden_layers):
            conv_state = torch.zeros(
                self.max_batch_size,
                config.hidden_size,
                self.conv_L_cache,
                dtype=self._dtype,
                device=device,
            )
            torch._dynamo.mark_static_address(conv_state)
            self.conv_cache.append(conv_state)

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        """
        if key_states is not None:
            if len(self.key_cache) <= layer_idx:
                # There may be skipped layers, fill them with empty tensors
                for _ in range(len(self.key_cache), layer_idx):
                    self.key_cache.append(torch.tensor([]))
                    self.value_cache.append(torch.tensor([]))
                self.key_cache.append(key_states)
                self.value_cache.append(value_states)
            elif not self.key_cache[layer_idx].numel():
                self.key_cache[layer_idx] = key_states
                self.value_cache[layer_idx] = value_states
            else:
                self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
                self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        for layer_idx in range(len(self.key_cache)):
            device = self.key_cache[layer_idx].device
            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))

            device = self.value_cache[layer_idx].device
            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))

            device = self.conv_cache[layer_idx].device
            self.conv_cache[layer_idx] = self.conv_cache[layer_idx].index_select(0, beam_idx.to(device))

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        layer_idx = self.first_attention_layer if self.layer_types[layer_idx] != "full_attention" else layer_idx
        if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx].numel() == 0:
            return 0
        return self.key_cache[layer_idx].shape[-2]

    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
        """
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        """
        full_mask_kv_offset = 0
        query_length = cache_position.shape[0]
        past_seen_tokens = self.get_seq_length()
        kv_length = query_length + past_seen_tokens
        return kv_length, full_mask_kv_offset

    def crop(self, max_length: int):
        """Crop the cache to the given length"""
        # Negative values are interpreted as an offset from the current length
        if max_length < 0:
            max_length = self.get_seq_length() - abs(max_length)
        if self.get_seq_length() <= max_length:
            return

        for idx in range(len(self.key_cache)):
            if self.key_cache[idx].numel():
                self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
                self.value_cache[idx] = self.value_cache[idx][..., :max_length, :]

    def __len__(self) -> int:
        return len(self.key_cache)

    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def reset(self):
        for layer_idx in range(len(self.conv_cache)):
            # In-place op keeps the static address registered with the compiler
            self.conv_cache[layer_idx].zero_()


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Lfm2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Lfm2Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.is_causal = True
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.out_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.q_layernorm = Lfm2RMSNorm(self.head_dim, eps=config.norm_eps)
        self.k_layernorm = Lfm2RMSNorm(self.head_dim, eps=config.norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # Query and key heads are RMS-normalized before the rotary embedding is applied
        query_states = self.q_layernorm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_layernorm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        output = self.out_proj(attn_output)
        return output, attn_weights


def apply_mask_to_padding_states(hidden_states, attention_mask):
    """
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    """
    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
        dtype = hidden_states.dtype
        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)

    return hidden_states


kernel_modules = (causal_conv1d_fn, causal_conv1d_update)
is_fast_path_available = all(kernel_modules)


class Lfm2ShortConv(nn.Module):
    def __init__(self, config: Lfm2Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.L_cache = config.conv_L_cache
        self.bias = config.conv_bias

        self.conv = nn.Conv1d(
            in_channels=config.hidden_size,
            out_channels=config.hidden_size,
            kernel_size=self.L_cache,
            groups=config.hidden_size,
            bias=self.bias,
            padding=self.L_cache - 1,
        )
        self.in_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=self.bias)
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=self.bias)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def cuda_kernels_forward(
        self,
        x: torch.Tensor,
        past_key_values: Optional[Lfm2HybridConvCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        x = apply_mask_to_padding_states(x, attention_mask)
        BCx = self.in_proj(x).transpose(-1, -2)
        B, C, x = BCx.chunk(3, dim=-2)

        Bx = B * x

        conv_weights = self.conv.weight.view(self.conv.weight.size(0), self.conv.weight.size(2))
        if past_key_values is not None and cache_position[0] > 0:
            # Single-token decoding step: update the rolling conv state in place
            conv_out = causal_conv1d_update(
                Bx.squeeze(-1),
                past_key_values.conv_cache[self.layer_idx],
                conv_weights,
                self.conv.bias,
                None,
            )
            conv_out = conv_out.unsqueeze(-1)
        else:
            if past_key_values is not None:
                conv_state = nn.functional.pad(Bx, (self.L_cache - Bx.shape[-1], 0))
                past_key_values.conv_cache[self.layer_idx].copy_(conv_state)

            conv_out = causal_conv1d_fn(Bx, conv_weights, self.conv.bias, activation=None)

        y = C * conv_out
        y = self.out_proj(y.transpose(-1, -2).contiguous())
        return y

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def slow_forward(
        self,
        x: torch.Tensor,
        past_key_values: Optional[Lfm2HybridConvCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        seqlen = x.shape[1]
        x = apply_mask_to_padding_states(x, attention_mask)
        BCx = self.in_proj(x).transpose(-1, -2)
        B, C, x = BCx.chunk(3, dim=-2)

        Bx = B * x

        if past_key_values is not None and cache_position[0] > 0:
            # Single-token decoding step without the fused kernel
            conv_state = past_key_values.conv_cache[self.layer_idx]
            cache_position = cache_position.clamp(0, self.L_cache - 1)
            conv_state = conv_state.roll(shifts=-1, dims=-1)
            conv_state[:, :, cache_position] = Bx.to(device=conv_state.device, dtype=conv_state.dtype)
            past_key_values.conv_cache[self.layer_idx].copy_(conv_state)

            conv_out = torch.sum(conv_state.to(Bx.device) * self.conv.weight[:, 0, :], dim=-1)
            if self.bias:
                conv_out += self.conv.bias
            conv_out = conv_out.unsqueeze(-1)
        else:
            if past_key_values is not None:
                conv_state = nn.functional.pad(Bx, (self.L_cache - Bx.shape[-1], 0))
                past_key_values.conv_cache[self.layer_idx].copy_(conv_state)

            conv_out = self.conv(Bx)[..., :seqlen]

        y = C * conv_out
        y = y.transpose(-1, -2).contiguous()
        y = self.out_proj(y)
        return y

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        past_key_values: Optional[Lfm2HybridConvCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        if is_fast_path_available and "cuda" in hidden_states.device.type and not torch._dynamo.is_compiling():
            return self.cuda_kernels_forward(hidden_states, past_key_values, cache_position, attention_mask)
        return self.slow_forward(hidden_states, past_key_values, cache_position, attention_mask)


class Lfm2DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Lfm2Config, layer_idx: int):
        super().__init__()
        self.is_attention_layer = config.layer_types[layer_idx] == "full_attention"

        if self.is_attention_layer:
            self.self_attn = Lfm2Attention(config, layer_idx)
        else:
            self.conv = Lfm2ShortConv(config, layer_idx)
        self.feed_forward = Lfm2MLP(config)
        self.operator_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)
        self.ffn_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        if self.is_attention_layer:
            hidden_states, _ = self.self_attn(
                hidden_states=self.operator_norm(hidden_states),
                position_embeddings=position_embeddings,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )
        else:
            hidden_states = self.conv(
                hidden_states=self.operator_norm(hidden_states),
                past_key_values=past_key_values,
                cache_position=cache_position,
                attention_mask=attention_mask,
            )
        hidden_states = residual + hidden_states
        hidden_states = hidden_states + self.feed_forward(self.ffn_norm(hidden_states))

        return hidden_states


@auto_docstring
class Lfm2PreTrainedModel(PreTrainedModel):
    config: Lfm2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Lfm2DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = False
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": Lfm2DecoderLayer,
        "attentions": Lfm2Attention,
    }


@auto_docstring
class Lfm2Model(Lfm2PreTrainedModel):
    def __init__(self, config: Lfm2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Lfm2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.rotary_emb = Lfm2RotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.pos_emb = Lfm2RotaryEmbedding(config)
        self.embedding_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Lfm2HybridConvCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            batch_size = inputs_embeds.shape[0]
            past_key_values = Lfm2HybridConvCache(
                config=self.config, max_batch_size=batch_size, dtype=self.dtype, device=self.device
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.pos_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.embedding_norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


@auto_docstring
class Lfm2ForCausalLM(Lfm2PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = Lfm2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Lfm2ForCausalLM

        >>> model = Lfm2ForCausalLM.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute the necessary logits
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["Lfm2ForCausalLM", "Lfm2Model", "Lfm2PreTrainedModel"]