o
    ei                     @   s  d dl mZ d dlmZmZ d dlZd dlm  mZ	 d dlmZ ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z. e) rd dl/m0Z0m1Z1 nd\Z0Z1edG dd dej2Z3G dd dej2Z4G dd dej2Z5G dd dZ6d d! Z7ed"dDd#d$Z8d%ej9d&e:d'ej9fd(d)Z;	*dEd+ej2d,ej9d-ej9d.ej9d/ej9dB d0e<d1e<d2e e" fd3d4Z=ee8G d5d6 d6ej2Z>d7d8 Z?e0e1fZ@eAe@ZBG d9d: d:ej2ZCG d;d< d<eZDe#G d=d> d>eZEe#G d?d@ d@eEZFe#G dAdB dBeEeZGg dCZHdS )F    )Callable)AnyOptionalN)nn   )Cache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)is_causal_conv1d_availableis_torchdynamo_compiling)capture_outputs   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNRMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )Lfm2RMSNormư>epsreturnNc                    s&   t    tt|| _|| _dS )z:
        Lfm2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer%   	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/lfm2/modeling_lfm2.pyr(   3   s   

zLfm2RMSNorm.__init__hidden_statesc                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )N   T)keepdim)	dtypetor*   float32powmeanrsqrtr-   r,   )r.   r4   input_dtypevariancer2   r2   r3   forward;   s
   zLfm2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler,   shaper-   r.   r2   r2   r3   
extra_reprB   s   zLfm2RMSNorm.extra_repr)r$   )
__name__
__module____qualname__floatr(   r*   Tensorr@   rD   __classcell__r2   r2   r0   r3   r#   1   s    r#   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )Lfm2RotaryEmbeddinginv_freqNconfigc                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrL   F)
persistentoriginal_inv_freq)r'   r(   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrM   rope_parametersrN   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r.   rM   devicerope_init_fnrL   r0   r2   r3   r(   I   s   


zLfm2RotaryEmbedding.__init__rZ   ztorch.deviceseq_lenr&   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r5   r8   rZ   r8   )	rU   getattrr/   num_attention_headsr*   arangeint64r9   rH   )rM   rZ   r\   basedimattention_factorrL   r2   r2   r3   rV   Y   s   
&z3Lfm2RotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r6   r   mpscpuF)device_typeenabledr5   rf   r_   )rL   rH   expandrB   r9   rZ   
isinstancetypestrr   	transposer*   catcosrW   sinr8   )
r.   xposition_idsinv_freq_expandedposition_ids_expandedrj   freqsembrs   rt   r2   r2   r3   r@   w   s   0&zLfm2RotaryEmbedding.forwardNNNN)rE   rF   rG   r*   rI   __annotations__r   r(   staticmethodr   intrA   rH   rV   no_gradr   r@   rJ   r2   r2   r0   r3   rK   F   s&   
 

rK   c                       s*   e Zd Zdef fddZdd Z  ZS )Lfm2MLPrM   c                    s   t    |j}|jr,td| d }|jd ur,t|j| }|j||j d |j  }tj|j	|dd| _
tj|j	|dd| _tj||j	dd| _d S )Nr5   r   r   Fbias)r'   r(   intermediate_sizeblock_auto_adjust_ff_dimr   block_ffn_dim_multiplierblock_multiple_ofr   Linearr/   w1w3w2)r.   rM   r   r0   r2   r3   r(      s   

zLfm2MLP.__init__c                 C   s    |  t| || | S r{   )r   Fsilur   r   )r.   ru   r2   r2   r3   r@      s    zLfm2MLP.forward)rE   rF   rG   r   r(   r@   rJ   r2   r2   r0   r3   r      s    r   c                   @   s   e Zd ZdZdZdZdZdZej	dfde
dedejdejeB dB fdd	Z	d!d
ejdejdedeeef dB deejejf f
ddZdejfddZd"dedB defddZdejdedeeef fddZdefddZdefddZdd  ZdS )#Lfm2HybridConvCachea  
    Attention and conv cache for Lfm2.

    It stores the Key and Value states as a list of tensors, one for each layer.
    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
    NFrM   max_batch_sizer8   rZ   c                 C   s   g | _ g | _|| _|j| _| jd| _|j| _|| _g | _|d ur't	
|nd }t|jD ](}t	j| j|j| j| j|d}| j| | j t	g  | jt	g  q.d S )Nfull_attention)r8   rZ   )	key_cachevalue_cacher   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cacher*   rZ   rangenum_hidden_layerszerosr/   appendtensor)r.   rM   r   r8   rZ   _
conv_stater2   r2   r3   r(      s*   zLfm2HybridConvCache.__init__
key_statesvalue_states	layer_idxcache_kwargsr&   c                 C   sx   | j |  dkr|| j |< || j|< ntj| j | |gdd| j |< tj| j| |gdd| j|< | j | | j| fS )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        r   rl   )r   numelr   r*   rr   )r.   r   r   r   r   r2   r2   r3   update   s   
zLfm2HybridConvCache.updatebeam_idxc                 C   s   t t| jD ]O}| j|  r:| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j|  rV| j| j}| j| d||| j|< qdS )zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   r   rZ   index_selectr9   r   r   )r.   r   r   rZ   r2   r2   r3   reorder_cache   s   z!Lfm2HybridConvCache.reorder_cacher   c                 C   sL   | j | dkr
| jn|}t| j|ks| j|  dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r   r   )r   r   r   r   r   rB   r.   r   r2   r2   r3   get_seq_length   s    z"Lfm2HybridConvCache.get_seq_lengthcache_positionc                 C   s&   d}|j d }|  }|| }||fS )aB  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        r   )rB   r   )r.   r   r   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengthr2   r2   r3   get_mask_sizes   s
   
z"Lfm2HybridConvCache.get_mask_sizes
max_lengthc                 C   s   |dk r|   t| }|   |krdS tt| jD ]+}| j|  rF| j| dd|ddf | j|< | j| dd|ddf | j|< qdS )z"Crop the cache to the given lengthr   N.)r   absr   r   r   r   r   )r.   r   idxr2   r2   r3   crop  s   ""zLfm2HybridConvCache.cropc                 C   s
   t | jS r{   )r   r   rC   r2   r2   r3   __len__  s   
zLfm2HybridConvCache.__len__c                 C   s&   t t| jD ]	}| j|   qd S r{   )r   r   r   zero_r   r2   r2   r3   reset  s   zLfm2HybridConvCache.resetr{   )r   )rE   rF   rG   __doc__r   is_compileabler   r   r*   r:   r   r   r8   rZ   rp   r(   rI   dictr   rA   r   
LongTensorr   r   r   r   r   r   r2   r2   r2   r3   r      sF    	
#
! r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr6   r5   rl   )rB   r*   rr   )ru   x1x2r2   r2   r3   rotate_half!  s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkrs   rt   unsqueeze_dimq_embedk_embedr2   r2   r3   apply_rotary_pos_emb(  s
   

r   r4   n_repr&   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rB   rm   reshape)r4   r   batchnum_key_value_headsslenr^   r2   r2   r3   	repeat_kvB  s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr5   r   r6   )rf   r8   )ptrainingr   )r   num_key_value_groupsr*   matmulrq   r   
functionalsoftmaxr:   r9   r8   r   r   
contiguous)r   r   r   r   r   r   r   r   r   r   attn_weightsattn_outputr2   r2   r3   eager_attention_forwardN  s   
r   c                       s   e Zd ZdZdedef fddZ		ddejde	ejejf d	ejdB d
e
dB dejdB de	ejejdB f fddZ  ZS )Lfm2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrM   r   c                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
d| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j|j| j dd| _tj|j| j |jdd| _t| j|jd| _t| j|jd| _d S )Nr^   g      TFr   r%   )r'   r(   rM   r   ra   r/   rb   r^   r   r   r   	is_causalr   r   q_projk_projv_projout_projr#   norm_epsq_layernormk_layernormr.   rM   r   r0   r2   r3   r(   k  s   
zLfm2Attention.__init__Nr4   position_embeddingsr   past_key_valuesr   r&   c                 K   s  |j d d }g |d| jR }| | |j| dd}	| | |j| dd}
| |j| dd}|\}}t	|	|
||\}	}
|d ur]|||d}|
|
|| j|\}
}t| jjt}|| |	|
||fd| jd|\}}|jg |dR   }| |}||fS )Nr6   r   r5   )rt   rs   r   r   )r   r   )rB   r^   r   r   viewrq   r   r   r   r   r   r   r   get_interfacerM   _attn_implementationr   r   r   r   r   )r.   r4   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rs   rt   r   attention_interfacer   r   outputr2   r2   r3   r@   z  s8   	


zLfm2Attention.forwardr!   )rE   rF   rG   r   r   r   r(   r*   rI   rA   r   r   r@   rJ   r2   r2   r0   r3   r   g  s$    r   c                 C   sN   |dur%|j d dkr%|j d dkr%| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rB   r8   r9   )r4   r   r8   r2   r2   r3   apply_mask_to_padding_states  s   $ r   c                
       s   e Zd Zdedef fddZ			ddejdedB dej	dB d	ejdB fd
dZ
			ddejdedB dej	dB d	ejdB fddZ			ddejdedB dej	dB d	ejdB fddZ  ZS )Lfm2ShortConvrM   r   c                    s   t    || _|| _|j| _|j| _tj	|j
|j
| j|j
| j| jd d| _tj|j
d|j
 | jd| _tj|j
|j
| jd| _d S )Nr   )in_channelsout_channelskernel_sizegroupsr   paddingr   r   )r'   r(   rM   r   r   L_cache	conv_biasr   r   Conv1dr/   convr   in_projr   r   r0   r2   r3   r(     s   
zLfm2ShortConv.__init__Nru   r   r   r   c                 C   s  t ||}| |dd}|jddd\}}}|| }| jj| jjd| jjd}	|d urO|d dkrOt|	d|j
| j |	| jjd }
|
d}
n&|d urktj|| j|jd  df}|j
| j | t||	| jjd d}
||
 }| |dd }|S )Nr6   r   r   rl   r   r5   )
activation)r   r  rq   chunkr  r,   r   sizer    squeezer   r   r   r   r   r   padr   rB   copy_r   r   r   )r.   ru   r   r   r   BCxBCBxconv_weightsconv_outr   yr2   r2   r3   cuda_kernels_forward  s*   
$
z"Lfm2ShortConv.cuda_kernels_forwardc                 C   sz  |j d }t||}| |dd}|jddd\}}}|| }	|d ur|d dkr|j| j }
|d| jd }|
j	ddd}
|	j
|
j|
jd|
d d d d |f< |j| j |
 tj|

|	j| jjd d dd d f  dd}| jr}|| jj7 }|d}n'|d urtj|	| j|	j d  df}
|j| j |
 | |	d	d |f }|| }|dd }| |}|S )
Nr   r6   r   r   rl   r   )shiftsdimsr`   .)rB   r   r  rq   r  r   r   clampr   rollr9   rZ   r8   r  r*   sumr  r,   r   r   r   r   r  r   r   )r.   ru   r   r   r   seqlenr	  r
  r  r  r   r  r  r2   r2   r3   slow_forward  s.   

$0
zLfm2ShortConv.slow_forwardr4   c                 C   s6   t rd|jjv rt s| ||||S | ||||S )Ncuda)is_fast_path_availablerZ   ro   r   r  r  )r.   r4   r   r   r   r2   r2   r3   r@     s   zLfm2ShortConv.forwardr|   )rE   rF   rG   r   r   r(   r*   rI   r   r   r  r  r@   rJ   r2   r2   r0   r3   r     sT    
%
)r   c                       s   e Zd Zdedef fddZ					ddejdeejejf dB dejdB d	ej	dB d
e
dB dej	dB dejfddZ  ZS )Lfm2DecoderLayerrM   r   c                    sl   t    |j| dk| _| jrt||| _nt||| _t|| _	t
|j|jd| _t
|j|jd| _d S )Nr   r   )r'   r(   r   is_attention_layerr   	self_attnr   r  r   feed_forwardr#   r/   r   operator_normffn_normr   r0   r2   r3   r(      s   

zLfm2DecoderLayer.__init__Nr4   r   r   rv   r   r   r&   c           
   	   K   sl   |}| j r| jd| ||||||d|\}}	n| j| ||||d}|| }|| | | }|S )N)r4   r   r   rv   r   r   )r4   r   r   r   r2   )r  r  r  r  r  r  )
r.   r4   r   r   rv   r   r   r   residualr   r2   r2   r3   r@   ,  s*   

zLfm2DecoderLayer.forward)NNNNN)rE   rF   rG   r   r   r(   r*   rI   rA   r   r   r@   rJ   r2   r2   r0   r3   r    s,    	r  c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )	Lfm2PreTrainedModelrM   modelTr  r   F)r4   
attentionsN)rE   rF   rG   r   r}   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  r   _can_record_outputsr2   r2   r2   r3   r!  N  s   
 
r!  c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )	Lfm2ModelrM   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t d| _d| _t j jd| _|   d S )Nc                    s   g | ]}t  |qS r2   )r  ).0r   rM   r2   r3   
<listcomp>i  s    z&Lfm2Model.__init__.<locals>.<listcomp>r0  Fr   )r'   r(   pad_token_idpadding_idx
vocab_sizer   	Embeddingr/   embed_tokens
ModuleListr   r   layersrK   
rotary_embgradient_checkpointingr#   r   embedding_norm	post_initr.   rM   r0   r0  r3   r(   b  s   zLfm2Model.__init__N	input_idsr   rv   r   inputs_embeds	use_cacher   r   r&   c              	   K   s6  |d u |d uA rt d|d u r| |}|r+|d u r+|jd }	t| j|	| j| jd}|d u rG|d ur7| nd}
tj	|
|
|jd  |jd}|d u rP|
d}t| j|||||d}|jd dkrd|nd }|}| j||d}| jd | jj D ]}|jr|n|}||f|||||d|}qx| |}t||d	S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )rM   r   r8   rZ   r   )rZ   )rM   r?  r   r   r   rv   )rv   )r   r   rv   r   r   )last_hidden_stater   )
ValueErrorr6  rB   r   rM   r8   rZ   r   r*   rc   r   r   r9  r8  r   r  r;  r   )r.   r>  r   rv   r   r?  r@  r   r   
batch_sizer   causal_masklinear_attentionr4   r   decoder_layer
layer_maskr2   r2   r3   r@   r  sZ   


	

zLfm2Model.forward)NNNNNNN)rE   rF   rG   r   r(   r   r   r   r*   r   rI   r   FloatTensorboolr   r   r   r@   rJ   r2   r2   r0   r3   r.  `  s>    	
r.  c                       s   e Zd ZddiZddiZddgdgfiZ fddZee																	
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B de	j
d	B dee	jB dee defddZ  ZS )Lfm2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr4   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S )NFr   )
r'   r(   r.  r"  r4  r   r   r/   rK  r<  r=  r0   r2   r3   r(     s
   
zLfm2ForCausalLM.__init__Nr   r>  r   rv   r   r?  labelsr@  r   logits_to_keepr   r&   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }d}|durB| jd||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Lfm2ForCausalLM

        >>> model = Lfm2ForCausalLM.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r>  r   rv   r   r?  r@  r   N)rM  rN  r4  )lossrM  r   r4   r#  r2   )r"  rA  rn   r   slicerK  loss_functionrM   r4  r   r   r4   r#  )r.   r>  r   rv   r   r?  rN  r@  r   rO  r   outputsr4   slice_indicesrM  rP  r2   r2   r3   r@     s0    zLfm2ForCausalLM.forward)	NNNNNNNNr   )rE   rF   rG   _tied_weights_keys_tp_plan_pp_planr(   r   r   r*   r   rI   r   rH  rI  r   r   r   r   r@   rJ   r2   r2   r0   r3   rJ    sN    		
rJ  )rJ  r.  r!  )r   )r   )Icollections.abcr   typingr   r   r*   torch.nn.functionalr   r   r   cache_utilsr   
generationr   integrationsr	   r
   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.import_utilsr   r   utils.output_capturingr   configuration_lfm2r   causal_conv1dr   r    Moduler#   rK   r   r   r   r   rI   r   r   rH   r   r   r   kernel_modulesallr  r   r  r!  r.  rJ  __all__r2   r2   r2   r3   <module>   s   A 
<k/WK