o
    	۷iQ                     @   s  d dl mZmZmZmZ d dlZd dlm  mZ	 d dlmZ ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' e r}d dl(m)Z)m*Z* nd\Z)Z*e)e*fZ+e,e+Z-e.e/Z0G dd de"Z1G dd de#Z2G dd dej3Z4G dd dZ5G dd deZ6G dd dej3Z7G d d! d!eZ8G d"d# d#e!Z9G d$d% d%e Z:G d&d' d'eZ;g d(Z<dS ))    )AnyCallableOptionalUnionN)nn   )create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg)is_causal_conv1d_available   )apply_mask_to_padding_states)LlamaAttentionLlamaForCausalLM
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNc                   @      e Zd ZdS )Lfm2RMSNormN__name__
__module____qualname__ r&   r&   [/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/lfm2/modular_lfm2.pyr!   7       r!   c                   @   r    )Lfm2RotaryEmbeddingNr"   r&   r&   r&   r'   r)   ;   r(   r)   c                       s*   e Zd Zdef fddZdd Z  ZS )Lfm2MLPconfigc                    s   t    |j}|jr,td| d }|jd ur,t|j| }|j||j d |j  }tj|j	|dd| _
tj|j	|dd| _tj||j	dd| _d S )Nr   r   r   Fbias)super__init__intermediate_sizeblock_auto_adjust_ff_dimintblock_ffn_dim_multiplierblock_multiple_ofr   Linearhidden_sizew1w3w2)selfr+   r0   	__class__r&   r'   r/   @   s   

zLfm2MLP.__init__c                 C   s    |  t| || | S N)r9   Fsilur7   r8   )r:   xr&   r&   r'   forwardO   s    zLfm2MLP.forward)r#   r$   r%   r   r/   rA   __classcell__r&   r&   r;   r'   r*   ?   s    r*   c                   @   s  e Zd ZdZdZdZdZdZej	dfde
dedejdeejedf fdd	Z	d#d
ejdejdedeeeef  deejejf f
ddZdejfddZd$dee defddZdejdedeeef fddZdefddZdefddZdedeejejf fdd Zd!d" ZdS )%Lfm2HybridConvCachea  
    Attention and conv cache for Lfm2.

    It stores the Key and Value states as a list of tensors, one for each layer.
    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
    NFr+   max_batch_sizedtypedevicec                 C   s   g | _ g | _|| _|j| _| jd| _|j| _|| _g | _|d ur't	
|nd }t|jD ]}t	j| j|j| j| j|d}t	j| | j| q.d S )Nfull_attention)rE   rF   )	key_cachevalue_cacherD   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cachetorchrF   rangenum_hidden_layerszerosr6   _dynamomark_static_addressappend)r:   r+   rD   rE   rF   _
conv_stater&   r&   r'   r/   b   s(   zLfm2HybridConvCache.__init__
key_statesvalue_states	layer_idxcache_kwargsreturnc                 C   s   |duret | j|kr5tt | j|D ]}| jtg  | jtg  q| j| | j| n0| j|  sG|| j|< || j|< ntj| j| |gdd| j|< tj| j| |gdd| j|< | j| | j| fS )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        Ndim)	lenrH   rQ   rV   rP   tensorrI   numelcat)r:   rY   rZ   r[   r\   rW   r&   r&   r'   update   s   
zLfm2HybridConvCache.updatebeam_idxc                 C   s   t t| jD ]A}| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< qdS )zDReorders the cache for beam search, given the selected beam indices.r   N)rQ   ra   rH   rF   index_selecttorI   rO   )r:   rf   r[   rF   r&   r&   r'   reorder_cache   s    z!Lfm2HybridConvCache.reorder_cacher   c                 C   sL   | j | dkr
| jn|}t| j|ks| j|  dkrdS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.rG   r   r^   )rJ   rL   ra   rH   rc   shaper:   r[   r&   r&   r'   get_seq_length   s    z"Lfm2HybridConvCache.get_seq_lengthcache_positionc                 C   s&   d}|j d }|  }|| }||fS )aB  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        r   )rj   rl   )r:   rm   r[   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengthr&   r&   r'   get_mask_sizes   s
   
z"Lfm2HybridConvCache.get_mask_sizes
max_lengthc                 C   s   |dk r|   t| }|   |krdS tt| jD ]+}| j|  rF| j| dd|ddf | j|< | j| dd|ddf | j|< qdS )z"Crop the cache to the given lengthr   N.)rl   absrQ   ra   rH   rc   rI   )r:   rs   idxr&   r&   r'   crop   s   ""zLfm2HybridConvCache.cropc                 C   s
   t | jS r=   )ra   rH   )r:   r&   r&   r'   __len__   s   
zLfm2HybridConvCache.__len__c                 C   s   | j | | j| fS r=   )rH   rI   rk   r&   r&   r'   __getitem__   s   zLfm2HybridConvCache.__getitem__c                 C   s&   t t| jD ]	}| j|   qd S r=   )rQ   ra   rO   zero_rk   r&   r&   r'   reset   s   zLfm2HybridConvCache.resetr=   )r   )r#   r$   r%   __doc__rD   is_compileablerH   rI   rP   float32r   r2   rE   r   rF   strr/   Tensorr   dictr   tuplere   
LongTensorri   rl   rr   rv   rw   rx   rz   r&   r&   r&   r'   rC   S   sH    	
"
+ rC   c                       s   e Zd Zdedef fddZedddd				dd
ejde	ejejf de
ej de
e de
ej de	eje
ej f fddZ  ZS )Lfm2Attentionr+   r[   c                    s   t  || tj|j|j| j dd| _tj|j|j| j dd| _	tj|j|j| j dd| _
tj|j| j |jdd| _t| j|jd| _t| j|jd| _| `| `d S )NFr,   eps)r.   r/   r   r5   r6   num_attention_headshead_dimq_projnum_key_value_headsk_projv_projout_projr!   norm_epsq_layernormk_layernormo_projattention_dropoutr:   r+   r[   r;   r&   r'   r/      s   zLfm2Attention.__init__past_key_valuepast_key_values4.58new_nameversionNhidden_statesposition_embeddingsattention_maskrm   r]   c                 K   s$  |j d d }g |d| jR }| | |j| dd}	| | |j| dd}
| |j| dd}|\}}t	|	|
||\}	}
|d ur]|||d}|
|
|| j|\}
}t}| jjdkrkt| jj }|| |	|
||fd| jd|\}}|jg |dR   }| |}||fS )Nr   r   )sincosrm   eagerg        )dropoutscaling)rj   r   r   r   view	transposer   r   r   r   re   r[   r   r+   _attn_implementationr   r   reshape
contiguousr   )r:   r   r   r   r   rm   kwargsinput_shapehidden_shapequery_statesrY   rZ   r   r   r\   attention_interfaceattn_outputattn_weightsoutputr&   r&   r'   rA      s8   



zLfm2Attention.forwardr   )r#   r$   r%   r   r2   r/   r   rP   r   r   r   rC   r   rA   rB   r&   r&   r;   r'   r      s$    r   c                       s   e Zd Zdedef fddZedddd						dd
ejde	e
 de	ej de	ej fddZedddd						dd
ejde	e
 de	ej de	ej fddZedddd						ddejde	e
 de	ej de	ej fddZ  ZS )Lfm2ShortConvr+   r[   c                    s   t    || _|| _|j| _|j| _tj	|j
|j
| j|j
| j| jd d| _tj|j
d|j
 | jd| _tj|j
|j
| jd| _d S )Nr   )in_channelsout_channelskernel_sizegroupsr-   paddingr   r,   )r.   r/   r+   r[   rM   L_cache	conv_biasr-   r   Conv1dr6   convr5   in_projr   r   r;   r&   r'   r/     s   
zLfm2ShortConv.__init__r   r   r   r   Nr@   rm   r   c                 C   s  t ||}| |dd}|jddd\}}}|| }| jj| jjd| jjd}	|d urO|d dkrOt|	d|j
| j |	| jjd }
|
d}
n&|d urktj|| j|jd  df}|j
| j | t||	| jjd d}
||
 }| |dd }|S )Nr   r^   r   r_   r   r   )
activation)r   r   r   chunkr   weightr   sizer   squeezerO   r[   r-   	unsqueezer   
functionalpadr   rj   copy_r   r   r   )r:   r@   r   rm   r   BCxBCBxconv_weightsconv_outrX   yr&   r&   r'   cuda_kernels_forward1  s*   
$
z"Lfm2ShortConv.cuda_kernels_forwardc                 C   sz  |j d }t||}| |dd}|jddd\}}}|| }	|d ur|d dkr|j| j }
|d| jd }|
j	ddd}
|	j
|
j|
jd|
d d d d |f< |j| j |
 tj|

|	j| jjd d dd d f  dd}| jr}|| jj7 }|d}n'|d urtj|	| j|	j d  df}
|j| j |
 | |	d	d |f }|| }|dd }| |}|S )
Nr   r   r^   r   r_   r   )shiftsdims)rF   rE   .)rj   r   r   r   r   rO   r[   clampr   rollrh   rF   rE   r   rP   sumr   r   r-   r   r   r   r   r   r   )r:   r@   r   rm   r   seqlenr   r   r   r   rX   r   r   r&   r&   r'   slow_forwardT  s.   

$0
zLfm2ShortConv.slow_forwardr   c                 C   s:   t rd|jjv rtj s| ||||S | ||||S )Ncuda)is_fast_path_availablerF   typerP   rT   is_compilingr   r   )r:   r   r   rm   r   r&   r&   r'   rA   {  s   zLfm2ShortConv.forward)NNN)r#   r$   r%   r   r2   r/   r   rP   r   r   rC   r   r   r   rA   rB   r&   r&   r;   r'   r     sZ    "&r   c                       s   e Zd Zdedef fddZedddd								dd
ejde	ejejf de
ej de
ej de
e de
ej dejfddZ  ZS )Lfm2DecoderLayerr+   r[   c                    sl   t    |j| dk| _| jrt||| _nt||| _t|| _	t
|j|jd| _t
|j|jd| _d S )NrG   r   )r.   r/   rJ   is_attention_layerr   	self_attnr   r   r*   feed_forwardr!   r6   r   operator_normffn_normr   r;   r&   r'   r/     s   

zLfm2DecoderLayer.__init__r   r   r   r   Nr   r   r   position_idsrm   r]   c           
   	   K   sl   |}| j r| jd| ||||||d|\}}	n| j| ||||d}|| }|| | | }|S )N)r   r   r   r   r   rm   )r   r   rm   r   r&   )r   r   r   r   r   r   )
r:   r   r   r   r   r   rm   r   residualrW   r&   r&   r'   rA     s*   
zLfm2DecoderLayer.forward)NNNN)r#   r$   r%   r   r2   r/   r   rP   r   r   r   r   rC   rA   rB   r&   r&   r;   r'   r     s,    	r   c                   @   s   e Zd ZdZdS )Lfm2PreTrainedModelFN)r#   r$   r%   _can_compile_fullgraphr&   r&   r&   r'   r     s    r   c                       s   e Zd Zdef fddZ							ddeej deej deej dee	 d	eej
 d
ee deej dee defddZ  ZS )	Lfm2Modelr+   c                    s4   t  | t|| _t|j|jd| _| `| `	d S )Nr   )
r.   r/   r)   pos_embr!   r6   r   embedding_normnorm
rotary_emv)r:   r+   r;   r&   r'   r/     s
   
zLfm2Model.__init__N	input_idsr   r   r   inputs_embeds	use_cacherm   r   r]   c              	   K   s  |d u |d uA rt d|d u r| |}|r+|d u r+|jd }	t| j|	| j| jd}|d u rG|d ur7| nd}
tj	|
|
|jd  |jd}|d u rP|
d}t| j|||||d}|}| ||}| jd | jj D ]}||f|||||d|}ql| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr   )r+   rD   rE   rF   r   )rF   )r+   input_embedsr   rm   r   r   )r   r   r   rm   r   )last_hidden_stater   )
ValueErrorembed_tokensrj   rC   r+   rE   rF   rl   rP   aranger   r   r   layersrR   r   r
   )r:   r   r   r   r   r   r   rm   r   
batch_sizerp   causal_maskr   r   decoder_layerr&   r&   r'   rA     sV   


	

zLfm2Model.forward)NNNNNNN)r#   r$   r%   r   r/   r   rP   r   r   rC   FloatTensorboolr   r   r
   rA   rB   r&   r&   r;   r'   r     s8    		
r   c                   @   r    )Lfm2ForCausalLMNr"   r&   r&   r&   r'   r     r(   r   )r   r   r   )=typingr   r   r   r   rP   torch.nn.functionalr   r   r>   masking_utilsr   modeling_layersr	   modeling_outputsr
   modeling_utilsr   processing_utilsr   utilsr   r   utils.deprecationr   utils.import_utilsr   bamba.modeling_bambar   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_lfm2r   causal_conv1dr   r   kernel_modulesallr   
get_loggerr#   loggerr!   r)   Moduler*   rC   r   r   r   r   r   r   __all__r&   r&   r&   r'   <module>   sB   (

 7n0H