"""
LFM2 (Liquid Foundation Model 2) implementation for SGLang.

This is a hybrid architecture with both attention and short conv layers.
- Attention layers use standard KV cache (RadixAttention)
- Conv layers use MambaPool for state caching (via HybridReqToTokenPool)

The model uses a gated 1D causal convolution (kernel=3) instead of attention
in some layers, providing linear memory complexity for those layers.

Uses optimized causal_conv1d kernels from the mamba package for fast inference.
    N)IterableOptionalSetTuple)nn)
Lfm2Config)get_pp_group$get_tensor_model_parallel_world_size)causal_conv1d_fncausal_conv1d_update)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loadersharded_weight_loader)


class Lfm2MLP(nn.Module):
    """MLP with SwiGLU activation."""

    def __init__(
        self,
        config: Lfm2Config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        intermediate_size = config.intermediate_size
        if config.block_auto_adjust_ff_dim:
            # LLaMA-style FFN sizing: shrink to 2/3, apply the multiplier,
            # then round up to a multiple of block_multiple_of.
            intermediate_size = int(2 * intermediate_size / 3)
            if config.block_ffn_dim_multiplier is not None:
                intermediate_size = int(
                    config.block_ffn_dim_multiplier * intermediate_size
                )
            intermediate_size = config.block_multiple_of * (
                (intermediate_size + config.block_multiple_of - 1)
                // config.block_multiple_of
            )
        self.w1 = ColumnParallelLinear(
            config.hidden_size,
            intermediate_size,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("w1", prefix),
        )
        self.w3 = ColumnParallelLinear(
            config.hidden_size,
            intermediate_size,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("w3", prefix),
        )
        self.w2 = RowParallelLinear(
            intermediate_size,
            config.hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("w2", prefix),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate, _ = self.w1(x)
        up, _ = self.w3(x)
        out, _ = self.w2(F.silu(gate) * up)
        return out
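

# For reference: Lfm2MLP computes the standard SwiGLU form
#   w2(silu(w1(x)) * w3(x)),
# with w1/w3 column-parallel and w2 row-parallel, so the elementwise product is
# taken on the sharded intermediate dimension and only w2 needs an all-reduce.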
 fd	d
Z	de
jde
jdede
jfddZ  ZS )Lfm2Attentionz4Grouped-query attention with RoPE and Q/K layernorm.Nr   r   layer_idr    r!   r8   c              
      sT  t    |j| _|j| _|j| _t|dd p| j| j | _| jd | _	t|dd }|d ur7d|v r7|d }nt|dd}t
| j| jt|ddt|dd |d	t d
| _t| j| j| j| jd|td|d| _t| j| j | jd|td|d| _t| j|jd| _t| j|jd| _| jj| _| jj| _t| j| j| j	| j|td|d| _d S )Nhead_dimg      rope_parameters
rope_thetai'  max_position_embeddingsi    rope_scalingT)	head_size
rotary_dimmax_positionrP   baseis_neox_styledtypeFqkv_projr&   out_projepsattn)	num_headsrL   scalingnum_kv_headsrK   r!   )r*   r+   r1   num_attention_headstotal_num_headsnum_key_value_headstotal_num_kv_headsgetattrrL   r]   r   rG   get_default_dtype
rotary_embr   r   rW   r   rX   r   norm_epsq_layernormk_layernormr\   num_local_q_headsr^   num_local_kv_headsr   r[   )r2   r   rK   r    r!   rM   rN   r3   r5   r6   r+   k   sb   





	


zLfm2Attention.__init__	positionshidden_statesforward_batchc                 C   s   |j d }| |\}}| j| j }| j| j }tj||||gdd\}	}
}|	|| j| j}	|
|| j| j}
| |	d| j|| j| j}	| 	|
d| j|| j| j}
| 
||	|
\}	}
| |	|d|
|d||}| |\}}|S )Nr   dim)shaperW   ri   rL   rj   rG   splitreshaperg   rh   re   r[   rX   )r2   rk   rl   rm   Tqkvr=   q_sizekv_sizeqkvattn_outr?   r5   r5   r6   r@      s"   


 zLfm2Attention.forwardrA   rB   rC   rD   rE   r   r.   r   r   rF   r+   rG   rH   r   r@   rI   r5   r5   r3   r6   rJ   h   s0    @rJ   c                	       sV   e Zd ZdZ		ddededee def fdd	Z	d


class Lfm2ShortConv(nn.Module):
    """
    Gated short convolution layer using optimized causal_conv1d kernels.

    Architecture: in_proj -> split(B, C, x) -> Bx -> conv1d -> C*conv_out -> out_proj
    - Uses double gating: B (before conv) and C (after conv)
    - Fixed-size cache: stores last (kernel_size - 1) tokens
    - Uses causal_conv1d_fn for prefill and causal_conv1d_update for decode
    - Supports tensor parallelism: hidden dimension is sharded across TP ranks
    A plain-PyTorch reference sketch of this computation follows this class.
    """

    def __init__(
        self,
        config: Lfm2Config,
        layer_idx: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.layer_idx = layer_idx
        self.conv_kernel = int(config.conv_L_cache)
        self.use_bias = bool(config.conv_bias)
        self.hidden_size = config.hidden_size

        tp_size = get_tensor_model_parallel_world_size()
        self.hidden_size_per_partition = self.hidden_size // tp_size

        self.in_proj = MergedColumnParallelLinear(
            config.hidden_size,
            [config.hidden_size] * 3,
            bias=self.use_bias,
            quant_config=quant_config,
            prefix=prefix + ".in_proj",
        )
        self.out_proj = RowParallelLinear(
            config.hidden_size,
            config.hidden_size,
            bias=self.use_bias,
            input_is_parallel=True,
            quant_config=quant_config,
            prefix=prefix + ".out_proj",
        )

        # Depthwise conv filter taps; each TP rank holds its shard of channels.
        self.conv_weight = nn.Parameter(
            torch.empty(self.hidden_size_per_partition, self.conv_kernel)
        )
        set_weight_attrs(self.conv_weight, {"weight_loader": sharded_weight_loader(0)})
        if self.use_bias:
            self.conv_bias = nn.Parameter(torch.empty(self.hidden_size_per_partition))
            set_weight_attrs(
                self.conv_bias, {"weight_loader": sharded_weight_loader(0)}
            )
        else:
            self.register_parameter("conv_bias", None)

    def forward(
        self,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        if forward_batch.forward_mode.is_idle():
            return hidden_states

        # Conv states live in the Mamba pool, indexed by request pool slots.
        layer_cache = forward_batch.req_to_token_pool.mamba2_layer_cache(
            self.layer_idx
        )
        conv_state = layer_cache.conv[0]
        cache_indices = forward_batch.req_pool_indices

        proj, _ = self.in_proj(hidden_states)
        B_gate, C_gate, x = proj.chunk(3, dim=-1)
        Bx = B_gate * x

        if forward_batch.forward_mode.is_decode():
            # Single-token step: update the cached window in place.
            conv_out = causal_conv1d_update(
                Bx,
                conv_state,
                self.conv_weight,
                self.conv_bias,
                activation=None,
                conv_state_indices=cache_indices.to(torch.int32),
            )
        else:
            T = Bx.shape[0]
            Bx_t = Bx.transpose(0, 1).contiguous()  # (dim, T) layout for the kernel
            extend_start_loc = forward_batch.extend_start_loc
            if extend_start_loc is not None and len(extend_start_loc) > 0:
                query_start_loc = torch.cat(
                    [
                        extend_start_loc,
                        torch.tensor([T], dtype=torch.int32, device=Bx.device),
                    ]
                ).to(torch.int32)
            else:
                query_start_loc = torch.tensor(
                    [0, T], dtype=torch.int32, device=Bx.device
                )
            # Requests with cached prefix tokens resume from their conv state.
            has_initial_state = (forward_batch.extend_prefix_lens > 0).to(torch.bool)
            conv_out = causal_conv1d_fn(
                Bx_t,
                self.conv_weight,
                self.conv_bias,
                query_start_loc=query_start_loc,
                cache_indices=cache_indices,
                has_initial_state=has_initial_state,
                conv_states=conv_state,
                activation=None,
            ).transpose(0, 1)

        output, _ = self.out_proj(C_gate * conv_out)
        return output


class Lfm2DecoderLayer(nn.Module):
    """Decoder layer - either attention or conv based on config."""

    def __init__(
        self,
        config: Lfm2Config,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.layer_type = config.layer_types[layer_id]
        self.is_attention_layer = self.layer_type == "full_attention"
        self.operator_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
        if self.is_attention_layer:
            self.self_attn = Lfm2Attention(
                config=config,
                layer_id=layer_id,
                quant_config=quant_config,
                prefix=add_prefix("self_attn", prefix),
            )
        else:
            self.conv = Lfm2ShortConv(
                config=config,
                layer_idx=layer_id,
                quant_config=quant_config,
                prefix=add_prefix("conv", prefix),
            )
        self.feed_forward = Lfm2MLP(
            config,
            quant_config=quant_config,
            prefix=add_prefix("feed_forward", prefix),
        )

    def forward(
        self,
        layer_id: int,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        forward_batch: ForwardBatch,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if not forward_batch.forward_mode.is_idle():
            residual = hidden_states
            normed = self.operator_norm(hidden_states)
            if self.is_attention_layer:
                hidden_states = self.self_attn(positions, normed, forward_batch)
            else:
                hidden_states = self.conv(normed, forward_batch)
            hidden_states = residual + hidden_states
            hidden_states = hidden_states + self.feed_forward(
                self.ffn_norm(hidden_states)
            )
        return hidden_states, residual


class Lfm2Model(nn.Module):
    def __init__(
        self,
        config: Lfm2Config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config
        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
            prefix=add_prefix("embed_tokens", prefix),
        )
        self.num_attention_layers = sum(
            1 for lt in config.layer_types if lt == "full_attention"
        )

        def get_layer(idx: int, prefix: str):
            return Lfm2DecoderLayer(
                config, idx, quant_config=quant_config, prefix=prefix
            )

        self.layers = make_layers(
            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
        )
        self.embedding_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = (
            inputs_embeds if inputs_embeds is not None else self.embed_tokens(input_ids)
        )
        residual = None
        for i in range(len(self.layers)):
            hidden_states, residual = self.layers[i](
                layer_id=i,
                positions=positions,
                hidden_states=hidden_states,
                residual=residual,
                forward_batch=forward_batch,
            )
        return self.embedding_norm(hidden_states)


class Lfm2ForCausalLM(nn.Module):
    """LFM2 for causal language modeling with hybrid attention/conv architecture."""

    fall_back_to_pt_during_load = False

    def __init__(
        self,
        config: Lfm2Config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.config = config
        self.pp_group = get_pp_group()
        # Pipeline parallelism is not supported for this hybrid model.
        assert self.pp_group.is_first_rank and self.pp_group.is_last_rank
        self.quant_config = quant_config
        self.model = Lfm2Model(config, quant_config, prefix=add_prefix("model", prefix))
        self.lm_head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=quant_config,
            org_num_embeddings=config.vocab_size,
            prefix=add_prefix("lm_head", prefix),
        )
        self.logits_processor = LogitsProcessor(config)
        self.num_attention_layers = self.model.num_attention_layers

    def get_num_kv_cache_layers(self) -> int:
        return self.num_attention_layers

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
        return self.logits_processor(
            input_ids, hidden_states, self.lm_head, forward_batch
        )

    def load_weights(
        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
    ) -> Set[str]:
        stacked_params_mapping = [
            # (param_name, weight_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
        ]
        params_dict = dict(self.named_parameters())
        loaded_params: Set[str] = set()
        embed_tokens_weight = None
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if "embed_tokens.weight" in name:
                embed_tokens_weight = loaded_weight
            if ".conv.conv.weight" in name:
                # Checkpoint stores a (dim, 1, kernel) depthwise Conv1d weight;
                # drop the singleton channel dim for the fused kernel layout.
                name = name.replace(".conv.conv.weight", ".conv.conv_weight")
                loaded_weight = loaded_weight.squeeze(1)
            if ".conv.conv.bias" in name:
                name = name.replace(".conv.conv.bias", ".conv.conv_bias")
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                if name.endswith(".bias") and name not in params_dict:
                    break
                if name not in params_dict:
                    break
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                loaded_params.add(name)
                break
            else:
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
                loaded_params.add(name)

        # Tied embeddings: if the checkpoint has no lm_head weight, reuse
        # embed_tokens for the output projection.
        if (
            "lm_head.weight" not in loaded_params
            and "lm_head.weight" in params_dict
            and embed_tokens_weight is not None
        ):
            param = params_dict["lm_head.weight"]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, embed_tokens_weight)
            loaded_params.add("lm_head.weight")
        return loaded_params


EntryClass = [Lfm2ForCausalLM]
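
# Example usage (assumption: standard SGLang server entry point; the model path is
# illustrative): SGLang's model registry resolves this implementation through
# EntryClass above when serving an LFM2 checkpoint, e.g.
#   python -m sglang.launch_server --model-path LiquidAI/LFM2-1.2B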