"""
LFM2-MoE (Liquid Foundation Model 2 - Mixture of Experts) implementation for SGLang.

This is a hybrid architecture with attention, ShortConv, and MoE layers:
- Attention layers use standard KV cache (RadixAttention)
- Conv layers use MambaPool for state caching (via HybridReqToTokenPool)
- First `num_dense_layers` use dense MLP, rest use MoE with sigmoid routing

Key MoE characteristics:
- Sigmoid routing (not softmax) - auxiliary-loss-free style
- Expert bias (fp32) affects selection but not weighting
- Post-hoc normalization of top-k weights
"""

from typing import Iterable, Optional, Set, Tuple

import torch
from torch import nn

from sglang.srt.configs.lfm2_moe import Lfm2MoeConfig
from sglang.srt.distributed import (
    get_pp_group,
    get_tensor_model_parallel_world_size,
)
from sglang.srt.layers.activation import SiluAndMul
from sglang.srt.layers.attention.mamba.causal_conv1d import (
    causal_conv1d_fn,
    causal_conv1d_update,
)
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
)
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
from sglang.srt.layers.moe.topk import TopK
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import (
    default_weight_loader,
    sharded_weight_loader,
)
from sglang.srt.utils import add_prefix, make_layers, set_weight_attrs

class Lfm2MoeMLP(nn.Module):
    """Dense MLP for first N layers (before MoE kicks in)."""

    def __init__(
        self,
        config: Lfm2MoeConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.gate_up_proj = MergedColumnParallelLinear(
            config.hidden_size,
            [config.intermediate_size] * 2,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("gate_up_proj", prefix),
        )
        self.down_proj = RowParallelLinear(
            config.intermediate_size,
            config.hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("down_proj", prefix),
        )
        self.act_fn = SiluAndMul()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x

class Lfm2MoeSparseMoeBlock(nn.Module):
    """
    Sparse MoE block with sigmoid routing using optimized FusedMoE.

    Key features:
    - Sigmoid scoring (not softmax) - auxiliary-loss-free style
    - Expert bias (fp32) for load balancing
    - Bias affects selection only, not weighting
    - Uses FusedMoE for efficient batched expert computation
    """

    def __init__(
        self,
        config: Lfm2MoeConfig,
        layer_idx: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.tp_size = get_tensor_model_parallel_world_size()
        self.routed_scaling_factor = config.routed_scaling_factor
        if self.tp_size > config.num_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.num_experts}."
            )

        self.gate = ReplicatedLinear(
            config.hidden_size,
            config.num_experts,
            bias=False,
            quant_config=None,
            prefix=add_prefix("gate", prefix),
        )

        if config.use_expert_bias:
            # fp32 bias for load balancing; affects selection, not weighting.
            self.expert_bias = nn.Parameter(
                torch.zeros(config.num_experts, dtype=torch.float32)
            )
        else:
            self.register_parameter("expert_bias", None)

        self.topk = TopK(
            top_k=config.num_experts_per_tok,
            layer_id=layer_idx,
            renormalize=config.norm_topk_prob,
            scoring_func="sigmoid",
            correction_bias=self.expert_bias if config.use_expert_bias else None,
        )

        self.experts = FusedMoE(
            num_experts=config.num_experts,
            top_k=config.num_experts_per_tok,
            hidden_size=config.hidden_size,
            intermediate_size=config.moe_intermediate_size,
            layer_id=layer_idx,
            reduce_results=True,
            quant_config=quant_config,
            prefix=add_prefix("experts", prefix),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Optimized expert forward pass using FusedMoE."""
        router_logits, _ = self.gate(hidden_states)
        topk_output = self.topk(hidden_states, router_logits)
        final_hidden_states = self.experts(hidden_states, topk_output)
        return final_hidden_states * self.routed_scaling_factor
zLfm2MoeSparseMoeBlock.forwardr<   )r=   r>   r?   r@   r   intr   r   rA   r+   rB   rC   r;   rD   r3   r3   r1   r4   rE   V   s    ;rE   c                       s`   e Zd ZdZ		ddededee deddf
 fd	d
Z	de
jde
jdede
jfddZ  ZS )Lfm2MoeAttentionz4Grouped-query attention with RoPE and Q/K layernorm.Nr!   r"   rN   r#   r$   r6   c              
      sH  t    |j| _|j| _|j| _| j| j | _| jd | _t	|dd }|d ur1d|v r1|d }nt	|dd}t
| j| jt	|ddt	|dd |dt d	| _t| j| j| j| jd
|td|d| _t| j| j | jd
|td|d| _t| j|jd| _t| j|jd| _| jj| _| jj| _t| j| j| j| j|td|d| _d S )Ng      rope_parameters
rope_thetag    .Amax_position_embeddingsi  rope_scalingT)	head_size
rotary_dimmax_positionrk   baseis_neox_stylerJ   Fqkv_projr'   out_projepsattn)	num_headshead_dimscalingnum_kv_headsrN   r$   )r*   r+   r,   num_attention_headstotal_num_headsnum_key_value_headstotal_num_kv_headsrw   rx   getattrr   rB   get_default_dtype
rotary_embr   r   rq   r   rr   r   norm_epsq_layernormk_layernormrv   num_local_q_headsry   num_local_kv_headsr   ru   )r0   r"   rN   r#   r$   rh   ri   r1   r3   r4   r+      s^   




	


zLfm2MoeAttention.__init__	positionsrb   forward_batchc                 C   s   |j d }| |\}}| j| j }| j| j }tj||||gdd\}	}
}|	|| j| j}	|
|| j| j}
| |	d| j|| j| j}	| 	|
d| j|| j| j}
| 
||	|
\}	}
| |	|d|
|d||}| |\}}|S )Nr   dim)shaperq   r   rw   r   rB   splitreshaper   r   r   ru   rr   )r0   r   rb   r   Tqkvr9   q_sizekv_sizeqkvattn_outr:   r3   r3   r4   r;      s"   


 zLfm2MoeAttention.forwardr<   r=   r>   r?   r@   r   rf   r   r   rA   r+   rB   rC   r   r;   rD   r3   r3   r1   r4   rg      s0    >rg   c                	       sV   e Zd ZdZ		ddededee def fdd	Z	d
e
jdede
jfddZ  ZS )Lfm2MoeShortConvz
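
# Shape note (illustrative, plain PyTorch, made-up sizes): the per-head Q/K
# RMSNorm above normalizes each head_dim-sized vector independently, so the
# flat (num_tokens, heads * head_dim) projection is viewed per head before the
# norm and flattened again for RoPE and attention:
#
#   T, H, D = 4, 8, 64
#   q = torch.randn(T, H * D)
#   q = RMSNorm(D, eps=1e-5)(q.reshape(T, H, D)).reshape(T, -1)
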

class Lfm2MoeShortConv(nn.Module):
    """
    Gated short convolution layer using optimized causal_conv1d kernels.

    Architecture: in_proj -> split(B, C, x) -> Bx -> conv1d -> C*conv_out -> out_proj
    - Supports tensor parallelism: hidden dimension is sharded across TP ranks
    """

    def __init__(
        self,
        config: Lfm2MoeConfig,
        layer_idx: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.layer_idx = layer_idx
        self.conv_kernel = int(config.conv_L_cache)
        self.use_bias = bool(config.conv_bias)
        self.hidden_size = config.hidden_size
        self.tp_size = get_tensor_model_parallel_world_size()
        self.hidden_size_per_partition = self.hidden_size // self.tp_size

        self.in_proj = MergedColumnParallelLinear(
            config.hidden_size,
            [config.hidden_size] * 3,
            bias=self.use_bias,
            quant_config=quant_config,
            prefix=prefix + ".in_proj",
        )
        self.out_proj = RowParallelLinear(
            config.hidden_size,
            config.hidden_size,
            bias=self.use_bias,
            input_is_parallel=True,
            quant_config=quant_config,
            prefix=prefix + ".out_proj",
        )

        # Depthwise conv filter, sharded along the hidden dimension.
        self.conv_weight = nn.Parameter(
            torch.empty(self.hidden_size_per_partition, self.conv_kernel)
        )
        set_weight_attrs(self.conv_weight, {"weight_loader": sharded_weight_loader(0)})
        if self.use_bias:
            self.conv_bias = nn.Parameter(torch.empty(self.hidden_size_per_partition))
            set_weight_attrs(
                self.conv_bias, {"weight_loader": sharded_weight_loader(0)}
            )
        else:
            self.register_parameter("conv_bias", None)

    def forward(
        self,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        if forward_batch.forward_mode.is_idle():
            return hidden_states

        layer_cache = forward_batch.req_to_token_pool.mamba2_layer_cache(
            self.layer_idx
        )
        conv_state = layer_cache.conv
        seq_len = hidden_states.shape[0]
        cache_indices = forward_batch.req_pool_indices

        proj, _ = self.in_proj(hidden_states)
        B_gate, C_gate, x = proj.chunk(3, dim=-1)
        Bx = B_gate * x

        if forward_batch.forward_mode.is_decode():
            # One new token per request: update the rolling conv state in place.
            conv_out = causal_conv1d_update(
                Bx,
                conv_state,
                self.conv_weight,
                self.conv_bias,
                activation=None,
                conv_state_indices=cache_indices.to(torch.int32),
            )
        else:
            # Varlen prefill/extend path over (dim, total_tokens).
            Bx_t = Bx.transpose(0, 1).contiguous()
            query_start_loc = forward_batch.extend_start_loc
            if query_start_loc is not None and len(query_start_loc) > 0:
                qsl = query_start_loc.new_empty(len(query_start_loc) + 1)
                qsl[:-1] = query_start_loc
                qsl[-1] = seq_len
                query_start_loc = qsl.to(torch.int32)
            else:
                query_start_loc = Bx.new_tensor([0, seq_len], dtype=torch.int32)
            has_initial_state = forward_batch.extend_prefix_lens > 0
            conv_out = causal_conv1d_fn(
                Bx_t,
                self.conv_weight,
                self.conv_bias,
                query_start_loc=query_start_loc,
                cache_indices=cache_indices,
                has_initial_state=has_initial_state,
                conv_states=conv_state,
                activation=None,
            ).transpose(0, 1)

        output, _ = self.out_proj(C_gate * conv_out)
        return output
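
# Decode-path sketch (conceptual, single request, plain PyTorch): per step,
# causal_conv1d_update above maintains a rolling window of the last
# conv_kernel inputs per channel and emits one depthwise causal-conv output:
#
#   state = torch.roll(state, shifts=-1, dims=-1)  # state: (dim, K)
#   state[:, -1] = bx                              # append newest Bx column
#   y = (state * weight).sum(dim=-1)               # per-channel conv tap
#   if bias is not None:
#       y = y + bias
#
# The fused kernel performs this in place for every request selected by
# conv_state_indices.
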

class Lfm2MoeDecoderLayer(nn.Module):
    """
    Decoder layer with attention/conv and dense MLP or MoE.

    - Layers 0 to num_dense_layers-1: use Lfm2MoeMLP (dense)
    - Layers num_dense_layers+: use Lfm2MoeSparseMoeBlock (MoE)
    """

    def __init__(
        self,
        config: Lfm2MoeConfig,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.layer_type = config.layer_types[layer_id]
        self.is_attention_layer = self.layer_type == "full_attention"

        self.operator_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)

        if self.is_attention_layer:
            self.self_attn = Lfm2MoeAttention(
                config=config,
                layer_id=layer_id,
                quant_config=quant_config,
                prefix=add_prefix("self_attn", prefix),
            )
        else:
            self.conv = Lfm2MoeShortConv(
                config=config,
                layer_idx=layer_id,
                quant_config=quant_config,
                prefix=add_prefix("conv", prefix),
            )

        if layer_id < config.num_dense_layers:
            self.feed_forward = Lfm2MoeMLP(
                config=config,
                quant_config=quant_config,
                prefix=add_prefix("feed_forward", prefix),
            )
        else:
            self.feed_forward = Lfm2MoeSparseMoeBlock(
                config=config,
                layer_idx=layer_id,
                quant_config=quant_config,
                prefix=add_prefix("feed_forward", prefix),
            )

    def forward(
        self,
        layer_id: int,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        forward_batch: ForwardBatch,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if not forward_batch.forward_mode.is_idle():
            residual = hidden_states
            normed = self.operator_norm(hidden_states)
            if self.is_attention_layer:
                hidden_states = self.self_attn(positions, normed, forward_batch)
            else:
                hidden_states = self.conv(normed, forward_batch)
            hidden_states = hidden_states + residual
            hidden_states = hidden_states + self.feed_forward(
                self.ffn_norm(hidden_states)
            )
        return hidden_states, residual
zLfm2MoeDecoderLayer.forwardr<   )r=   r>   r?   r@   r   rf   r   r   rA   r+   rB   rC   r   r   r;   rD   r3   r3   r1   r4   r   |  s4    -r   c                       sb   e Zd Z		ddedee def fddZ	ddej	d	ej	d
e
deej	 dej	f
ddZ  ZS )Lfm2MoeModelNr!   r"   r#   r$   c                    s   t     | _t j j jtd|d| _tdd  j	D | _
dtdtf fdd}t j|| d	d
| _t j jd| _d S )Nembed_tokens)org_num_embeddingsr$   c                 s   s    | ]	}|d krdV  qdS )r   r   Nr3   ).0ltr3   r3   r4   	<genexpr>  s    z(Lfm2MoeModel.__init__.<locals>.<genexpr>idxr$   c                    s   t  | |dS )Nr   )r   )r   r$   r   r"   r#   r3   r4   	get_layer  s   z(Lfm2MoeModel.__init__.<locals>.get_layerz.layersr$   rs   )r*   r+   r"   r   
vocab_sizer,   r   r   sumr   num_attention_layersrf   rA   r   num_hidden_layerslayersr   r   embedding_norm)r0   r"   r#   r$   r   r1   r   r4   r+     s    

zLfm2MoeModel.__init__	input_idsr   r   inputs_embedsr6   c                 C   sT   |d ur|n|  |}d }tt| jD ]}| j| |||||d\}}q| |S )N)rN   r   rb   r   r   )r   ranger   r   r   )r0   r   r   r   r   rb   r   ir3   r3   r4   r;     s   
zLfm2MoeModel.forwardr<   r7   )r=   r>   r?   r   r   r   rA   r+   rB   rC   r   r;   rD   r3   r3   r1   r4   r     s,    'r   c                       s   e Zd ZdZdZ		ddedee deddf fd	d
Z	de
fddZe 	ddejdejdedeej fddZ	ddeeeejf  dedee fddZ  ZS )Lfm2MoeForCausalLMz&LFM2-MoE for causal language modeling.FNr!   r"   r#   r$   r6   c                    s   t    || _t | _| jjr| jjsJ || _t||t	d|d| _
t|j|j||jt	d|d| _t|| _| j
j| _d S )Nmodelr   lm_head)r#   r   r$   )r*   r+   r"   r   pp_groupis_first_rankis_last_rankr#   r   r   r   r   r   r,   r   r   logits_processorr   r/   r1   r3   r4   r+     s"   

zLfm2MoeForCausalLM.__init__c                 C   s   | j S r7   )r   )r0   r3   r3   r4   get_num_kv_cache_layers$  s   z*Lfm2MoeForCausalLM.get_num_kv_cache_layersr   r   r   r   c                 K   s"   |  ||||}| ||| j|S r7   )r   r   r   )r0   r   r   r   r   r   rb   r3   r3   r4   r;   '  s   	
zLfm2MoeForCausalLM.forwardweightsis_mtpc              	   C   s  g d}t jddd| jjd}t|  }t }d}|D ]\}}	d|v r%qd|v r+|	}d	|v r:|d	d
}|	d}	d|v rD|dd}d|v rRd|vrR|dd}|D ];\}
}}||vr^qTd|v rcqT|||
}|	drt||vrt nn||vrz nh|| }t
|d}|||	| ||  nR|D ],\}
}}}||vrq|||
}||vrq|| }|j}|||	|||d ||  n#|	dr||vrq||vrq|| }t
|dt}|||	 || qd|vrd|v r|dur|d }t
|dt}||| |d |S )z)Load weights with FusedMoE expert format.))rq   q_projr   )rq   k_projr   )rq   v_projr   )r&   w1r   )r&   w3r   r   w2r   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerS   Nzrotary_emb.inv_freqzembed_tokens.weightz.conv.conv.weightz.conv.conv_weightr   z.conv.conv.biasz.conv.conv_biaszfeed_forward.w2rR   zfeed_forward.down_projz.biasr   )shard_id	expert_idzlm_head.weight)r   make_expert_params_mappingr"   rS   dictnamed_parameterssetreplacesqueezeendswithr~   addr   r   )r0   r   r   stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsembed_tokens_weightnameloaded_weight
param_nameweight_namer   paramr   r   r3   r3   r4   load_weights5  s   









zLfm2MoeForCausalLM.load_weightsr<   r7   )F)r=   r>   r?   r@   fall_back_to_pt_during_loadr   r   r   rA   r+   rf   r   rB   no_gradrC   r   r;   r   r   r   r   r  rD   r3   r3   r1   r4   r     sD    r   ):r@   typingr   r   r   r   rB   r   sglang.srt.configs.lfm2_moer   sglang.srt.distributedr   r	   sglang.srt.layers.activationr
   /sglang.srt.layers.attention.mamba.causal_conv1dr   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   r   "sglang.srt.layers.logits_processorr   &sglang.srt.layers.moe.fused_moe_tritonr   sglang.srt.layers.moe.topkr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.utilsr   r   r   Moduler    rE   rg   r   r   r   r   
EntryClassr3   r3   r3   r4   <module>   s:    "U_rO; 
"