o
    پiz~                     @   s  d Z ddlZddlmZmZmZ ddlZddlm  m	Z
 ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE eC ZFeB ZGeHeIZJG dd dejKZLG dd dejKZMG dd de;ZNG dd  d e8ZOG d!d" d"e:ZPG d#d$ d$e9ZQeQgZRdS )%zAInference-only GLM-Lite model compatible with HuggingFace weights    N)IterableOptionalTuple)nn)PretrainedConfig)SboFlags)"get_moe_expert_parallel_world_sizeget_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)LayerCommunicatorLayerScatterModesenable_moe_dense_fully_dp)get_attention_tp_sizeis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearRowParallelLinear)LogitsProcessor)get_moe_a2a_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)get_moe_impl_class)FusedMoE)TopKTopKOutputFormat)QuantizationConfig)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)DeepseekV2AttentionMLADeepseekV2DecoderLayerDeepseekV2ForCausalLMDeepseekV2ModelDeepseekV2MoE)get_global_server_args)BumpAllocator	LazyValue
add_prefixget_device_smis_cudalog_info_on_rank0make_layersc                       sz   e Zd Z					ddedededee ded	ed
ee dee ddf fddZ				ddedede	fddZ
  ZS )Glm4MoeLiteMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixtp_ranktp_sizereturnc	           	   
      sx   t    || _t||gd d|td|||d| _t||d||td|||d| _|dkr6td| d	t	 | _
d S )
N   Fgate_up_proj)biasr2   r4   r5   r6   	down_proj)r:   r2   r3   r4   r5   r6   siluUnsupported activation: !. Only silu is supported for now.)super__init__r6   r   r(   r9   r   r;   
ValueErrorr   act_fn)	selfr/   r0   r1   r2   r3   r4   r5   r6   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/glm4_moe_lite.pyr@   S   s4   
	

zGlm4MoeLiteMLP.__init__Fshould_allreduce_fusionuse_reduce_scattergemm_output_zero_allocatorc           	      C   s   | j dkr|jd dkr|S t| jdst| jd| j_t| jds*t| jd| j_|d urW|jd dkrW| jjjtj	krW|
|jd | jj |jd | jj}|d |f}| |\}}| |}| j||pi|d\}}|S )N   r   weightweight_packed   )skip_all_reduce)r6   shapehasattrr9   getattrrL   r;   dtypetorchuint8allocateoutput_size_per_partitionviewrB   )	rC   xforward_batchrH   rI   rJ   ygate_up_rF   rF   rG   forwardz   s,   



zGlm4MoeLiteMLP.forward)NTr.   NN)NFFN)__name__
__module____qualname__intstrr   r   boolr@   r&   r^   __classcell__rF   rF   rD   rG   r-   R   sH    	
*r-   c                       s<   e Zd Z		ddedef fddZddefd	d
Z  ZS )Glm4MoeLiteGater.   Fr4   is_nextnc                    sH   t    || _tt|j|jf| _	ttj|jtj
d| _d S )N)rS   )r?   r@   rg   r   	ParameterrT   emptyn_routed_expertsr/   rL   float32e_score_correction_bias)rC   configr4   rg   rD   rF   rG   r@      s   

zGlm4MoeLiteGate.__init__NrJ   c                 C   sv   t r1| js1|jd dk r1|jd dkr1| jjd dkr1tdkr1ddlm} ||| j|j}|S t	
|| jd }|S )Nr      rK   i   rN   Z   )dsv3_router_gemm)_is_cudarg   rP   rL   
_device_sm
sgl_kernelrp   torS   Flinear)rC   hidden_statesrJ   rp   logitsrF   rF   rG   r^      s   zGlm4MoeLiteGate.forward)r.   FN)	r_   r`   ra   rc   rd   r@   r&   r^   re   rF   rF   rD   rG   rf      s    rf   c                   @   sD   e Zd Z				ddededee dedeej	j
 d	efd
dZdS )Glm4MoeLiteSparseMoeBlockNr.   Frm   layer_idr2   r4   
alt_streamrg   c           	      C   s  t j|  t | _|j| _|j| _t jrdn|j| _	|| _
|| _|| _|| _| j|jkr;td| j d|j d|jdkrItd|j dt|td||d	| _t||j| j	 t j | j	|j| j	 |j|j| j|| jtd
|d	| _t|j| j	 | j|jd|j| j	|j| jj|| j| jj|d u rt j!nd d| _"d| _#d| _$|jd ur| j	dkr|j|j }t%d|j||j|dtd|dt& ' st& ( st) rt*dddni | _+t,| j+j-j.d}| o| j+j-j/j0t1j2k| _#| o| j+j-j/j0t1j3k| _$|j| _4t& ' st& ( r4t5 | _6|jt j | _7|j| _8|j| _|j| _9| jjd ur1| jjj:nd | _;t& ' p>t& ( | _<t=> | _?d S )Nr   zTensor parallel size z' is greater than the number of experts .r<   r=   r>   gate)rm   r4   rg   experts)	num_expertsnum_fused_shared_expertstop_kr/   r0   r{   r2   routed_scaling_factorr4   T)r   r{   renormalizeuse_grouped_topknum_expert_groupr   
topk_groupcorrection_biasr2   r   %apply_routed_scaling_factor_on_outputoutput_formatFshared_experts)r/   r0   r1   r2   r3   r4   rK   )r5   r6   r2   rF   )@r   Moduler@   r
   r6   r   n_shared_expertsr%   disable_shared_experts_fusionr   rm   r{   r|   rg   rj   rA   r1   rf   r(   r~   r   ep_num_redundant_expertsnum_experts_per_tokr/   moe_intermediate_sizer   r   norm_topk_probn_groupr   rl   )should_fuse_routed_scaling_factor_in_topkr   STANDARDtopkshared_experts_is_int8shared_experts_is_fp8r-   r   	is_deepepis_mooncaker   dictr   rQ   r9   quant_methodrL   rS   rT   int8float8_e4m3fnr   r   ep_sizer   r   r   datar   _enable_a2a_moer   fuse_shared_experts_inside_sbo_fuse_shared_experts_inside_sbo)	rC   rm   r{   r2   r4   r|   rg   r0   is_packed_weightrF   rF   rG   r@      s   	



	

z"Glm4MoeLiteSparseMoeBlock.__init__)Nr.   NF)r_   r`   ra   r   rb   r   r   rc   rT   cudaStreamrd   r@   rF   rF   rF   rG   rz      s$    
rz   c                   @   sH   e Zd Z				ddededee deded	ee	j
j d
dfddZdS )Glm4MoeLiteDecoderLayerNFr.   rm   r{   r2   rg   r4   r|   r7   c                 C   s  t j|  |j| _|| _ddlm} | | _d}d }	t|dd}
|| _	t
||j|j|j|j|j|j|j||	|
|d|td|d| _| j||d	| _| j|d
 dd	}| j|d
 dd	}tj||rcd
n|j| j||d| _| jrt||td|| j	||d| _nt rd\}}nd\}}t|j|j|j|td|||d| _t|j|j d| _!t|j|j d| _"t#| j| j!| j"d|p| j	| jjd
 k| jj$d| _%d S )Nr   is_nsa_enable_prefill_cpi@B max_position_embeddingsi  F	self_attn)rm   r/   	num_headsqk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rank
rope_thetarope_scalingr   r2   r3   r{   r4   )rg   rK   )r{   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsemlp)rm   r2   r4   r{   r|   rg   )r   rK   )NN)r/   r0   r1   r2   r4   r5   r6   epsT)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatteris_last_layerqkv_latent_func)&r   r   r@   r/   rm   %sglang.srt.layers.attention.nsa.utilsr   nsa_enable_prefill_cprR   r{   r    num_attention_headsr   r   r   r   r   r(   r   _is_layer_sparser   r   init_newnum_hidden_layersr   rz   r   r   r-   r0   r1   r   rms_norm_epsr   r   r   prepare_qkv_latentlayer_communicator)rC   rm   r{   r2   rg   r4   r|   r   r   r   r   r   r   mlp_tp_rankmlp_tp_sizerF   rF   rG   r@   F  s   	
	

z Glm4MoeLiteDecoderLayer.__init__)NFr.   N)r_   r`   ra   r   rb   r   r   rd   rc   rT   r   r   r@   rF   rF   rF   rG   r   E  s(    
r   c                   @   s,   e Zd Z		ddedee defddZdS )	Glm4MoeLiteModelNr.   rm   r2   r4   c                    s  t j  j_ j_ j_t _ddl	m
} | _jr&t nd _d_t dd _jjrCt j jt d_nt _trNtj nd _t j fddjjjjt d|d\_!_"_#jj$r|t% j j&d	_'ntd
d_'g _(d S )Nr   r   llama_4_scaling)use_attn_tp_groupc                    s   t  | |jdS )N)rm   r{   r2   r4   r|   )r   r|   )idxr4   rm   r2   rC   rF   rG   <lambda>  s    z+Glm4MoeLiteModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizer4   r   T)return_tuple))r   r   r@   pad_token_id
padding_id
vocab_sizefirst_k_dense_replacer	   pp_groupr   r   r   r   cp_sizegemm_output_zero_allocator_sizerR   llama_4_scaling_configis_first_rankr   r/   r   embed_tokensr   rq   rT   r   r   r|   r,   r   rank_in_group
world_sizer(   r   start_layer	end_layeris_last_rankr   r   normlayers_to_capture)rC   rm   r2   r4   r   rF   r   rG   r@     s:   

zGlm4MoeLiteModel.__init__Nr.   )r_   r`   ra   r   r   r   rc   r@   rF   rF   rF   rG   r     s    r   c                	   @   sf   e Zd Z		ddedee deddfddZ	 dd	efd
dZ			dde	e
eejf  fddZdS )Glm4MoeLiteForCausalLMNr.   rm   r2   r4   r7   c                    s   t j  d|_| _t  _| _t  _	 
d t||td|d _t|j|j|td|t jd _t| _t fdd _d	 _d
dlm} |  _ jrhd
dlm}m} |  _|  _ d S d   _ _ d S )NrK   r   model)r4   lm_head)r2   r4   r   c                      s   dd t  jjD S )Nc                 S   s(   i | ]\}}t |jtr||j qS rF   )
isinstancer   rz   get_moe_weights).0r{   layerrF   rF   rG   
<dictcomp>  s    

zEGlm4MoeLiteForCausalLM.__init__.<locals>.<lambda>.<locals>.<dictcomp>)	enumerater   r   rF   rC   rF   rG   r     s    
z1Glm4MoeLiteForCausalLM.__init__.<locals>.<lambda>Fr   r   )get_attention_tp_rankr   )!r   r   r@   moe_layer_freqrm   r
   r6   r2   r	   r   "determine_num_fused_shared_expertsr   r(   r   r   r   r/   r%   enable_dp_lm_headr   r   logits_processorr'    _routed_experts_weights_of_layercapture_aux_hidden_statesr   r   r   sglang.srt.layers.dp_attentionr   r   cp_rankr   )rC   rm   r2   r4   r   r   r   rF   r   rG   r@     s:   


zGlm4MoeLiteForCausalLM.__init__architecturec                 C   s   d| _ t jr	d S d }tr#tjddk s#| jjd |ks#| jj	dkr&d}nt
 dkr-d}|d urBdt _d| _ tt| d d S | jj	| _ d S )	Nr   r   )   r   rK   zhOnly GLM-4.5 or GLM-4.6 on NV-platform with capability >= 80 can use shared experts fusion optimization.zZGLM-4.5 or GLM-4.6 cannot use shared experts fusion optimization under expert parallelism.Tz0 Shared experts fusion optimization is disabled.)r   r%   r   rq   rT   r   get_device_capabilityrm   architecturesr   r   r+   logger)rC   r   disable_reasonrF   rF   rG   r     s*   
z9Glm4MoeLiteForCausalLM.determine_num_fused_shared_expertsFweightsc               	      s  |r%t jdr!jj}|dksJ djjdkrdnjj}ntdg d}jdkrRjdks5J dtttt	j
f  dtttt	j
f  ffd	d
}||}tjdddjjj d}	t jdokjjd u}
|
rpi nd }|r~d| }g d}nd }g }g }|rddg}|d u rt }g }|D ]\ }|  |s|st jdrψjj}|dkrψ drψ d}t|dkrt|d jjkrqn4|r؈ |sq|d urd v sd v rqd}|D ]}| v r |d d} nq|r |d d v r
q|D ]<\}}}| vrqd v rq ||  d r2 |vr2q |vr9q|  }|j}||||  nd}|	D ]/}|\}}}}| vr\qMd} ||  |vrkqM|  }|j}||| ||d!  n|rq d r |vrq |v rq|
r
d" v sd# v r
|| < d" v r n d#d"}d# v r n d"d#}||v r	||v r	|| }|| }t	j||gdd$}d" v r d"d%n d#d%}||vrq|| }t|d&t}||| || || n*d' v sd( v r4 |vr4t fd)d*d+D r, d,d- ntd.    |vr:q |  v rQ|  }t|d&t}||| qtd/  d0 qj!|d d1 d S )2Nnum_nextn_predict_layersrK   zOnly 1 nextn layer is supportedr   z-num_nextn_predict_layers is not in the config))qkv_projq_projq)r  k_projk)r  v_projv)r9   	gate_projr   )r9   up_projrK   r  r7   c                 3   sp    dd l }|d}| D ])\}}||}|r0t|d}|d}d| d jj d| }||fV  qd S )Nr   z1^model\.layers\.(\d+)\.mlp\.shared_experts\.(.+)$rK   r8   model.layers.z.mlp.experts.r}   )recompilematchrb   grouprm   rj   )r  r  patternnamerL   r  r{   suffixr   rF   rG   &iter_weights_with_fused_shared_expertsC  s   

zSGlm4MoeLiteForCausalLM.load_weights.<locals>.iter_weights_with_fused_shared_expertsr
  r;   r  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   r   r  )zshared_head.normeh_projenormhnormeagle_draft_tokens_mapzeagle_lm_head.weightzmodel.layersr}      r8   zshared_head.headr   Tr   Fzmodel.decoderzrotary_emb.inv_freqzmlp.expertsz.bias)shard_id	expert_idq_a_projkv_a_proj_with_mqa)dimfused_qkv_a_proj_with_mqaweight_loaderk_scalev_scalec                 3   s    | ]}| v V  qd S ry   rF   )r   scale)r  rF   rG   	<genexpr>  s    z6Glm4MoeLiteForCausalLM.load_weights.<locals>.<genexpr>)r$  r%  _projattn_mqaz#Unknown scale found in checkpoint: z
Parameter z not found in params_dict)rg   weight_names)"rQ   rm   r  r   rA   r   r   r   rc   rT   Tensorr   make_expert_params_mappingrj   r   r   named_parametersappend
startswithsplitlenrb   replaceendswithr#  catrR   r   popanyr   warningkeyspost_load_weights) rC   r  rg   params_dictis_eaglenum_nextn_layersnextn_layer_idstacked_params_mappingr  expert_params_mappingfuse_qkv_a_projcached_a_projnextn_layer_prefixnextn_spec_weight_nameseagle_ignore_weight_namesr*  loaded_weight	name_list
is_decoderweight_name
param_namer  paramr#  is_expert_weightmappingr  q_a_proj_namekv_a_proj_nameq_a_proj_weightkv_a_proj_weightfused_weightrF   )r  rC   rG   load_weights#  s>  
	

























	z#Glm4MoeLiteForCausalLM.load_weightsr   )r   )FNF)r_   r`   ra   r   r   r   rc   r@   r   r   r   rT   r+  rR  rF   rF   rF   rG   r     s,    
1
 r   )S__doc__loggingtypingr   r   r   rT   torch.nn.functionalr   
functionalru   transformersr   -sglang.srt.batch_overlap.single_batch_overlapr   sglang.srt.distributedr   r	   r
   sglang.srt.layers.activationr   sglang.srt.layers.communicatorr   r   r   r   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moer   r   "sglang.srt.layers.moe.ep_moe.layerr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.moe.topkr   r   *sglang.srt.layers.quantization.base_configr   sglang.srt.layers.utilsr   *sglang.srt.layers.vocab_parallel_embeddingr   r   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.deepseek_v2r    r!   r"   r#   r$   sglang.srt.server_argsr%   sglang.srt.utilsr&   r'   r(   r)   r*   r+   r,   rq   rr   	getLoggerr_   r   r   r-   rf   rz   r   r   r   
EntryClassrF   rF   rF   rG   <module>   sN   $

N% ]3  
U