o
    پi׽                     @   s  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddlm  mZ ddl
mZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZH ddlImJZJ ddlKmLZLmMZM ddlNmOZO ddlPmQZQmRZR ddlSmTZT dd lUmVZV dd!lWmXZX dd"lYmZZZ dd#l[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe eb Zfea ZgeD Zhe^d$o&efZie] Zje` Zke_ ZlemenZoG d%d& d&ejpZqG d'd( d(ejpZrG d)d* d*ejpZsG d+d, d,ejpZtG d-d. d.ejpZuG d/d0 d0ejpZvG d1d2 d2ejpZwG d3d4 d4eVZxewexgZydS )5zUInference-only GLM-4.5, GLM-4.6 and GLM-4.7 model compatible with HuggingFace weights    N)AnyDictIterableListOptionalTupleUnion)nn)PretrainedConfig)model_forward_maybe_tbo)"get_moe_expert_parallel_world_sizeget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeparallel_state tensor_model_parallel_all_reduce)use_symmetric_memory)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)
SiluAndMul)LayerCommunicatorLayerScatterModesenable_moe_dense_fully_dp)get_attention_tp_rankget_attention_tp_sizeis_allocation_symmetricis_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)get_moe_a2a_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)get_moe_impl_class)FusedMoE)TopK)RoutingMethodType%filter_moe_weight_param_global_expert)QuantizationConfig)is_fp8_fnuz)RadixAttention)get_rope)PPMissingLayer)ParallelLMHeadVocabParallelEmbedding)get_is_capture_mode)ForwardBatchPPProxyTensors)default_weight_loader)DeepseekV2ForCausalLM)apply_qk_norm)get_global_server_args)

add_prefixcpu_has_amx_supportget_bool_env_varget_device_smis_cpuis_cudais_hipis_non_idle_and_non_emptylog_info_on_rank0make_layersSGLANG_USE_AITERc                       st   e Zd Z					ddedededee ded	ed
ee dee ddf fddZ			ddedefddZ	  Z
S )
Glm4MoeMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixtp_ranktp_sizereturnc	           	   
      sx   t    || _t||gd d|td|||d| _t||d||td|||d| _|dkr6td| d	t	 | _
d S )
N   Fgate_up_proj)biasrH   rJ   rK   rL   	down_proj)rP   rH   rI   rJ   rK   rL   siluUnsupported activation: !. Only silu is supported for now.)super__init__rL   r   r8   rO   r!   rQ   
ValueErrorr   act_fn)	selfrE   rF   rG   rH   rI   rJ   rK   rL   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/glm4_moe.pyrV   n   s4   
	

zGlm4MoeMLP.__init__Fshould_allreduce_fusionuse_reduce_scatterc                 C   sN   | j dkr|jd dkr|S | |\}}| |}| j||p |d\}}|S )N   r   )skip_all_reduce)rL   shaperO   rX   rQ   )rY   xforward_batchr^   r_   gate_up_r\   r\   r]   forward   s   

zGlm4MoeMLP.forward)NTrD   NNNFF)__name__
__module____qualname__intstrr   r*   boolrV   rg   __classcell__r\   r\   rZ   r]   rC   m   sB    	
*rC   c                !       s   e Zd Z													d)d
ededededededeeeef  dedee dede	dee
 de	dedeejj ddf  fddZdd Zdd Zd ejd!ejd"efd#d$Zd%d& Zd ejd!ejd"edejfd'd(Z  ZS )*Glm4MoeAttentionr   '        ?N    h㈵>TFrD   rE   	num_headsnum_kv_headslayer_id
rope_thetapartial_rotary_factorrope_scalingmax_position_embeddingshead_dimrms_norm_epsattention_biasrH   use_qk_normrJ   
alt_streamrM   c                    s  t    || _t }t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	|	pJ|| j | _
| j| j
 | _| j	| j
 | _| j
d | _|| _|| _|| _t | _t|| j
| j| j||||td|d	| _t| j| j
 |d|||dtd|d| _t| j
| j
||||d	| _t| j| j
| j| j	|td
|d| _| jrt| j
|
d| _t| j
|
d| _|| _d S )Nr   r`   g      qkv_proj)rP   rH   rK   rL   rJ   Fo_proj)rP   rH   rK   rL   rI   rJ   )
rotary_dimmax_positionry   baserz   attn)rv   rw   rJ   eps) rU   rV   rE   r   r   total_num_headsru   total_num_kv_headsmaxrv   r|   q_sizekv_sizescalingrx   r   r{   r   rK   r    r8   r   r!   r   r-   
rotary_embr,   r   r   q_normk_normr   )rY   rE   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   rH   r   rJ   r   attn_tp_rankattn_tp_sizerZ   r\   r]   rV      sz   


	
zGlm4MoeAttention.__init__c                 C   s    | j |j|d|jd|_d S )N!hidden_states_after_comm_pre_attn	positionshidden_statesrd   )forward_preparer   poprd   attn_intermediate_staterY   stater\   r\   r]   
op_prepare  s
   zGlm4MoeAttention.op_preparec                 C   s   |  |d|_d S )Nr   )forward_corer   hidden_states_after_attnr   r\   r\   r]   op_core
  s   
zGlm4MoeAttention.op_corer   r   rd   c           
      C   s   |j d dkr||d fS | |\}}|j| j| j| jgdd\}}}| jr6t||| j| j| j	| j
d\}}| |||\}}||||f}	d ||	fS )Nr   )dim)qkr   r   r|   r   )rb   r   splitr   r   r   r6   r   r   r|   r   r   )
rY   r   r   rd   qkvrf   r   r   vinner_stater\   r\   r]   r     s    
 

z Glm4MoeAttention.forward_preparec                 C   s2   |\}}}|d u r|S | j | }| |\}}|S N)r   r   )rY   intermediate_stater   rd   r   attn_outputoutputrf   r\   r\   r]   r   &  s   

zGlm4MoeAttention.forward_corec                 C   s   | j |||d}| |S )Nr   )r   r   )rY   r   r   rd   sr\   r\   r]   rg   .  s   
zGlm4MoeAttention.forward)r   rq   rr   Nrs   Nrt   TNFrD   N)ri   rj   rk   rl   floatr   r   rm   r   rn   r*   torchcudaStreamrV   r   r   Tensorr2   r   r   rg   ro   r\   r\   rZ   r]   rp      s    	

[
rp   c                       s.   e Zd Z	ddef fddZdd Z  ZS )Glm4MoeGaterD   rJ   c                    sB   t    tt|j|jf| _ttj|jtj	d| _
d S )N)dtype)rU   rV   r	   	Parameterr   emptyn_routed_expertsrE   weightfloat32e_score_correction_bias)rY   configrJ   rZ   r\   r]   rV   =  s   

zGlm4MoeGate.__init__c                 C   s   t || jd }|S r   )Flinearr   )rY   r   logitsr\   r\   r]   rg   J  s   zGlm4MoeGate.forward)rD   )ri   rj   rk   rm   rV   rg   ro   r\   r\   rZ   r]   r   <  s    r   c                   @   s&  e Zd Z			d,dededee dedeej	j
 f
dd	Zd
d Z			d-dejdee dededejf
ddZ		d.dejdededejfddZ		d.dejdededejfddZdejdedejfddZdejfddZdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ ZdS )/Glm4MoeSparseMoeBlockNrD   r   rw   rH   rJ   r   c                 C   s.  t j|  |j| _t | _t | _|j	| _	|j
| _
t jr dn|j
| _|| _|| _|| _| j|jkr@td| j d|j d|jdkrNtd|j dt|td|d	| _t||j| j | j| j| j | j|j|j|| j	tjtd
|d
| _t| j| j | j|jd|j|j | jj!| j	| jt"| jdddd| _#|j
d ur| jdkr|j|j
 }t$d|j||j|dtd|dt% & st% ' st% ( st) rt*dddni | _+t% & st% ' r	t | _,|jt j- | _.|j| _/|j | _ |j| _0| jj!d ur| jj!j1nd | _2t% & pt% ' | _3d S )Nr   zTensor parallel size z' is greater than the number of experts .rR   rS   rT   gate)r   rJ   experts)
num_expertsnum_fused_shared_expertstop_krw   rE   rF   rH   routed_scaling_factorrouting_method_typerJ   T)should_fuse_routed_scaling_factor_in_topkFr`   )r   rw   renormalizeuse_grouped_topknum_expert_group
topk_groupcorrection_biasr   r   %apply_routed_scaling_factor_on_output#fused_shared_experts_scaling_factorshared_experts)rE   rF   rG   rH   rI   rJ   )rK   rL   r\   )4r	   ModulerV   num_experts_per_tokr   r   rL   r   moe_ep_sizer   n_shared_expertsr7   disable_shared_experts_fusionr   r   rw   r   r   rW   rG   r   r8   r   r%   rE   moe_intermediate_sizer(   
DeepSeekV3r   r'   norm_topk_probn_groupr   r   getattrtopkrC   r#   	is_deepepis_mooncakeis_flashinferr$   dictr   ep_sizeep_num_redundant_expertsr   r   r   datar   _enable_a2a_moe)rY   r   rw   rH   rJ   r   rF   r\   r\   r]   rV   P  s   




	
zGlm4MoeSparseMoeBlock.__init__c                    s    fdd j  D S )Nc                    s.   g | ]\}}|d vrt || jjr|jqS ))r   )r)   r   num_local_expertsr   ).0namerc   rY   r\   r]   
<listcomp>  s    
z9Glm4MoeSparseMoeBlock.get_moe_weights.<locals>.<listcomp>)r   named_parametersr   r\   r   r]   get_moe_weights  s   
z%Glm4MoeSparseMoeBlock.get_moe_weightsFr   rd   r^   r_   rM   c                 C   sZ   t   s'| jd ur | jdkr |jd dkr t r | |||S | |||S | ||S Nr   )	r#   r   r   r   rb   r1   forward_normal_dual_streamforward_normalforward_deepep)rY   r   rd   r^   r_   r\   r\   r]   rg     s   


zGlm4MoeSparseMoeBlock.forwardc           
      C   s  t j }| j| | |}t j| j" | |}| ||}| 	||}t
s2ts2|| j9 }W d    n1 s<w   Y  || j tt t  d t |}	W d    n1 saw   Y  t j|||	d |	}| jdkr|s|st st|}|S )Ndisabledoutr`   )r   r   current_streamr   wait_stream_forward_shared_expertsstreamr   r   r   _is_cuda
_use_aiterr   r   r   get_tp_groupr   
empty_likeaddrL   r$   r   )
rY   r   r^   r_   r   shared_outputrouter_logitstopk_outputfinal_hidden_statesfinal_hidden_states_outr\   r\   r]   r     s8   





z0Glm4MoeSparseMoeBlock.forward_normal_dual_streamc           	      C   s   |j d dkr| |}| |}| ||}n	d }| j|j}| ||}ts0ts0|| j	9 }|d ur]t
t t  d t|}W d    n1 sNw   Y  tj|||d |}| jdkrm|sm|smt smt|}|S )Nr   r   r   r`   )rb   r   r   r   empty_topk_outputdevicer   r   r   r   r   r   r   r   r   r   r   rL   r$   r   )	rY   r   r^   r_   r   r   r   r   r   r\   r\   r]   r     s4   



z$Glm4MoeSparseMoeBlock.forward_normalc                 C   s   d }|j d dkr#| |}| |}| j|||jtj| jdd}n| j|j	}| j
||d}|d urP|}| j
jrD|| |}|S |j|| jd |}|S | j
jsY|| j9 }|S )Nr   rw   )num_token_non_paddedexpert_location_dispatch_info)r   r   alpha)rb   r   r   r   r  r   init_newrw   r  r  r   r   add_r   )rY   r   rd   r   r   r   r   rc   r\   r\   r]   r   %  s:   

	

z$Glm4MoeSparseMoeBlock.forward_deepepc                 C   s&   |j d dkr| jdkr| |S d S r   )rb   r   r   )rY   r   r\   r\   r]   r   I  s   
z-Glm4MoeSparseMoeBlock._forward_shared_expertsc                 C   s,   t |jj|jr| |j|_d S d |_d S r   )r?   rd   forward_modehidden_states_mlp_inputr   r   r   r\   r\   r]   op_gateO  s
   

zGlm4MoeSparseMoeBlock.op_gatec                 C   s   | d}|j}|d ur7t | j | j|||jjtj	| jdd|_
W d    d S 1 s0w   Y  d S | j|j|_
d S )Nr   r  )r   r   r  r  )r   r  r   with_current_layerrw   r   rd   r  r   r  r   r  r  )rY   r   r   r   r\   r\   r]   op_select_expertsX  s    

"z'Glm4MoeSparseMoeBlock.op_select_expertsc                 C   s4   | j dkr| jjj|j|d|dd d S d S )Nr`   r   tbo_subbatch_index)r   r   r  )r   r   
dispatcher
dispatch_ar  r   getr   r\   r\   r]   op_dispatch_ak  s   

z#Glm4MoeSparseMoeBlock.op_dispatch_ac                 C   sZ   | j dkr+t | j | jjj|dd|_W d    d S 1 s$w   Y  d S d S Nr`   r  )r  )	r   r   r  rw   r   r  
dispatch_br  dispatch_outputr   r\   r\   r]   op_dispatch_bs  s   

"z#Glm4MoeSparseMoeBlock.op_dispatch_bc                 C   s   | j j|jd|_d S )N)r  )r   run_moe_corer  combine_inputr   r\   r\   r]   
op_experts|  s   z Glm4MoeSparseMoeBlock.op_expertsc                 C   s:   | j dkr| jjj|d|dd |d d S d S )Nr`   r  r  )r  r  r  )r   r   r  	combine_ar   r  r   r\   r\   r]   op_combine_a  s   
z"Glm4MoeSparseMoeBlock.op_combine_ac                 C   s*   | j dkr| jjj|dd|_d S d S r  )r   r   r  	combine_br  hidden_states_after_combiner   r\   r\   r]   op_combine_b  s
   
z"Glm4MoeSparseMoeBlock.op_combine_bc                 C   sJ   | d}| d }d ur|}|j|| jd |}n|| j9 }||_d S )Nr  r   r  )r   r	  r   hidden_states_mlp_output)rY   r   r   r   rc   r\   r\   r]   	op_output  s   


zGlm4MoeSparseMoeBlock.op_output)NrD   Nrh   )FF)ri   rj   rk   r
   rl   r   r*   rm   r   r   r   rV   r   r   r2   rn   rg   r   r   r   r   r  r  r  r  r  r  r  r!  r\   r\   r\   r]   r   O  s    

k

(
"
$		r   c                   @   s   e Zd Z				ddededee deded	ee	j
j d
dfddZdeded
efddZde	jde	jdedee	j d
e	jf
ddZ	dde	jde	jdedee	j dee f
ddZdd Zdd Zdd ZdS ) Glm4MoeDecoderLayerNFrD   r   rw   rH   is_nextnrJ   r   rM   c                 C   s  t j|  |j| _|| _t|dd}t|dd }tt|dd dd p(t|dd}	t|dd}
t|d	|j|j }|j}|j}|| _	t
|d
rJ|jnd}t| j|j|j||||	|
||||td|||d| _| j||d| _| j|d dd}| j|d dd}tj||rdn|j| j||d| _| jrt||td|| j	|d| _nt rd\}}nd\}}t|j|j|j|td|||d| _t|j|jd| _t|j|jd| _t| j| j| jd|p| j	| jjd kd| _ d S )Nrx   rq   rz   rope_parametersry   rr   r{   rs   r|   r   F	self_attn)rE   ru   rv   rw   rx   rz   ry   r{   r|   r}   r~   rH   rJ   r   r   )r#  r`   )rw   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsemlp)r   rH   rJ   rw   r   )r   r`   NN)rE   rF   rG   rH   rJ   rK   rL   r   T)layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatteris_last_layer)!r	   r   rV   rE   r   r   num_attention_headsr}   r~   rw   hasattrr   rp   num_key_value_headsr8   r%  _is_layer_sparser'  r   r  num_hidden_layersr,  r   r*  r   rC   rF   rG   r   r-  r.  r   layer_communicator)rY   r   rw   rH   r#  rJ   r   rx   rz   ry   r{   r|   r}   r~   r   r(  r)  mlp_tp_rankmlp_tp_sizer\   r\   r]   rV     s   	



zGlm4MoeDecoderLayer.__init__c                 C   s   |p| j jd uo|| j jkS r   )r   r   first_k_dense_replace)rY   rw   r#  r\   r\   r]   r4    s   
z$Glm4MoeDecoderLayer._is_layer_sparser   r   rd   residualc                 C   s   | j |||\}}| j|||d}| j |||\}}| j |}| j |}| ||||}|r9d|_||fS | j |||\}}||fS )Nr   T)	r6  prepare_attnr%  prepare_mlp)should_fuse_mlp_allreduce_with_next_layershould_use_reduce_scatterr*  _sglang_needs_allreduce_fusionpostprocess_layer)rY   r   r   rd   r:  r^   r_   r\   r\   r]   rg     s8   zGlm4MoeDecoderLayer.forwardr  c                 C   s0   | j |||\|_|_|t|||d d S )N)rd   r   r  )r6  r;  r   residual_after_input_lnupdater   )rY   r   r   r   rd   r:  r  r\   r\   r]   op_comm_prepare_attn0  s   

z(Glm4MoeDecoderLayer.op_comm_prepare_attnc                 C   s*   | j |d|d|j\|_|_d S )Nr   rA  )r6  r<  r   rd   r  residual_after_comm_pre_mlpr   r\   r\   r]   op_comm_prepare_mlpD  s   z'Glm4MoeDecoderLayer.op_comm_prepare_mlpc                 C   sB   | d}t r| js|jd dks| ||j|_d S ||_d S )Nr  r   )r   r   r'  rb   r*  rd   r   )rY   r   r   r\   r\   r]   op_mlpM  s   


zGlm4MoeDecoderLayer.op_mlpc                 C   sN   | j |d|d|j\}}t|j|||j|jd}|jh dd |S )Nr   rD  )r   r   r:  rd   r  >   r   rd   r  )expect_keys)r6  r@  r   rd   r   r   r  clear)rY   r   r   r:  r   r\   r\   r]   op_comm_postprocess_layerZ  s    z-Glm4MoeDecoderLayer.op_comm_postprocess_layer)NFrD   Nr   )ri   rj   rk   r
   rl   r   r*   rn   rm   r   r   r   rV   r4  r   r2   rg   rC  rE  rF  rI  r\   r\   r\   r]   r"    s^    

_
5
	r"  c                       s   e Zd Z		ddedee def fddZdej	fd	d
Z
		ddej	dej	dedej	dee deej	ef fddZ  ZS )Glm4MoeModelNrD   r   rH   rJ   c                    s   t    t _ _ j_ j_ j_jj	r(t
 j jt d_nt _tr3tj nd _t j fddjjjjtd|d\___jjratj jd_ntdd_g _d S )	N)use_attn_tp_groupc                    s   t |  |jdS )N)rw   r   rH   rJ   r   )r"  r   )idxrJ   r   rH   rY   r\   r]   <lambda>  s    z'Glm4MoeModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizerJ   r   T)return_tuple) rU   rV   r   pp_groupr   
vocab_sizer9  rE   	embed_dimis_first_rankr0   r   embed_tokensr.   r   r   r   r   r   rA   r5  rank_in_group
world_sizer8   rO  start_layer	end_layeris_last_rankr   r}   normlayers_to_capturerY   r   rH   rJ   rZ   rM  r]   rV   t  s2   


zGlm4MoeModel.__init__rM   c                 C   s   | j S r   )rW  r   r\   r\   r]   get_input_embeddings  s   z!Glm4MoeModel.get_input_embeddings	input_idsr   rd   input_embedspp_proxy_tensorsc              
   C   s  | j jr|d u r| |}n|}d }n|d usJ |d }|d }| j}| j}	|jrA| j|kr8| j|	k r8| j}	n	| j|k rAd }	}g }
t||	D ]2}t 	|" || j
v r]|
||  | j| }|||||\}}W d    n1 suw   Y  qH|	| jkrt| j|	| j d||||| j|	d  jjd\}}| j jst||dS |j s|d u r| |}n| ||\}}t|
dkr|S ||
fS )Nr   r:  r   Tr`   )rO  
enable_tbor   rd   r   r:  input_data_scatter_mode)r   r:  )rS  rV  rW  rZ  r[  can_run_tbor9  ranger   r  r^  appendrO  r   r,  layer_output_moder\  r3   r
  is_idler]  len)rY   ra  r   rd   rb  rc  r   r:  normal_start_layernormal_end_layeraux_hidden_statesilayerrf   r\   r\   r]   rg     sp   








zGlm4MoeModel.forwardNrD   r+  )ri   rj   rk   r
   r   r*   rm   rV   r   r   r`  r2   r3   r   rg   ro   r\   r\   rZ   r]   rJ  s  s4    *rJ  c                   @   s   e Zd Z		d%dedee deddfddZdej	fd	d
Z
dd Ze 		d&dejdejdedejdee dejfddZedd Zedd Zd'deeeejf  fddZdd Zdd Zed d! Zd(d"eee  fd#d$ZdS ))Glm4MoeForCausalLMNrD   r   rH   rJ   rM   c                 C   s   t j|  t | _|| _t | _|| _d| _	| 
  t||td|d| _t|j|j|td|t jd| _t|| _d| _d S )Nr   model)rJ   lm_head)rH   rJ   rK  F)r	   r   rV   r   rS  r   r   rL   rH   r   "determine_num_fused_shared_expertsrJ  r8   rs  r/   rT  rE   r7   enable_dp_lm_headrt  r"   logits_processorcapture_aux_hidden_statesr_  r\   r\   r]   rV     s&   

zGlm4MoeForCausalLM.__init__c                 C      | j jS r   )rs  rW  r   r\   r\   r]   r`    s   z'Glm4MoeForCausalLM.get_input_embeddingsc                 C   s   t  jrd S d }t| jdd sd}n!tsd}ntr$td ur$tdk r$d}nt dkr,d}nt  r3d}|d urEd	t  _t	t
| d
 d S | jj| _| jdksSJ dt	t
d d S )Nr   z,No shared experts are defined in the config.z6Shared experts fusion currently requires CUDA devices.P   z2Shared experts fusion requires SM80 or newer GPUs.r`   zLShared experts fusion is not supported together with expert parallelism yet.zJShared experts fusion is not supported when Deepep MoE backend is enabled.Tz0 Shared experts fusion optimization is disabled.z>Only 1 fused shared expert is supported for Glm4MoeForCausalLMz+Shared experts fusion optimization enabled.)r7   r   r   r   r   
_device_smr   r#   r   r@   loggerr   r   )rY   disable_reasonr\   r\   r]   ru    s2   


z5Glm4MoeForCausalLM.determine_num_fused_shared_expertsra  r   rd   rb  rc  c                 C   sD   |  |||||}d }| jr|\}}| jjr | ||| j||S |S r   )rs  rx  rS  r\  rw  rt  )rY   ra  r   rd   rb  rc  r   rn  r\   r\   r]   rg   )  s   	
zGlm4MoeForCausalLM.forwardc                 C   ry  r   )rs  rZ  r   r\   r\   r]   rZ  @     zGlm4MoeForCausalLM.start_layerc                 C   ry  r   )rs  r[  r   r\   r\   r]   r[  D  r~  zGlm4MoeForCausalLM.end_layerFweightsc              	   C   s  |r%t | jdr!| jj}|dksJ d| jjdkrdn| jj}ntdg d}tjddd	| jj| j d
}|rBd| }g d}t	| 
 }	g }
|D ]\}}|
| | jdkrjd|v rj|dd| jj }|st | jdr| jj}|dkr|dr|d}t|dkrt|d | jjkrqLn,||sqLd|v sd|v rqLd}|D ]}||v r||d}d} nq|r||d}d|v rqL|D ]2\}}}||vrqd|v rq|||}|dr||	vrq||	vrq|	| }|j}||||  njd}|D ]/}|\}}}}||vrq d}|||}||	vrq |	| }|j}||||||d  n6|r4qL|dr@||	vr@qL||	vrFqL||	 v r]|	| }t|dt}||| qLtd| d qLd S ) Nnum_nextn_predict_layersr`   zOnly 1 nextn layer is supportedr   z-num_nextn_predict_layers is not in the config))r   q_projr   )r   k_projr   )r   v_projr   )rO   	gate_projr   )rO   up_projr`   r  rQ   r  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   zmodel.layers.)zshared_head.normeh_projenormhnormzmlp.shared_expertszmlp.experts.zmodel.layersr      rN   zshared_head.headrW  Trs  Fzmodel.decoderzrotary_emb.inv_freqzmlp.expertsz.bias)shard_id	expert_idweight_loaderz
Parameter z not found in params_dict)r2  r   r  r5  rW   r&   make_expert_params_mappingr   r   r   r   rh  replace
startswithr   rk  rl   endswithr  keysr   r4   r|  warning)rY   r  r#  num_nextn_layersnextn_layer_idstacked_params_mappingexpert_params_mappingnextn_layer_prefixnextn_spec_weight_namesparams_dictweight_namesr   loaded_weight	name_list
is_decoderweight_name
param_namer  paramr  is_expert_weightmappingr  r\   r\   r]   load_weightsH  s   	






zGlm4MoeForCausalLM.load_weightsc                 C   s   | j jj| jjfS r   )rs  rW  r   rt  r   r\   r\   r]   get_embed_and_head  s   z%Glm4MoeForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S r   )rs  rW  r   rt  r   r   empty_cachesynchronize)rY   embedheadr\   r\   r]   set_embed_and_head  s   

z%Glm4MoeForCausalLM.set_embed_and_headc                 C   s   t |j|j|jdS )N)r&  num_logical_experts
num_groups)r   r5  r   r   )clsr   r\   r\   r]   $get_model_config_for_expert_location  s
   z7Glm4MoeForCausalLM.get_model_config_for_expert_location	layer_idsc                 C   sX   | j jsd S |d u rd| _| jj}d|d |d g| j_d S d| _dd |D | j_d S )NTrN   r  c                 S   s   g | ]}|d  qS )r`   r\   )r   valr\   r\   r]   r      s    zCGlm4MoeForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)rS  r\  rx  r   r5  rs  r^  )rY   r  r&  r\   r\   r]   set_eagle3_layers_to_capture  s   z/Glm4MoeForCausalLM.set_eagle3_layers_to_capturerq  r+  )Fr   )ri   rj   rk   r
   r   r*   rm   rV   r	   	Embeddingr`  ru  r   no_gradr   r2   r3   rg   propertyrZ  r[  r   r   r  r  r  classmethodr  r   rl   r  r\   r\   r\   r]   rr    sR    


 
rr  c                       s   e Zd Z fddZ  ZS )GlmMoeDsaForCausalLMc                    s   t  d d S )Nr  )rU   ru  r   rZ   r\   r]   ru    s   z7GlmMoeDsaForCausalLM.determine_num_fused_shared_experts)ri   rj   rk   ru  ro   r\   r\   rZ   r]   r    s    r  )z__doc__loggingtypingr   r   r   r   r   r   r   r   torch.nn.functionalr	   
functionalr   transformersr
   *sglang.srt.batch_overlap.two_batch_overlapr   sglang.srt.distributedr   r   r   r   r   r   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.activationr   sglang.srt.layers.communicatorr   r   r   sglang.srt.layers.dp_attentionr   r   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r    r!   "sglang.srt.layers.logits_processorr"   sglang.srt.layers.moer#   r$   "sglang.srt.layers.moe.ep_moe.layerr%   ,sglang.srt.layers.moe.fused_moe_triton.layerr&   sglang.srt.layers.moe.topkr'   sglang.srt.layers.moe.utilsr(   r)   *sglang.srt.layers.quantization.base_configr*   )sglang.srt.layers.quantization.fp8_kernelr+   !sglang.srt.layers.radix_attentionr,   "sglang.srt.layers.rotary_embeddingr-   sglang.srt.layers.utilsr.   *sglang.srt.layers.vocab_parallel_embeddingr/   r0   +sglang.srt.model_executor.cuda_graph_runnerr1   ,sglang.srt.model_executor.forward_batch_infor2   r3   $sglang.srt.model_loader.weight_utilsr4   sglang.srt.models.deepseek_v2r5   sglang.srt.models.utilsr6   sglang.srt.server_argsr7   sglang.srt.utilsr8   r9   r:   r;   r<   r=   r>   r?   r@   rA   _is_hipr   _is_fp8_fnuzr   _is_cpu_amx_available_is_cpur{  	getLoggerri   r|  r   rC   rp   r   r   r"  rJ  rr  r  
EntryClassr\   r\   r\   r]   <module>   sx   $ 0
:   O Xx  