o
    پie                     @   s&  d Z ddlZddlmZmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z: ddl;m<Z<m=Z= ddl>m?Z?m@Z@ ddlAmBZBmCZC ddlDmEZEmFZFmGZG ddlHmIZI ddlJmKZKmLZLmMZMmNZN eOePZQeM ZRG dd dejSZTG d d! d!ejSZUG d"d# d#ejSZVG d$d% d%ejSZWG d&d' d'ejSZXeXZYdS )(zL
SGLang SDARMoeModelLM (block diffusion / dLLM-style forward) with MoE MLP.
    N)IterableOptionalTupleUnion)nn)PretrainedConfig)"get_moe_expert_parallel_world_sizeget_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_sizeis_dp_attention_enabled)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)get_moe_a2a_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)get_moe_impl_class)FusedMoE)TopK)RoutingMethodType%filter_moe_weight_param_global_expert)QuantizationConfig)AttentionTypeRadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loadermaybe_remap_kv_scale_name)apply_qk_normcreate_fused_set_kv_buffer_argenable_fused_set_kv_buffer)get_global_server_args)	LazyValue
add_prefixis_cudamake_layersc                       s   e Zd ZdZ		ddededee def fdd	Z			
	
dde
jdee dedede
jf
ddZ	
	
dde
jdedede
jfddZde
jdefddZdd Z  ZS )SDARMoeSparseMoeBlockz
    Qwen3MoE-style sparse MoE block:
      - gate: ReplicatedLinear(hidden, num_experts)
      - topk routing: TopK
      - experts: get_moe_impl_class(quant_config)(...)
    N layer_idconfigquant_configprefixc              
      s   t    || _t | _| j|jkrtd| j d|j dt|j|j	d|d| _
t||jt j |j||j|j|td|tjd| _t|j|jdd td|d	| _t  rkt | _|jt j | _|j| _d S d S )
NzTensor parallel size z > num_experts .F)top_krenormalizeuse_grouped_topkr6   experts)num_expertsr;   r6   hidden_sizeintermediate_sizer8   r9   routing_method_typegatebiasr8   r9   )super__init__r6   r
   tp_sizer?   
ValueErrorr   num_experts_per_toknorm_topk_probtopkr   r/   ep_num_redundant_expertsr@   moe_intermediate_sizer1   r   Renormalizer>   r   rC   r   	is_deepepr   ep_sizer;   )selfr6   r7   r8   r9   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/sdar_moe.pyrG   O   sL   

	zSDARMoeSparseMoeBlock.__init__Fhidden_statesforward_batchshould_allreduce_fusionuse_reduce_scatterreturnc                 C   s@   t   st   s| j|||dS |d usJ d| ||S )N)rY   rZ   z%deepep/fuseep MoE needs forward_batch)r   rP   is_ascend_fuseepforward_normalforward_deepep)rR   rW   rX   rY   rZ   rU   rU   rV   forward   s   zSDARMoeSparseMoeBlock.forwardc           
      C   sh   |j \}}|d|}| |\}}| ||}| ||}	| jdkr.|s.|s.t s.t|	}	|	||S )N   )shapeviewrC   rL   r>   rH   r   r   )
rR   rW   rY   rZ   
num_tokens
hidden_dimrouter_logits_topk_outputoutrU   rU   rV   r]      s   

z$SDARMoeSparseMoeBlock.forward_normalc                 C   s\   |j d dkr| |\}}| j|||jtj| jdd}n| j|j}| j	||d}|S )Nr   )r6   )num_token_non_paddedexpert_location_dispatch_info)rW   rh   )
rb   rC   rL   rj   r   init_newr6   empty_topk_outputdevicer>   )rR   rW   rX   rf   rg   rh   ri   rU   rU   rV   r^      s   	z$SDARMoeSparseMoeBlock.forward_deepepc                    s    fdd j  D S )Nc                    s.   g | ]\}}|d vrt || jjr|jqS ))correction_bias)r   r>   num_local_expertsdata).0nameprR   rU   rV   
<listcomp>   s    
z9SDARMoeSparseMoeBlock.get_moe_weights.<locals>.<listcomp>)r>   named_parametersru   rU   ru   rV   get_moe_weights   s   
z%SDARMoeSparseMoeBlock.get_moe_weightsNr5   )NFF)FF)__name__
__module____qualname____doc__intr   r   r    strrG   torchTensorr(   boolr_   r]   r^   rx   __classcell__rU   rU   rS   rV   r4   G   sP    6

r4   c                       sl   e Zd Z				ddededee deded	ee	j
j f fd
dZde	jde	jdede	jfddZ  ZS )SDARMoeAttentionNTr5   r7   r6   r8   reduce_resultsr9   
alt_streamc                    s  t    || _|j| _|j| _t }t }| j| dksJ | j| | _|j	| _
| j
|kr8| j
| dks7J n	|| j
 dksAJ td| j
| | _t|d| j| j | _| j| j | _| j| j | _| jd | _t| j| j| j| j
t|dd|td|d| _t| j| j | jt|dd||||td	|d
| _t| j|jd| _t| j|jd| _t|dd}	t|dd }
t|dd}| j| _t| j| j||	|
d| _t| j| j| j| j|tj td|d| _!|| _"d S )Nr   ra   head_dimg      attention_biasFqkv_projrD   o_proj)rE   r8   r   tp_rankrH   r9   )eps
rope_thetag     @rope_scalingmax_position_embeddingsi   )
rotary_dimmax_positionbaser   attn)num_kv_headsr6   	attn_typer9   )#rF   rG   r6   r@   num_attention_headstotal_num_headsr   r   	num_headsnum_key_value_headstotal_num_kv_headsmaxr   getattrr   q_sizekv_sizescaler   r1   r   r   r   r   rms_norm_epsq_normk_normr   r#   
rotary_embr"   r!   ENCODER_ONLYr   r   )rR   r7   r6   r8   r   r9   r   attn_tp_rankattn_tp_sizer   r   max_posrS   rU   rV   rG      sz   
	

	


	zSDARMoeAttention.__init__	positionsrW   rX   r[   c              	   C   s   t  jd ur
| }| |\}}|j| j| j| jgdd\}}}t||| j| j	| j
| jd\}}| j|||t|rBt|| j|dnd d\}}t  jd urZ|tj}|tj}| j||||t| d}	| |	\}
}|
S )Nr`   )dim)qkr   r   r   r   )valuelayerrX   )fused_set_kv_buffer_arg)save_kv_cache)r/   rl_on_policy_targetbfloat16r   splitr   r   r,   r   r   r   r   r   r.   r-   r   tor   r   )rR   r   rW   rX   qkvrg   r   r   vcontextri   rU   rU   rV   r_     sH    

zSDARMoeAttention.forward)NTr5   N)rz   r{   r|   r   r~   r   r    r   r   r   cudaStreamrG   r   r(   r_   r   rU   rU   rS   rV   r      s6    
Pr   c                       sz   e Zd Z			ddededee dedeej	j
 f
 fdd	Zd
ejdejdedeej deejejf f
ddZ  ZS )SDARMoeBlockNr5   r7   r6   r8   r9   r   c                    s   t    || _|j| _|| _t jd urttj	dtj	ddni }t
| jfd|ji|| _t
| jfd|ji|| _t|||dtd||d| _t|||td|d| _tj||jdddd	| _t| j| j| jd||jd
 kd| _d S )NTweight_dtypecast_x_before_out_muloverride_orig_dtypefp32_residualr   F	self_attn)r7   r6   r8   r   r9   r   mlp)r6   r7   r8   r9   )r6   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsera   )layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatteris_last_layer)rF   rG   r7   r@   r6   r/   r   dictr   float32r   r   r   r   r   r1   r   r4   r   r   rl   num_hidden_layersr   r   layer_communicator)rR   r7   r6   r8   r9   r   norm_kwargsrS   rU   rV   rG   M  sn   
	
	zSDARMoeBlock.__init__r   rW   rX   residualr[   c                 C   s   | j |||\}}|jd dkr| j|||d}| j |||\}}| j |}| j |}| j||||d}|rAd|_||fS | j 	|||\}}||fS )Nr   )r   rW   rX   )rX   rY   rZ   T)
r   prepare_attnrb   r   prepare_mlp)should_fuse_mlp_allreduce_with_next_layershould_use_reduce_scatterr   _sglang_needs_allreduce_fusionpostprocess_layer)rR   r   rW   rX   r   rY   rZ   rU   rU   rV   r_     s@   zSDARMoeBlock.forwardNr5   N)rz   r{   r|   r   r~   r   r    r   r   r   r   rG   r   r(   r   r_   r   rU   rU   rS   rV   r   L  s4    
>r   c                       s~   e Zd Z			ddedee dedeejj	 f fddZ
		dd	ejd
ejdedejdee deejef fddZ  ZS )SDARMoeModelNr5   r7   r8   r9   r   c                    s   t    | _j| _j| _t | _| jjr)t	| j| jt
 td|d| _nt | _tj fdd| jj| jjtd|d\| _| _| _| jjrot jd ur^ttjdtjddni }t| jfd	ji|| _d S tdd
| _d S )Nembed_tokensr8   use_attn_tp_groupr9   c                    s   t | | dS )N)r7   r6   r8   r9   r   )r   )idxr9   r   r7   r8   rU   rV   <lambda>  s    z'SDARMoeModel.__init__.<locals>.<lambda>layers)pp_rankpp_sizer9   Tr   r   )return_tuple)rF   rG   r7   
vocab_sizer@   	embed_dimr	   pp_groupis_first_rankr'   r   r1   r   r$   r3   r   rank_in_group
world_sizer   start_layer	end_layeris_last_rankr/   r   r   r   r   r   r   norm)rR   r7   r8   r9   r   r   rS   r   rV   rG     sB   


zSDARMoeModel.__init__	input_idsr   rX   input_embedspp_proxy_tensorsr[   c           
   	   C   s   | j jr|d u r| |n|}d }n|d usJ |d }|dd }t| j| jD ]&}| j| }	t 	| |	||||\}}W d    n1 sJw   Y  q)| j j
s[t||dS |j sh| ||\}}|S )NrW   r   )rW   r   )r   r   r   getranger   r   r   r   with_current_layerr   r)   forward_modeis_idler   )
rR   r   r   rX   r   r   rW   r   ir   rU   rU   rV   r_     s,   


zSDARMoeModel.forwardr   NN)rz   r{   r|   r   r   r    r   r   r   r   rG   r   r(   r)   r   r_   r   rU   rU   rS   rV   r     s8    
:r   c                       s   e Zd ZdZ		ddedee def fddZe	d	d
 Z
e	dd Ze 		ddejdejdedeej dee dejfddZdeeeejf  fddZedd Z  ZS )SDARMoeForCausalLMFNr5   r7   r8   r9   c                    s   t    t | _| jjdksJ d| jj dt | _|| _|| _tr*tj	
 nd }t||tdd|d| _| jjrgt }| jjdkrUt|ddrU|dkrU| jj| _nt|j|j|t jtd	|d
| _nt | _t|dd| _d S )Nra   z?SDARMoeForCausalLM does not support pipeline parallel (pp_size=z). Please set pp_size=1.modelr5   )r8   r9   r   tie_word_embeddingsFlm_headr   T)return_full_logits)rF   rG   r	   r   r   r7   r8   _is_cudar   r   r   r   r1   r   r   r
   r   r   r   r&   r   r@   r/   enable_dp_lm_headr$   r   logits_processor)rR   r7   r8   r9   r   rH   rS   rU   rV   rG     s>   


zSDARMoeForCausalLM.__init__c                 C      | j jS N)r   r   ru   rU   rU   rV   r   F     zSDARMoeForCausalLM.start_layerc                 C   r   r   )r   r   ru   rU   rU   rV   r   J  r   zSDARMoeForCausalLM.end_layerr   r   rX   r   r   r[   c                 C   s2   | j |||||d}| jjr| ||| j|S |S )N)r   r   rX   r   r   )r   r   r   r   r   )rR   r   r   rX   r   r   rW   rU   rU   rV   r_   N  s   	
zSDARMoeForCausalLM.forwardweightsc              	      s~  g d}t jddd jjd}t dst   _ j}|D ]
\}}|ds>|ds9|d	s9|d
r>t	|d}|dkr` j
jr`t jddr`d|v r`|d }t|dt}||| t|}	|	d ur{t jdr{|	 jjk sz|	 jjkr{q d|v sd|v rq d|v sd|v rq d|v rt||}|d u rq |D ]5\}
}}||vrqd|v rq|||
}|dr||vrq||vrq|| }t|dt}||||  nXd}|D ].}|\}
}}}||vrqd}|||
}||vrq|| }t|dt}||||||d  n%|r
q |dr||vrq ||vrq || }t|dt}||| q t ds=t fdd _d S d S )N))r   q_projr   )r   k_projr   )r   v_projr   )gate_up_proj	gate_projr   )r  up_projra   r  	down_projr  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer?   _cached_params_dictzmodel.zlayers.zembed_tokens.znorm.r   zmodel.embed_tokens.weightr   Fzlm_head.weightweight_loaderr   zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedr   zmlp.expertsz.biasT)shard_id	expert_idrouted_experts_weights_of_layerc                      s    fddt  j jD S )Nc                    s4   i | ]}t  jj| jtr| jj| j qS rU   )
isinstancer   r   r   r4   rx   )rr   lidru   rU   rV   
<dictcomp>  s    zESDARMoeForCausalLM.load_weights.<locals>.<lambda>.<locals>.<dictcomp>)r   r   r   rU   ru   rU   rV   r     s   
 z1SDARMoeForCausalLM.load_weights.<locals>.<lambda>)r   make_expert_params_mappingr7   r?   hasattrr   rw   r  
startswithr1   r   r   r   r*   r%   r   r   r   r+   replaceendswithr0   r  )rR   r  stacked_params_mappingexpert_params_mappingparams_dictrs   loaded_weightparamr  r6   
param_nameweight_namer  name2is_expert_weightmappingr  rU   ru   rV   load_weightsd  s   










zSDARMoeForCausalLM.load_weightsc                 C   s   t |j|jd dS )N)r   num_logical_experts
num_groups)r   r   r?   )clsr7   rU   rU   rV   $get_model_config_for_expert_location  s
   z7SDARMoeForCausalLM.get_model_config_for_expert_locationry   r   )rz   r{   r|   fall_back_to_pt_during_loadr   r   r    r   rG   propertyr   r   r   no_gradr   r(   r)   r_   r   r   r$  classmethodr(  r   rU   rU   rS   rV   r     sD    .

}r   )Zr}   loggingtypingr   r   r   r   r   r   transformersr   sglang.srt.distributedr   r	   r
   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moer   r   "sglang.srt.layers.moe.ep_moe.layerr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.moe.topkr   sglang.srt.layers.moe.utilsr   r   *sglang.srt.layers.quantization.base_configr    !sglang.srt.layers.radix_attentionr!   r"   "sglang.srt.layers.rotary_embeddingr#   sglang.srt.layers.utilsr$   r%   *sglang.srt.layers.vocab_parallel_embeddingr&   r'   ,sglang.srt.model_executor.forward_batch_infor(   r)   $sglang.srt.model_loader.weight_utilsr*   r+   sglang.srt.models.utilsr,   r-   r.   sglang.srt.server_argsr/   sglang.srt.utilsr0   r1   r2   r3   	getLoggerrz   loggerr   Moduler4   r   r   r   r   
EntryClassrU   rU   rU   rV   <module>   sR   
  pY V