o
    پiv                     @   s  d Z ddlZddlZddlmZmZmZmZmZm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z? ddl@mAZAmBZB ddlCmDZD ddlEmFZG ddlEmHZH ddlImJZJmKZKmLZL ddlMmNZN ddlOmPZPmQZQmRZRmSZSmTZTmUZU eR ZVeVrdd lWmXZX e
d!ed"ZYdZZeS Z[e\e]Z^eR ZVeU Z_e_rdd#l`maZa d$ed%ebececececf fd&d'ZdG d(d) d)ejeZfG d*d+ d+ejeZgG d,d- d-ejeZhG d.d/ d/eHZiG d0d1 d1ejeZjejZkdS )2zBInference-only Qwen3MoE model compatible with HuggingFace weights.    N)AnyDictIterableListOptionalTupleTypeVar)nn)PretrainedConfig)"get_moe_expert_parallel_world_sizeget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)'get_global_expert_distribution_recorder)ModelConfigForExpertLocation)ExpertLocationDispatchInfo)LayerCommunicatorLayerScatterModes)get_attention_tp_rankget_attention_tp_size)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)get_moe_a2a_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)get_moe_impl_class)FusedMoE)TopK)RoutingMethodType%filter_moe_weight_param_global_expert)QuantizationConfig)RadixAttention)MRotaryEmbeddingget_rope)get_layer_id)ParallelLMHead)ForwardBatchPPProxyTensors)default_weight_loader)Qwen2MoeMLP)Qwen2MoeModel)apply_qk_normcreate_fused_set_kv_buffer_argenable_fused_set_kv_buffer)get_global_server_args)	LazyValue
add_prefixis_cudais_flashinfer_availableis_non_idle_and_non_emptyis_npu)fused_qk_norm_ropeTConfig)bound)split_qkv_rmsnorm_ropeconfigreturnc                    s8  t | dd}|du rdS | j}t| dr| jnd}t | d| j| j }t|| }t |dd}|d}|d	}|d
}	d|v rL|d }
| j|
 }n| j}
ddd}|du rm|ri|	rit	||||||	 }n||}|dpsd}|dpzd}dd   fdd}|dd}||||||
|\}}||||fS )a  
    Refer to https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L197C1-L288C1
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
    Returns:
        factor: float, the scaling factor for the RoPE embeddings
        low: float, the lower bound of the dimension range
        high: float, the upper bound of the dimension range
        attention_factor: float, the post-processing scaling factor applied to the computed cos/sin
    rope_scalingN)      ?r   r   r?   partial_rotary_factorr?   head_dimfactorattention_factormscalemscale_all_dim original_max_position_embeddings   c                 S   s"   | dkrdS d| t |  d S )NrG   r?   g?)mathlog)scalerD    rK   O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen3_moe.py
get_mscale   s   z+compute_yarn_parameters.<locals>.get_mscale	beta_fast    	beta_slowc                 S   s*   |t || d t j   dt |  S )zPInverse dimension formula to find the dimension based on the number of rotations   )rH   rI   pi)num_rotationsdimbasemax_position_embeddingsrK   rK   rL   find_correction_dim   s   z4compute_yarn_parameters.<locals>.find_correction_dimc                    sL    | |||} ||||}|rt |}t |}t|dt||d fS )z.Find dimension range bounds based on rotationsr   rG   )rH   floorceilmaxmin)low_rothigh_rotrT   rU   rV   truncatelowhighrW   rK   rL   find_correction_range   s   

z6compute_yarn_parameters.<locals>.find_correction_ranger^   TrG   )
getattr
rope_thetahasattrr@   hidden_sizenum_attention_headsintgetrV   float)r<   r>   rU   r@   rA   rT   rB   rC   rD   rE   rF   rM   rN   rP   rb   r^   r_   r`   rK   ra   rL   compute_yarn_parameterse   sL   



rl   c                       s   e Zd Z		d'dededee def fddZ					d(d
e	j
dee dedede	j
f
ddZdd Z				d)d
e	j
dedede	j
fddZd
e	j
dede	j
fddZdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Z  ZS )*Qwen3MoeSparseMoeBlockN layer_idr<   quant_configprefixc              
      s   t    t | _|| _| j|jkrtd| j d|j dt|j|j	d|d| _
t||jt j |j||j|j|td|tjd| _t|j|jdd td|d	| _t  rkt | _|jt j | _|j| _d S d S )
NzTensor parallel size z' is greater than the number of experts .F)top_krenormalizeuse_grouped_topkro   experts)num_expertsrs   ro   rg   intermediate_sizerp   rq   routing_method_typegate)biasrp   rq   )super__init__r   tp_sizero   rw   
ValueErrorr    num_experts_per_toknorm_topk_probtopkr   r1   ep_num_redundant_expertsrg   moe_intermediate_sizer3   r!   Renormalizerv   r   rz   r   	is_deepepr   ep_sizers   )selfro   r<   rp   rq   	__class__rK   rL   r}      sP   


zQwen3MoeSparseMoeBlock.__init__Fhidden_statesforward_batchshould_allreduce_fusionuse_reduce_scatterr=   c                 C   s.   t   st   s| |||S | ||S N)r   r   is_ascend_fuseepforward_normalforward_deepep)r   r   r   r   r   rK   rK   rL   forward	  s   	zQwen3MoeSparseMoeBlock.forwardc                    s    fdd j  D S )Nc                    s.   g | ]\}}|d vrt || jjr|jqS ))correction_bias)r"   rv   num_local_expertsdata).0namexr   rK   rL   
<listcomp>  s    
z:Qwen3MoeSparseMoeBlock.get_moe_weights.<locals>.<listcomp>)rv   named_parametersr   rK   r   rL   get_moe_weights  s   
z&Qwen3MoeSparseMoeBlock.get_moe_weightsc           
      C   sh   |j \}}|d|}| |\}}| ||}| ||}	| jdkr.|s.|s.t s.t|	}	|	||S )NrG   )shapeviewrz   r   rv   r~   r   r   )
r   r   r   r   
num_tokens
hidden_dimrouter_logits_topk_outputfinal_hidden_statesrK   rK   rL   r   %  s   

z%Qwen3MoeSparseMoeBlock.forward_normalc                 C   s\   |j d dkr| |\}}| j|||jtj| jdd}n| j|j}| j	||d}|S )Nr   ro   )num_token_non_paddedexpert_location_dispatch_info)r   r   )
r   rz   r   r   r   init_newro   empty_topk_outputdevicerv   )r   r   r   r   r   r   r   rK   rK   rL   r   <  s    	z%Qwen3MoeSparseMoeBlock.forward_deepepc                 C   s0   t |jj|jr| |j\|_}d S d |_d S r   )r6   r   forward_modehidden_states_mlp_inputrz   r   )r   stater   rK   rK   rL   op_gateR  s
   

zQwen3MoeSparseMoeBlock.op_gatec                 C   s   | d}|j}|d ur7t | j | j|||jjtj	| jdd|_
W d    d S 1 s0w   Y  d S | j|j|_
d S )Nr   r   )r   r   r   r   )popr   r   with_current_layerro   r   r   r   r   r   r   r   r   )r   r   r   r   rK   rK   rL   op_select_experts[  s    

"z(Qwen3MoeSparseMoeBlock.op_select_expertsc                 C   s8   | j dkr| jjj|d|d|dd d S d S )NrG   r   r   tbo_subbatch_index)r   r   r   )r   rv   
dispatcher
dispatch_ar   rj   r   r   rK   rK   rL   op_dispatch_am  s   

z$Qwen3MoeSparseMoeBlock.op_dispatch_ac                 C   sZ   | j dkr+t | j | jjj|dd|_W d    d S 1 s$w   Y  d S d S NrG   r   )r   )	r   r   r   ro   rv   r   
dispatch_brj   dispatch_outputr   rK   rK   rL   op_dispatch_bu  s   

"z$Qwen3MoeSparseMoeBlock.op_dispatch_bc                 C   s   | j j|jd|_d S )N)r   )rv   run_moe_corer   combine_inputr   rK   rK   rL   
op_experts~  s   z!Qwen3MoeSparseMoeBlock.op_expertsc                 C   s:   | j dkr| jjj|d|dd |d d S d S )NrG   r   r   )r   r   r   )r   rv   r   	combine_ar   rj   r   rK   rK   rL   op_combine_a  s   
z#Qwen3MoeSparseMoeBlock.op_combine_ac                 C   s*   | j dkr| jjj|dd|_d S d S r   )r   rv   r   	combine_brj   hidden_states_after_combiner   rK   rK   rL   op_combine_b  s
   
z#Qwen3MoeSparseMoeBlock.op_combine_bc                 C   s   | d|_d S )Nr   )r   hidden_states_mlp_outputr   rK   rK   rL   	op_output  s   z Qwen3MoeSparseMoeBlock.op_outputNrn   )NFF)FF)__name__
__module____qualname__ri   Qwen3MoeConfigr   r#   strr}   torchTensorr)   boolr   r   r   r   r   r   r   r   r   r   r   r   __classcell__rK   rK   r   rL   rm      sj    6


		rm   c                !       s2  e Zd Z												d-ded	ed
edededeeeef  dedee dede	dee
 dee dedeeeef  deejj ddf  fddZdd Zdd Zdejdejd efd!d"Zdejdejd efd#d$Zd%d& Zdejdejd efd'd(Zd)d* Zdejdejd edejfd+d,Z  ZS ).Qwen3MoeAttentionr   '  N    ư>Frn   rg   	num_headsnum_kv_headsro   re   r>   rV   rA   rms_norm_epsattention_biasr<   rp   rq   dual_chunk_attention_config
alt_streamr=   c                    s  t    || _t }t }|| _|| _| j| dksJ | j| | _|| _| j|kr5| j| dks4J n	|| j dks>J t	d| j| | _
|pM|| j | _| j| j | _| j
| j | _| jd | _|| _|| _t | _t|| j| j| j|
|||td|d	| _t| j| j ||
|||dtd|d| _t| j| j||||d	| _t| jtrdnd
| _t| jt o| jdv | _t jo| j| _ d| _!t"| j| j| j| j
|td|d| _#t$| j|	d| _%t$| j|	d| _&|| _'d S )Nr   rG   g      qkv_proj)r{   rp   tp_rankr~   rq   Fo_proj)r{   rp   r   r~   reduce_resultsrq   )
rotary_dimmax_positionrU   r>   r   T)@         attn)r   ro   rq   eps)(r|   r}   rg   r   r   r<   total_num_headsr   total_num_kv_headsrZ   r   rA   q_sizekv_sizescalingre   rV   r   r   r   r3   r   r   r   r&   
rotary_emb
isinstancer%   compatible_with_fused_kv_buffer"compatible_with_fused_qk_norm_roper1   enable_fused_qk_norm_ropeuse_fused_qk_norm_rope"_used_fused_qk_norm_rope_last_callr$   r   r   q_normk_normr   )r   rg   r   r   ro   re   r>   rV   rA   r   r   r<   rp   rq   r   r   attn_tp_rankattn_tp_sizer   rK   rL   r}     s   


		
zQwen3MoeAttention.__init__c                 C   s    | j |j|d|jd|_d S )N!hidden_states_after_comm_pre_attn	positionsr   r   )forward_preparer   r   r   attn_intermediate_stater   rK   rK   rL   
op_prepare  s
   zQwen3MoeAttention.op_preparec                 C   s   |  |d|_d S )Nr   )forward_corer   hidden_states_after_attnr   rK   rK   rL   op_core  s   
zQwen3MoeAttention.op_corer   r   r   c           
      C   s   |  |\}}| jj|jjkr| j| t|| jj| jj	| j
| j| j| jj| jj| jjt| jdd t| jdd d\}}}||||f}	d ||	fS )Nr{   )r   q_weightk_weightq_biask_bias)r   r   ro   token_to_kv_poolstart_layerr   get_cos_sin_with_positionr;   position_sinposition_cosr   r   rA   r   variance_epsilonweightr   rd   
r   r   r   r   qkvr   qkvinner_staterK   rK   rL   forward_prepare_npu  s$   
z%Qwen3MoeAttention.forward_prepare_npuc           
      C   s8   |  |\}}| |||\}}}||||f}	d ||	fS r   )r   apply_qk_norm_roper  rK   rK   rL   forward_prepare_native"  s   
z(Qwen3MoeAttention.forward_prepare_nativec                 C   s:  | j o|jtjk}|r[t| jdd}|djtj|j	d
 }t| j\}}}}	t|| j| j| j| j| jj| jj| jj|| jj|||||	 |j| j| j| jgdd\}
}}d| _n=|j| j| j| jgdd\}
}}t|
|| j| j| j| jd\}
}| j||
|t|r| jrt|| j|dnd d	\}
}d
| _|
||fS )Nre   g     @r   )dtyper   )rT   T)r  r  r   r   rA   r   )valuelayerr   )fused_set_kv_buffer_argF) r   r  r   bfloat16rd   r<   r   toint32r   
contiguousrl   r8   r   r   rA   r   r	  r
  r   r   is_neox_stylesplitr   r   r   r.   r   r0   r   r/   r   )r   r  r   r   	use_fusedthetarB   r_   r`   rC   r  r  r  rK   rK   rL   r  /  sd     


z$Qwen3MoeAttention.apply_qk_norm_ropec                 C   sF   |j d dkr||d fS tr|j r| j|||dS | j|||dS )Nr   r   )r   _is_npur   	is_extendr  r  )r   r   r   r   rK   rK   rL   r   g  s   
z!Qwen3MoeAttention.forward_preparec                 C   sb   |\}}}|d u r|S |\}}}}| j }	|	pt|o| j }
| j|||||
d}| |\}}|S )N)save_kv_cache)r   r0   r   r   r   )r   intermediate_stater   r   r  r  r  r  fbmust_save_kvr"  attn_outputoutputr   rK   rK   rL   r   |  s$   
zQwen3MoeAttention.forward_corec                 C   s   | j |||d}| |S )Nr   )r   r   )r   r   r   r   srK   rK   rL   r     s   
zQwen3MoeAttention.forward)r   r   Nr   Nr   FNNrn   NN)r   r   r   ri   rk   r   r   r   r   r   r9   r#   dictr   cudaStreamr}   r   r   r   r)   r  r  r  r   r   r   r   rK   rK   r   rL   r     s    	

f

8
r   c                       s   e Zd Z			ddededee dedeej	j
 ddf fd	d
Z	ddejdejdedeej deeej  deejejf fddZ	ddejdejdedeej dee f
ddZdd Zdd Zdd Z  ZS )Qwen3MoeDecoderLayerNrn   r<   ro   rp   rq   r   r=   c                    sl  t    || _|j| _t|dd}t|dd }t|dd}t|d|j|j }	|j}
|j}t|dd }t| j|j|j	|||||	|
|||t
d|||d	| _|| _t | _t | _d
| _d
}d
}tj||j| j||d| _| jr~t| j||t
d|d| _nt|j|j|j|t
d|d| _t|j|jd| _t|j|jd| _t| j| j| jd
| j| jjd kd| _d S )Nre   r   r>   rV   r   rA   r   	self_attn)rg   r   r   ro   re   r>   rV   rA   r   r   r<   rp   rq   r   r   T)ro   
num_layersis_layer_sparseis_previous_layer_sparseis_next_layer_sparsemlp)ro   r<   rp   rq   )rg   rx   
hidden_actrp   rq   r   rG   )layer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatteris_last_layer) r|   r}   r<   rg   rd   rh   r   r   r   num_key_value_headsr3   r-  ro   r   r   r   r   r/  r   r   num_hidden_layersr4  rm   r2  Qwen3MoeMLPrx   r3  r   r5  r6  r   layer_communicator)r   r<   ro   rp   rq   r   re   r>   rV   rA   r   r   r   r0  r1  r   rK   rL   r}     s   

zQwen3MoeDecoderLayer.__init__r   r   r   residualcaptured_last_layer_outputsc           	      K   s   | j j|||fd|i|\}}|jd dkr| j|||d}| j |||\}}| j |}| j |}| ||||}|rFd|_||fS | j 	|||\}}||fS )Nr>  r   r   T)
r<  +prepare_attn_and_capture_last_layer_outputsr   r-  prepare_mlp)should_fuse_mlp_allreduce_with_next_layershould_use_reduce_scatterr2  _sglang_needs_allreduce_fusionpostprocess_layer)	r   r   r   r   r=  r>  kwargsr   r   rK   rK   rL   r     sH   
zQwen3MoeDecoderLayer.forwardr   c                 C   s0   | j |||\|_|_|t|||d d S )N)r   r   r   )r<  prepare_attnr   residual_after_input_lnupdater)  )r   r   r   r   r   r=  r   rK   rK   rL   op_comm_prepare_attn.  s   

z)Qwen3MoeDecoderLayer.op_comm_prepare_attnc                 C   s*   | j |d|d|j\|_|_d S )Nr   rG  )r<  r@  r   r   r   residual_after_comm_pre_mlpr   rK   rK   rL   op_comm_prepare_mlpB  s   z(Qwen3MoeDecoderLayer.op_comm_prepare_mlpc                 C   s   | d}| ||j|_d S )Nr   )r   r2  r   r   )r   r   r   rK   rK   rL   op_mlpK  s   
zQwen3MoeDecoderLayer.op_mlpc                 C   sN   | j |d|d|j\}}t|j|||j|jd}|jh dd |S )Nr   rJ  )r   r   r=  r   r   >   r   r   r   )expect_keys)r<  rD  r   r   r)  r   r   clear)r   r   r   r=  r'  rK   rK   rL   op_comm_postprocess_layerO  s    z.Qwen3MoeDecoderLayer.op_comm_postprocess_layer)Nrn   Nr   )r   r   r   r   ri   r   r#   r   r   r*  r+  r}   r   r)   r   r   r   rI  rK  rL  rO  r   rK   rK   r   rL   r,    s\    
\
>
	r,  c                	       s:   e Zd Zddefdedee deddf fddZ  Z	S )	Qwen3MoeModelNrn   r<   rp   rq   r=   c                    s,   t rtj nd }t j|||||d d S )N)r<   rp   rq   decoder_layer_typer   )_is_cudar   r*  r+  r|   r}   )r   r<   rp   rq   rQ  r   r   rK   rL   r}   i  s   
zQwen3MoeModel.__init__)
r   r   r   r,  r   r   r#   r   r}   r   rK   rK   r   rL   rP  h  s    rP  c                       s2  e Zd ZdZg dddgdZ		d(ded	ee d
eddf fddZ	de
jfddZe 		d)dejdejdedejdee dejfddZe 	d*dejdejdedeeef dejf
ddZedd Zedd Zdd Zd*d eee  fd!d"Zd#eeeejf  fd$d%Zed&d' Z  Z S )+Qwen3MoeForCausalLMF)q_projk_projv_proj	gate_projup_proj)r   gate_up_projNrn   r<   rp   rq   r=   c                    sj   t    t | _|| _|| _t||td|d| _t	|j
|j|td|t jd| _t|| _d| _d S )Nmodel)rq   lm_head)rp   rq   use_attn_tp_groupF)r|   r}   r   pp_groupr<   rp   rP  r3   rZ  r(   
vocab_sizerg   r1   enable_dp_lm_headr[  r   logits_processorcapture_aux_hidden_states)r   r<   rp   rq   r   rK   rL   r}     s    


zQwen3MoeForCausalLM.__init__c                 C      | j jS r   )rZ  embed_tokensr   rK   rK   rL   get_input_embeddings  s   z(Qwen3MoeForCausalLM.get_input_embeddings	input_idsr   r   input_embedspp_proxy_tensorsc                 C   sF   | j |||||d}d }| jr|\}}| jjr!| ||| j||S |S )N)rg  )rZ  ra  r]  is_last_rankr`  r[  )r   re  r   r   rf  rg  r   aux_hidden_statesrK   rK   rL   r     s   	zQwen3MoeForCausalLM.forwardsplit_intervalc              	   C   s   |\}}|dkr|d u r| j ||_n||_t||D ]+}t | | j j| }	|	||j||j\|_|_W d    n1 sBw   Y  q|| j jj	kri| j 
|j|j\}
}|
|_| ||j| j|}|S d }|S )Nr   )rZ  rc  r   ranger   r   layersr=  r<   r:  normr`  r[  )r   re  r   r   rj  rf  startendir  r   r   resultrK   rK   rL   forward_split_prefill  s6   		z)Qwen3MoeForCausalLM.forward_split_prefillc                 C   rb  r   )rZ  r  r   rK   rK   rL   r       zQwen3MoeForCausalLM.start_layerc                 C   rb  r   )rZ  	end_layerr   rK   rK   rL   rt    rs  zQwen3MoeForCausalLM.end_layerc                 C   s   | j jj| jjfS r   )rZ  rc  r
  r[  r   rK   rK   rL   get_embed_and_head  s   z&Qwen3MoeForCausalLM.get_embed_and_head	layer_idsc                 C   sZ   | j jsd S d| _|d u r | jj}| jd|d |d g d S | jdd |D  d S )NTrQ      c                 S   s   g | ]}|d  qS rc   rK   )r   valrK   rK   rL   r      s    zDQwen3MoeForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r]  rh  ra  r<   r:  rZ  set_eagle3_layers_to_capture)r   rv  r.  rK   rK   rL   ry    s   z0Qwen3MoeForCausalLM.set_eagle3_layers_to_captureweightsc              	      s  g d}t jddd jjd}t dst   _ j}|D ]\}}t|}|d ur?t j	dr?| j	j
k s>| j	jkr?q d|v rDq |D ]2\}}	}
|	|vrPqFd	|v rUqF||	|}|d
re||vreqF||vrjqF|| }|j}||||
  nad}|D ]+}|\}}	}}
|	|vrq}d}||	|}||vrq}|| }|j}|||||
|d  n1|rq |d
r||vrq ||vrq || v r|| }t|dt}||| q td| d q t dst fdd _d S d S )N))r   rT  r  )r   rU  r  )r   rV  r  )rY  rW  r   )rY  rX  rG   rW  	down_projrX  )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerw   _cached_params_dictr  zrotary_emb.inv_freqzmlp.expertsz.biasFT)shard_id	expert_idweight_loaderz
Parameter z not found in params_dictrouted_experts_weights_of_layerc                      s    fddt  j jD S )Nc                    s4   i | ]}t  jj| jtr| jj| j qS rK   )r   rZ  rl  r2  rm   r   )r   ro   r   rK   rL   
<dictcomp>m  s    zFQwen3MoeForCausalLM.load_weights.<locals>.<lambda>.<locals>.<dictcomp>)rk  r  rt  rK   r   rK   rL   <lambda>m  s   
 z2Qwen3MoeForCausalLM.load_weights.<locals>.<lambda>)r   make_expert_params_mappingr<   rw   rf   r)  r   r  r'   rZ  r  rt  replaceendswithr  keysrd   r+   loggerwarningr2   r  )r   rz  stacked_params_mappingexpert_params_mappingparams_dictr   loaded_weightro   
param_nameweight_namer  paramr  is_expert_weightmappingr  rK   r   rL   load_weights  s   	




z Qwen3MoeForCausalLM.load_weightsc                 C   s   t |j|jd dS )N)r.  num_logical_experts
num_groups)r   r:  rw   )clsr<   rK   rK   rL   $get_model_config_for_expert_locationv  s
   z8Qwen3MoeForCausalLM.get_model_config_for_expert_locationr   )NNr   )!r   r   r   fall_back_to_pt_during_loadpacked_modules_mappingr   r   r#   r   r}   r	   	Embeddingrd  r   no_gradr   r)   r*   r   r   ri   rr  propertyr  rt  ru  r   ry  r   r  classmethodr  r   rK   rK   r   rL   rS  z  sn    
*

trS  )l__doc__loggingrH   typingr   r   r   r   r   r   r   r   r	   transformersr
   sglang.srt.distributedr   r   r   r   r   #sglang.srt.eplb.expert_distributionr   sglang.srt.eplb.expert_locationr   (sglang.srt.eplb.expert_location_dispatchr   sglang.srt.layers.communicatorr   r   sglang.srt.layers.dp_attentionr   r   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moer   r   "sglang.srt.layers.moe.ep_moe.layerr   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.moe.topkr    sglang.srt.layers.moe.utilsr!   r"   *sglang.srt.layers.quantization.base_configr#   !sglang.srt.layers.radix_attentionr$   "sglang.srt.layers.rotary_embeddingr%   r&   sglang.srt.layers.utilsr'   *sglang.srt.layers.vocab_parallel_embeddingr(   ,sglang.srt.model_executor.forward_batch_infor)   r*   $sglang.srt.model_loader.weight_utilsr+   sglang.srt.models.qwen2_moer,   r;  r-   sglang.srt.models.utilsr.   r/   r0   sglang.srt.server_argsr1   sglang.srt.utilsr2   r3   r4   r5   r6   r7   rR  
sgl_kernelr8   r9   r   _is_flashinfer_available	getLoggerr   r  r   *sgl_kernel_npu.norm.split_qkv_rmsnorm_roper;   tuplerk   rl   Modulerm   r   r,  rP  rS  
EntryClassrK   rK   rK   rL   <module>   sz   $ 	

p A   I  