o
    پi)                  
   @  s  d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
mZmZmZmZmZmZ d dlZzd dlmZmZmZmZ W n	 eyI   Y nw d dlmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 d dl4m5Z5 e
rd dl6m7Z7 e8e9Z:e0 Z;e1 Z<e/ Z=e, Z>e3 Z?e2 Z@e3 Z?e-doe<ZAe;rd dlBmCZC zd dlDmEZE W n ey   dZEY nw zd dlBmFZF W n ey ZG zW Y dZG[GndZG[Gww e;se<se?rd dlBmHZH zd dlBmIZI W n
 ey   Y nw eAr5zd dlJmKZL W n ey4   edw eG dd dZMG dd dZNG dd  d eZOeG d!d" d"eZPG d#d$ d$eZQG d%d& d&eZRG d'd( d(eZSG d)d* d*e*ZT		+dodpd6d7ZU				+dqdrd<d=ZVd>d? ZW				+dqdsd@dAZXejYdBe. e@dC			 				DdtdudMdNZZ			 				DdtdudOdPZ[ejYdBe. e@dC				DdvdwdQdRZ\ejYdBe. e@dC			 				DdtdxdSdTZ]dUdV Z^	dydzdXdYZ_ejYdBe. dZd[d\ Z`			 				Ddtdxd]d^Za			B	 				Dd{d|d`daZbe=r$e>r$ebZKe[ZceVZdeVZXneaZKeZZceUZdddddbd}dgdhZee;rOejfgdi	 	 	Dd~djdkZhe5dldmdn ZidS dS )    )annotationsN)	dataclass)IntEnumauto)TYPE_CHECKINGCallable
NamedTupleOptionalProtocol	TypeGuardruntime_checkable)
GatherIndxRoutingDataScatterIndxrouting)get_tp_group)use_symmetric_memory)expert_location_dispatch)'get_global_expert_distribution_recorder)ExpertLocationDispatchInfotopk_ids_logical_to_physical)is_allocation_symmetric)get_moe_runner_backend)get_global_experts_capturer)MultiPlatformOp)cpu_has_amx_supportget_bool_env_varget_compiler_backendis_cpuis_cudais_hipis_npuis_xpu)register_fake_if_exists)QuantizationConfigSGLANG_USE_AITER)moe_fused_gate)fused_topk_deepseek)kimi_k2_moe_fused_gate)topk_softmax)topk_sigmoid)biased_grouped_topkz6aiter is required when SGLANG_USE_AITER is set to Truec                   @  s   e Zd ZU ded< dZded< dZded< dZded	< d
Zded< dZded< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dS )
TopKConfiginttop_kFbooluse_grouped_topkNOptional[int]
topk_groupnum_expert_groupTrenormalizer   num_fused_shared_expertsOptional[Callable]custom_routing_functionOptional[torch.Tensor]correction_biastorch_nativeOptional[float]routed_scaling_factor%apply_routed_scaling_factor_on_output#fused_shared_experts_scaling_factorOptional[TopKOutputFormat]output_formatsoftmaxstrscoring_func)__name__
__module____qualname____annotations__r0   r2   r3   r4   r5   r7   r9   r:   r<   r=   r>   r@   rC    rH   rH   N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/topk.pyr,   k   s   
 r,   c                   @  s6   e Zd ZedddZeddd	ZedddZdS )TopKOutputCheckertopk_output
TopKOutputreturnTypeGuard[StandardTopKOutput]c                 C  
   t | tS N)
isinstanceStandardTopKOutputrK   rH   rH   rI   format_is_standard      
z$TopKOutputChecker.format_is_standard!TypeGuard[TritonKernelTopKOutput]c                 C  rO   rP   )rQ   TritonKernelTopKOutputrS   rH   rH   rI   format_is_triton_kernels   s   
z*TopKOutputChecker.format_is_triton_kernelsTypeGuard[BypassedTopKOutput]c                 C  rO   rP   )rQ   BypassedTopKOutputrS   rH   rH   rI   format_is_bypassed   rU   z$TopKOutputChecker.format_is_bypassedN)rK   rL   rM   rN   )rK   rL   rM   rV   )rK   rL   rM   rY   )rD   rE   rF   staticmethodrT   rX   r[   rH   rH   rH   rI   rJ      s    rJ   c                   @  s   e Zd Ze Ze Ze ZdS )TopKOutputFormatN)rD   rE   rF   r   STANDARDTRITON_KERNELBYPASSEDrH   rH   rH   rI   r]      s    
r]   c                   @  s   e Zd ZdZedddZdS )rL   z0Protocol for top-k outputs in different formats.rM   r]   c                 C  s   dS )zThe format of the output.NrH   selfrH   rH   rI   format   s   zTopKOutput.formatNrM   r]   )rD   rE   rF   __doc__propertyrc   rH   rH   rH   rI   rL      s    rL   c                   @  s8   e Zd ZU dZded< ded< ded< eddd	Zd
S )rR   zStandard top-k output format.torch.Tensortopk_weightstopk_idsrouter_logitsrM   r]   c                 C     t jS rP   )r]   r^   ra   rH   rH   rI   rc         zStandardTopKOutput.formatNrd   rD   rE   rF   re   rG   rf   rc   rH   rH   rH   rI   rR         
 rR   c                   @  s8   e Zd ZU dZded< ded< ded< edd
dZdS )rW   z"Triton kernel top-k output format.r   routing_datar   gather_indxr   scatter_indxrM   r]   c                 C  rk   rP   )r]   r_   ra   rH   rH   rI   rc      rl   zTritonKernelTopKOutput.formatNrd   rm   rH   rH   rH   rI   rW      rn   rW   c                   @  sP   e Zd ZU dZded< ded< ded< dZded	< dZd
ed< edddZdS )rZ   zBypassed top-k output format.rg   hidden_statesrj   r,   topk_configNr8   num_token_non_padded$Optional[ExpertLocationDispatchInfo]expert_location_dispatch_inforM   r]   c                 C  rk   rP   )r]   r`   ra   rH   rH   rI   rc      rl   zBypassedTopKOutput.formatrd   )	rD   rE   rF   re   rG   rt   rv   rf   rc   rH   rH   rH   rI   rZ      s   
 rZ   c                      s   e Zd ZdZdddddddddddddddd8 fd!d"Zddd#d9d,d-Zddd#d9d.d/Zddd#d9d0d1Zddd#d9d2d3Zd:d6d7Z	  Z
S );TopKa  
    Parameters:
    --top_k: The all number of top experts selected per token, including the fused shared expert(s).
    --num_fused_shared_experts: num of shared experts, can be activate both in TP or EP mode.
    --routed_scaling_factor: the scaling factor for routed experts in topk_weights.
    --fused_shared_experts_scaling_factor: scaling factor for fused shared experts on AMD-platform.
    NFTr   rA   )layer_idr0   r2   r3   r4   r5   r7   rC   r9   quant_configr<   r=   r@   r>   r.   r-   rx   r1   r0   r/   r2   r3   r4   r5   r7   r6   rC   rB   r9   r8   ry   Optional[QuantizationConfig]r<   r;   r=   Optional[bool]r@   r?   r>   c                  sP   t    |r|d ur|d usJ || _t||||||||
|||||	d| _d S )N)r.   r0   r4   r2   r3   r5   r7   r9   r<   r=   r>   r@   rC   )super__init__rx   r,   rs   )rb   r.   rx   r0   r2   r3   r4   r5   r7   rC   r9   ry   r<   r=   r@   r>   	__class__rH   rI   r}      s&   
zTopK.__init__)rt   rv   rr   rg   rj   rt   rv   ru   rM   rL   c                C  s    d| j _t|| j|| j ||dS )NTrr   rx   rj   rs   rt   rv   )rs   r:   select_expertsrx   rb   rr   rj   rt   rv   rH   rH   rI   forward_native   s   zTopK.forward_nativec          
   	   C  s   | j jd ur| j j}nt  rtj}nt  st  r"tj}ntj	}|tjkr?t
|| j j| j j d\}}}t|||S |tjkrNt||| j ||dS d| j _tt t  d t|| j|| j ||d}	W d    |	S 1 ssw   Y  |	S )N)sm_first)rr   rj   rs   rt   rv   Fdisabledr   )rs   r@   r   is_triton_kernelsr]   r_   is_flashinfer_trtllmis_flashinfer_mxfp4r`   r^   r   r.   r4   rW   rZ   r:   r   r   r   r   rx   )
rb   rr   rj   rt   rv   r@   ro   
gather_idxscatter_idxrK   rH   rH   rI   forward_cuda  sR   





zTopK.forward_cudac                C  s   t || j|| j||dS )Nr   )r   rx   rs   r   rH   rH   rI   forward_cpuC  s   zTopK.forward_cpuc                C  s$   ddl m} |||| j||| jdS )Nr   )fused_topk_npu)rr   rj   rs   rt   rv   rx   )(sglang.srt.hardware_backend.npu.moe.topkr   rs   rx   )rb   rr   rj   rt   rv   r   rH   rH   rI   forward_npuT  s   	zTopK.forward_npudevicetorch.devicec                 C  s   | j j| j j }tt t  d tjd|ftj|d}tj	d|fdtj
|d}W d    n1 s3w   Y  tjd|ftj|d}t|||S )Nr   r   dtyper   )rs   r.   r5   r   r   r   torchemptyfloat32fullint32rR   )rb   r   topkrh   ri   rj   rH   rH   rI   empty_topk_outputh  s   
zTopK.empty_topk_output)r.   r-   rx   r1   r0   r/   r2   r1   r3   r1   r4   r/   r5   r-   r7   r6   rC   rB   r9   r8   ry   rz   r<   r;   r=   r{   r@   r?   r>   r;   )
rr   rg   rj   rg   rt   r8   rv   ru   rM   rL   )r   r   rM   rL   )rD   rE   rF   re   r}   r   r   r   r   r   __classcell__rH   rH   r~   rI   rw      s<    08rw   rA   rr   rg   gating_outputr   r-   r4   r/   r9   rC   rB   c                   s  d fdd}|d ur1|j d }||}|d||d }	tj|	|ddd	d
 }
|d
|
}nA| j d |j d ksGJ d| j d|j | j \}}tj||tj| jd}tj||tj	| jd}
||
 }tj||dd\}}
|r}||jddd }||
fS )Nr   rg   rM   c                   s2    dkr
| j ddS  dkr|  S td  )NrA   r   dimsigmoidInvalid scoring function: )rA   r   
ValueError)r   rC   rH   rI   scoring_func_impl  s
   z2fused_topk_torch_native.<locals>.scoring_func_implr   r   Fkr   sorted   z/Number of tokens mismatch, hidden_states.shape=z vs gating_output.shape=r   r   Tr   keepdim)r   rg   rM   rg   )shapeview	unsqueezer   r   gatherr   r   r   r   floatsum)rr   r   r   r4   r9   rC   r   n_routed_expertsscoresscores_for_choiceri   rh   M_rH   r   rI   fused_topk_torch_nativew  s0   

r   rt   r8   rv   ru   c           
      C  s6   t jjj| |||d\}}	t|	|}	t|	| ||	fS )Nrr   r   r   r4   )r   ops
sgl_kerneltopk_softmax_cpur   _mask_topk_ids_padded_region)
rr   r   r   r4   rt   rv   r9   rC   rh   ri   rH   rH   rI   fused_topk_cpu  s   



r   c                 C  s4   | s||fS || |j }tj|tjd}||fS )Nr   )tor   r   	ones_liker   )
need_applyrh   inputsrH   rH   rI   apply_topk_weights_cpu  s   r   c                 C  s   | j d |j d ksJ d| j \}}	tj||tj| jd}
tj||tj| jd}|dkr5t|
||| n|dkrBt|
|||| ntd| t	||}t
|| |
|fS )Nr   Number of tokens mismatchr   rA   r   r   )r   r   r   r   r   r   r)   r*   r   r   r   )rr   r   r   r4   r9   rt   rv   rC   r   r   rh   ri   rH   rH   rI   
fused_topk  s2   



r   T)dynamicbackenddisableFr3   r1   r2   r5   r<   r;   r=   r{   c                 C  s  | j d |j d ksJ dtj|dd}|j d }|j d }|||djddj}tj||dddd }t|}|d|d |	d
|||j d | |d}||  d}tj||d|dkrid	ndd\}}|rtj||| |df|j|jd
|d d df< |d ur|d d d df jdd| |d d df< |r|dkr|jdd	dn|d d d df jdd	d}|| }|
r||9 }|tj|tj}}t||	}t|| ||fS )Nr   r   r   r   r   Fr   g        Tlowhighsizer   r   r   )r   r   rA   r   maxvaluesr   
zeros_likescatter_r   expandreshapemasked_fillr/   randintr   r   r   r   r   r   r   r   r   )rr   r   r   r4   r3   r2   r5   r<   rt   rv   r=   r   	num_tokennum_expertsgroup_scores	group_idx
group_mask
score_mask
tmp_scoresrh   ri   topk_weights_sumrH   rH   rI   grouped_topk_gpu  sZ   






r   c                 C  s2   |
rJ |	d u s
J t jj| ||||||||	S rP   )r   r   r   grouped_topk_cpurr   r   r   r4   r3   r2   r5   r<   rt   rv   r=   rH   rH   rI   r   1  s   r   c	                 C  s   | j d |j d ksJ d| }	|	j d }
|	|
d|d }tj||ddd\}}|	d|}|rF|jddd}|| }|rF||9 }|tj	|tj
}}t||}t|| ||fS )	z
    Optimized version for num_expert_group=1 case (e.g., Kimi K2 with 384 experts).
    Simplifies the grouped topk logic by removing unnecessary group masking operations.
    Note: This function assumes num_fused_shared_experts=0.
    r   r   r   Fr   r   Tr   )r   r   r   r   r   r   r   r   r   r   r   r   r   )rr   r   r9   r   r4   r<   rt   rv   r=   r   r   r   r   ri   rh   r   rH   rH   rI   kimi_k2_biased_topk_implM  s   


r   c                 C  s  | j d |j d ksJ d| }|j d }|j d }||d|d }|||djdddd jdd}tj||dddd }t|}|d|d |d	|||j d | 
|d}||  td	}tj||d|dkryd
ndd\}}|d|}|rtj||| |df|j|jd|d d df< |d ur|d d d df jdd| |d d df< |r|dkr|jdd
dn|d d d df jdd
d}|| }|r||9 }|tj|tj}}t||
}t||	 ||fS )Nr   r   r   r      r   Fr   z-infTr   r   )r   r   r   r   r   r   r   r   r   r   r   r   r/   r   r   r   r   r   r   r   r   r   r   r   )rr   r   r9   r   r4   r3   r2   r5   r<   rt   rv   r=   r   r   r   r   r   r   r   r   r   r   ri   rh   r   rH   rH   rI   biased_grouped_topk_implw  sn   






r   c                 C  s   | dko
t |  S )Nr   )mathlog2
is_integer)nrH   rH   rI   is_power_of_two  s   r   ri   c                 C  s<   |d u rd S t jd| jd | jd}d| ||kd d f< d S )Nr   )r   r   )r   aranger   r   )ri   rt   indicesrH   rH   rI   r     s   r   )r   r   c                 C  s   t | |} t| | | S rP   )r   r   )ri   rv   rt   rH   rH   rI    _biased_grouped_topk_postprocess  s   

r   c                 C  s  |j d }|j d }|r|| n|}trtd ur|dkrt|r|dkr||kr|| |kr|dkr=|dkr|| dkrn|dkrtj||ftj|jd}tj||ftj|jd}d}|d urc|rc|}t|j	tjd	|||||||d
	 |
d us||	d urt
||
|	}||fS tr|dkrt|rt|j	tjd	||||||d ur|nd|\}}|
d us|	d urt
||
|	}||fS tr|rJ d|j d }|j}| j d |j d ksJ d| j d  d|j d  tj||ftj|d}tj||ftj|d}t||j	|jd	||||||d ur|nd ||fS |j d }tr7|dkr7|dkr7t|j	tjd	|||||dS t| |||||||||	|
|dS )Nr   r             i  r         ?r   TNot implementedz4Number of tokens mismatch: hidden_states.shape[0] = z, gating_output.shape[0] = )r   r4   r<   r=   )r5   r<   rt   rv   r=   )r   _is_cudar'   r   r   r   r   r   r   r   r   r&   
_use_aiteraiter_biased_grouped_topkr   r(   r   )rr   r   r9   r   r4   r3   r2   r5   r<   rt   rv   r=   
num_tokensr   experts_per_grouprh   ri   scaling_factortokenr   rH   rH   rI   biased_grouped_topk_gpu  s   




	r   compiledc                 C  s8   |d u sJ |rJ dt jj| ||||||||	|

S )Nr   )r   r   r   biased_grouped_topk_cpu)rr   r   r9   r   r4   r3   r2   r   r5   r<   rt   rv   r=   rH   rH   rI   r   j  s   r   )rx   rt   rv   rj   rs   rx   rM   c                C  s0  |j }|j}|j}|j}	|j}
|j}|j}|j}|j}|j	}|j
}|j}|j}tj|||d\}}|| }|rt|d us=J |	d usCJ |d u r]t| |trN|n||
|	||||||d\}}nt| ||tre|n||
|	||||||d\}}nj|r|d u r|d u sJ d|d u sJ |rJ dt| |tr|n||
||d\}}n?|d u r|rJ dt| |tr|n||
||||d\}}n"|d u sJ d|d u sJ |rJ d|| |tr|n||
d	\}}|d
krtr|j\}}|d u rdn|}d
dlm} ||||||\}}t j|d t j||d t|||S )N)rj   r9   infor   )rr   r   r9   r   r4   r3   r2   r5   r<   rt   rv   r=   z>num_token_non_padded is not yet supported in fused_topk_nativer   )rr   r   r   r4   r9   rC   )rr   r   r   r4   r9   rt   rv   rC   zDnum_token_non_padded is not yet supported in custom_routing_functionr   r   r   )fused_append_shared_experts)ri   )rx   ri   )r.   r0   r2   r3   r4   r5   r7   r9   r:   r<   r=   r>   rC   r   transform_select_experts_inputsgrouped_topkr   r+   fused_topk_nativer   r   ?sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_kernelsr   r   on_select_expertsr   capturerR   )rr   rj   rs   rx   rt   rv   r.   r0   r2   r3   r4   r5   r7   r9   r:   r<   r=   r>   rC   num_routed_topkrh   ri   r   Nscale_factorr   rH   rH   rI   r     s   










r   zsgl_kernel::moe_fused_gatec                 C  sB   | j d }tj||ftj| jd}	tj||ftj| jd}
|	|
fS )Nr   r   )r   r   r   r   r   r   )input_tensorbiasr3   r2   r   r5   r<   r=   num_rowsrh   ri   rH   rH   rI   _moe_fused_gate#  s   
r  z"sgl_kernel::kimi_k2_moe_fused_gatec           	      C  s6   | j d }| j||tjd}| j||tjd}||fS )Nr   r   )r   	new_emptyr   r   r   )	r  r  r   r4   r<   r=   r  rh   ri   rH   rH   rI   _kimi_k2_moe_fused_gate7  s   
	r
  )NrA   )rr   rg   r   rg   r   r-   r4   r/   r9   rg   rC   rB   )NNNrA   )rr   rg   r   rg   r   r-   r4   r/   rt   r8   rv   ru   r9   rg   rC   rB   )rr   rg   r   rg   r   r-   r4   r/   r9   r8   rt   r8   rv   ru   rC   rB   )NNr   NNNF)rr   rg   r   rg   r   r-   r4   r/   r3   r1   r2   r1   r5   r-   r<   r;   rt   r8   rv   ru   r=   r{   )NNNF)rr   rg   r   rg   r9   rg   r   r-   r4   r/   r<   r;   rt   r8   rv   ru   r=   r{   )rr   rg   r   rg   r9   rg   r   r-   r4   r/   r3   r1   r2   r1   r5   r-   r<   r;   rt   r8   rv   ru   r=   r{   rP   )ri   rg   rt   r8   )NNTr   NNNF)rr   rg   r   rg   r9   rg   r   r-   r4   r/   r3   r1   r2   r1   r   r/   r5   r-   r<   r;   rt   r8   rv   ru   r=   r{   )rr   rg   rj   rg   rs   r,   rx   r1   rt   r8   rv   ru   rM   rR   )r   r   F)j
__future__r   loggingr   dataclassesr   enumr   r   typingr   r   r   r	   r
   r   r   r   triton_kernels.routingr   r   r   r   ImportErrorsglang.srt.distributedr   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   sglang.srt.eplbr   #sglang.srt.eplb.expert_distributionr   (sglang.srt.eplb.expert_location_dispatchr   r   sglang.srt.layers.dp_attentionr   sglang.srt.layers.moer   -sglang.srt.layers.moe.routed_experts_capturerr   sglang.srt.layers.utilsr   sglang.srt.utilsr   r   r   r   r   r    r!   r"   sglang.srt.utils.patch_torchr#   sglang.srt.layers.quantizationr$   	getLoggerrD   loggerr   _is_hip_is_cpu_is_cpu_amx_available_is_xpu_is_npur   r   r&   flashinfer.fused_moer'   r(   er)   r*   aiterr+   r   r,   rJ   r]   rL   rR   rW   rZ   rw   r   r   r   r   compiler   r   r   r   r   r   r   r   r   r   r   r   libraryregister_faker  r
  rH   rH   rH   rI   <module>   sD  $
(

	 3.+H)K

  
