o
    پi=                     @  s>  d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZmZ d dlmZ d dlmZmZmZmZmZ erZd dlm Z m!Z! e Z"e Z#e Z$e Z%e&e'e(d	d
Z)e Z*e&e'e(dd
rdnd Z+e#se"rd dl,m-Z-m.Z. e"rdZ/e)rzd dl0m1Z1 W n1 e2y   e2dw z
d dl3m4Z5 dZ/W n e2y   dZ/Y nw ne%re$rn
e*rd dl,m6Z6m.Z. e#se"se*rd dl,m7Z8 eG dd deZ9eG dd deZ:eG dd deZ;G dd deZ<eddd0d&d'Z=ed(dd1d+d,Z>edd(d2d.d/Z?dS )3    )annotationsN)	dataclass)TYPE_CHECKINGListOptional)MoeQuantInfoMoeRunnerConfigMoeRunnerCoreRunnerInputRunnerOutputregister_fused_funcregister_post_permuteregister_pre_permute)MoeRunnerBackend)cpu_has_amx_supportis_cpuis_cudais_hipis_xpu)StandardCombineInputStandardDispatchOutputSGLANG_USE_AITER0SGLANG_MOE_PADDING   )gelu_and_mulsilu_and_mulF)moe_sumz6aiter is required when SGLANG_USE_AITER is set to True)_custom_opsT)moe_sum_reducer   )moe_align_block_sizec                   @  sL   e Zd ZU ded< ded< ded< ded< ded< ded< edd
dZdS )TritonRunnerInputtorch.Tensorhidden_statestopk_weightstopk_idssorted_token_ids
expert_idsnum_tokens_post_paddedreturnr   c                 C     t jS Nr   TRITONself r0   [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/moe_runner/triton.pyrunner_backendR      z TritonRunnerInput.runner_backendNr)   r   __name__
__module____qualname____annotations__propertyr2   r0   r0   r0   r1   r!   H   s   
 r!   c                   @  s$   e Zd ZU ded< edddZdS )	TritonRunnerOutputr"   r#   r)   r   c                 C  r*   r+   r,   r.   r0   r0   r1   r2   \   r3   z!TritonRunnerOutput.runner_backendNr4   r5   r0   r0   r0   r1   r;   W   s   
 r;   c                   @  s   e Zd ZU ded< ded< dZded< dZded< dZd	ed
< dZd	ed< dZd	ed< dZ	d	ed< dZ
d	ed< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dS )TritonMoeQuantInfor"   
w13_weight	w2_weightNzOptional[torch.Tensor]b13b2Fbooluse_fp8_w8a8use_int8_w8a8use_int8_w8a16use_int4_w4a16per_channel_quant	w13_scalew2_scalew13_zpw2_zp	a13_scalea2_scalezOptional[List[int]]block_shape)r6   r7   r8   r9   r?   r@   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r0   r0   r0   r1   r<   a   s"   
 r<   c                      s6   e Zd Zd fddZdddZedddZ  ZS )TritonRunnerCoreconfigr   c                   s   t  | d S r+   )super__init__)r/   rO   	__class__r0   r1   rQ   w   s   zTritonRunnerCore.__init__runner_inputr!   
quant_infor<   running_statedictr)   r;   c           /      C  s  ddl m}m}m}m}m} |j}	|j}
|j}|j	}|j
}|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}| jj}| jj} | jj}!| jj }"| jj!}#| jj"}$| jj#}%| jj$stJ d|	j%d }&|j%\}'}(})|	j&t'j(krt)j(nt)j*}*t'j+|&|j%d |(f|	j,|	j&d}+||	|||+||||
|||||%|j%d |d |*||||||d t'j+|&|j%d  |(d f|	j,|	j&d},|d	kr|"d ur|#d usJ ||+-d
|(|"|#},nc|#d ur||+-d
|(|#},nUt.st/st0rt1|+-d
|(|, nDt21|,|+-d
|( n9|dkrC|"d u sJ d|#d u s(J dt.s.t/r8t3|+-d
|(|, nt23|,|+-d
|( nt4d|t'j+|&|j%d |j%d f|	j,|	j&d}-| ry|!reJ t'j+|&|j%d |j%d f|	j,|	j&d}.n|!r|	}.nt'5|	}.||,||| s|j%d dkr|-n|.6d||||
|||||% d|d |*||||||d |$d u rd}$| rnt.r|j%d dkr|$dkrn|j%d dkr|$dkrt'j7|-d d df |-d d df |.dj8dd n\|&dkr||-j-|-j% |.|$ nL||-j-|-j% |.|$ nAt/r7t9rt:|-j-|-j% |. n1t;r,t2:|-j-|-j% |. n#||-j-|-j% |.|$ nt0rEt<|-j-|-j% |.|$ n
t2:|-j-|-j% |. t=|.dS )Nr   )_swiglu_gpt_oss_sigmoid_alpha_swiglu_silu_clamp_mulinvoke_fused_moe_kernelmoe_sum_reduce_torch_compilemoe_sum_reduce_tritonz/Only gated MoEs are supported for Triton runner   )devicedtyperO   )compute_typerB   rC   rD   rE   rF   rM      silugeluz%gemm1_alpha is not supported for geluz%gemm1_limit is not supported for geluz#Unsupported activation: activation=g      ?)out)dim    r#   )>0sglang.srt.layers.moe.fused_moe_triton.fused_moerX   rY   rZ   r[   r\   r#   r$   r%   r&   r'   r(   r=   r>   r?   r@   rK   rL   rG   rH   rI   rJ   rM   rF   rB   rC   rD   rE   rO   
activation
no_combineinplacegemm1_alphagemm1_clamp_limitrouted_scaling_factorapply_router_weight_on_inputis_gatedshaper_   torchbfloat16tlfloat16emptyr^   view_is_cuda_is_hip_is_xpur   vllm_opsr   
ValueError
empty_like	unsqueezeaddsqueeze
_use_aiterr   	_has_vllmr   r;   )/r/   rT   rU   rV   rX   rY   rZ   r[   r\   r#   r$   r%   r&   r'   r(   w13w2r?   r@   rK   rL   rG   rH   rI   rJ   rM   rF   rB   rC   rD   rE   rj   rk   rl   rm   gemm1_limitro   rp   MEN_r`   intermediate_cache1intermediate_cache2intermediate_cache3out_hidden_statesr0   r0   r1   runz   s~  













zTritonRunnerCore.runr   c                 C  r*   r+   r,   r.   r0   r0   r1   r2   `  r3   zTritonRunnerCore.runner_backend)rO   r   )rT   r!   rU   r<   rV   rW   r)   r;   r4   )r6   r7   r8   rQ   r   r:   r2   __classcell__r0   r0   rR   r1   rN   u   s    
 grN   nonetritondispatch_outputr   rU   runner_configr   r)   r   c                 C  s   ddl m} ddlm} |di d| jd|jd|jd| jd|d	|jd
|j	d|j
d|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|j}||dS )Nr   )fused_expertsr   r#   w1r   topk_outputmoe_runner_configb1r@   rB   rC   rD   rE   rF   w1_scalerH   w1_zprJ   a1_scalerL   rM   rh   r0   )ri   r   /sglang.srt.layers.moe.token_dispatcher.standardr   r#   r=   r>   r   r?   r@   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   )r   rU   r   r   r   outputr0   r0   r1   fused_experts_none_to_tritone  sX   	
r   standardrV   rW   c              	   C  s  ddl m}m}m} ddlm} | j| j}}	||	sJ |j	d }
|j
}|js,|jr3|jd us3tr6d}nt}||j|j|j|j|jd}tj||jj	||jj	d |jj	d | f|	jj	d ||j|jd}||
}||	j|d |\}}}||d	< t||	j|	j|||d
S )Nr   )get_config_dtype_strr    try_get_optimal_moe_config)TopKOutputChecker)rB   rC   rD   rE   r_   r]   ra   )rM   rF   BLOCK_SIZE_MrO   )r#   r$   r%   r&   r'   r(   )ri   r   r    r   sglang.srt.layers.moe.topkr   r#   r   format_is_standardrr   num_local_expertsrB   rC   rM   r   _MOE_PADDING_SIZErD   rE   r_   	functoolspartialr=   r>   r%   rF   r!   r$   )r   rU   r   rV   r   r    r   r   r#   r   
num_tokensr   padding_sizeconfig_dtypeget_config_funcrO   r&   r'   r(   r0   r0   r1   pre_permute_standard_to_triton  sb   




r   runner_outputc                 C  s   ddl m} || jdS )Nr   r   rh   )r   r   r#   )r   rU   r   rV   r   r0   r0   r1   post_permute_triton_to_standard  s   r   )r   r   rU   r<   r   r   r)   r   )
r   r   rU   r<   r   r   rV   rW   r)   r!   )
r   r;   rU   r<   r   r   rV   rW   r)   r   )@
__future__r   r   osdataclassesr   typingr   r   r   rs   triton.languagelanguageru   %sglang.srt.layers.moe.moe_runner.baser   r   r	   r
   r   r   r   r   sglang.srt.layers.moe.utilsr   sglang.srt.utilsr   r   r   r   r   r   r   r   rz   ry   _is_cpu_amx_available_is_cpurA   intgetenvr   r{   r   
sgl_kernelr   r   r   aiterr   ImportErrorvllmr   r|   r   r    sgl_moe_align_block_sizer!   r;   r<   rN   r   r   r   r0   r0   r0   r1   <module>   st    (
	 q#L