o
    پib                     @  s  d dl mZ d dlZd dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZmZmZ d d
lmZ d dlmZmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5 erd dl6m7Z7m8Z8m9Z9 e4 Z:e5 Z;e, Z<e3doe:Z=e=rd dl>m?Z?m@Z@ d dlAmBZB ne;rd dlCZCeDeEZFe;rd dlCZCG dd deZGG dd deGZHG dd deGZId#d!d"ZJdS )$    )annotationsN)TYPE_CHECKINGAnyDictOptionalUnion)is_in_piecewise_cuda_graph)envs)npu_format_cast)deep_gemm_wrapper)get_deepep_modeget_moe_a2a_backendget_moe_runner_backend)FlashInferFusedMoEFusedMoE%moe_forward_piecewise_cuda_graph_impl)upscale)DeepEPLLCombineInputDeepEPNormalCombineInput)MoriEPNormalCombineInput)
TopKOutputTopKOutputChecker)QuantizationConfig)'NPUCompressedTensorsW4A16Int4DynamicMoE)	Fp8ConfigFp8MoEMethod)is_fp8_fnuz)QuarkW4A4MXFp4MoE)W4AFp8ConfigW4AFp8MoEMethod)get_bool_env_varis_hipis_npu)DeepEPLLDispatchOutputDeepEPNormalDispatchOutputDispatchOutputSGLANG_USE_AITER)ActivationType	QuantType)	fused_moec                      s   e Zd ZdZdZ						d:d; fddZd<ddZd< fdd Zd<d!d"Zd= fd%d&Z		d>d?d+d,Z
d@d.d/ZdAd1d2ZdBd4d5ZdAd6d7Zd@d8d9Z  ZS )C	DeepEPMoEz
    MoE Expert Parallel Impl based on DeepEP (https://github.com/deepseek-ai/DeepEP/tree/main)
    Mooncake EP shares the same class, as they expose the same interface.
    Fr   N silunum_expertsinttop_khidden_sizeintermediate_sizelayer_idnum_fused_shared_expertsparams_dtypeOptional[torch.dtype]quant_configOptional[QuantizationConfig]prefixstr
activationrouted_scaling_factorOptional[float]c                   sB  t  jd|||||||||	|
|d| tstrd| _ntjr)t|tr)d| _nd| _| jr1d S t|trIt	| j
dd| _d| _tj| _d| _nt|trXd| _d| _d| _n	d| _d| _d| _t | _| j rtst  rx| j dkstjsJ d| j dtrtj| jd tj tjd	| _d| jd d
< d S d S )Nr-   r/   r0   r1   r2   r3   r4   r6   r8   r:   r;   FTblock_quantmodelopt_fp4zDeepEP z mode requires deep_gemm   devicedtype )super__init__
_use_aiter_is_npudeprecate_flagr   ENABLE_JIT_DEEPGEMM
isinstancer   getattrquant_methoduse_block_quantuse_fp8_w8a8torchfloat8_e4m3fn	fp8_dtype
use_w4afp8r   r   deepep_modeenable_low_latencyr   is_flashinfer_cutedslr6   get_namezerosnum_local_expertscudacurrent_devicer.   expert_maskselfr-   r/   r0   r1   r2   r3   r4   r6   r8   r:   r;   kwargs	__class__rE   V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/ep_moe/layer.pyrG   I   sp   

zDeepEPMoE.__init__hidden_statestorch.Tensortopk_outputr   c                 C  s<   t  rt|sJ dt||j|j|j| jS | ||S )Nz?Only standard topk output is supported for piecewise cuda graph)	r   r   format_is_standardr   topk_weightstopk_idsrouter_logitsr2   forward_implr_   rd   rf   rE   rE   rc   forward   s   zDeepEPMoE.forwardc                   s@   | j r
t ||S | jj||d}| |}| jj|d}|S )Nrd   rf   )combine_input)rJ   rF   rk   
dispatcherdispatchrun_moe_corecombine)r_   rd   rf   dispatch_outputro   ra   rE   rc   rk      s   
zDeepEPMoE.forward_implc                 C  s   | j j||dS )Nrn   )rp   rq   rl   rE   rE   rc   rq      s   zDeepEPMoE.dispatchrt   r%   c                   s   | j r	t |S ddlm} tr||sJ | |}nEtr-||s'J | 	|}n6|
|r?| jr;| |}n(J d||rct  rV| j dkrV| |}n| jr_| |}nJ d|
|rjtnt}|||j|jdS )Nr   DispatchOutputCheckerFz)forward_deepgemm_contiguous is deprecatedr?   z%forward_deepgemm_masked is deprecatedrd   ri   rh   )rJ   rF   rr   &sglang.srt.layers.moe.token_dispatcherrv   rH   format_is_deepepforward_aiterrI   forward_npuformat_is_deepep_normalrT   forward_cutlass_w4afp8format_is_deepep_llr   rW   r6   rX   forward_flashinfer_cutedslforward_cutlass_w4afp8_maskedr   r   ri   rh   )r_   rt   rv   outputcombine_input_wrapperra   rE   rc   rr      s@   

zDeepEPMoE.run_moe_coreri   rh   overlap_argsOptional[Dict[str, Any]]c                 C  s   | j j||||dS )N)rd   ri   rh   r   )rp   rs   )r_   rd   ri   rh   r   rE   rE   rc   rs      s   zDeepEPMoE.combine9Union[DeepEPNormalDispatchOutput, DeepEPLLDispatchOutput]c                 C  s~   |j |j|j}}}|jd dkr|S |tj}| j||dk< t|| j	| j
||| j| jtj| jjdkr8tjntj| jd
S )Nr   rD   r,   )w1_scalew2_scale
quant_typer:   r]   )rd   ri   rh   shapetorQ   int32rZ   r)   
w13_weight	w2_weightw13_weight_scale_invw2_weight_scale_invr(   per_128x128moe_runner_configr:   r'   SiluGelur]   )r_   rt   rd   ri   rh   topk_ids_copyrE   rE   rc   rz     s,   
zDeepEPMoE.forward_aiterr#   c                 C  sL   |\}}}}}}| j d usJ | jjdksJ | j j| ||f|| jd}|S )Nr,   )layerxmasked_mr   )rN   r   r:   apply_without_routing_weights)r_   rt   rd   hidden_states_scale_r   r   rE   rE   rc   r   /  s   z$DeepEPMoE.forward_flashinfer_cutedslr$   c                 C  s0   | j jdksJ t| jtsJ | jj| |dS )Nr,   r   rt   )r   r:   rL   rN   r   apply_deepep_normalr_   rt   rE   rE   rc   r}   ?  s   z DeepEPMoE.forward_cutlass_w4afp8c                 C  sB   | j jdksJ t| jtsJ tj sJ d| jj| |dS )Nr,   zOW4AFP8 does not support FP8 dispatch; please set SGLANG_DEEPEP_BF16_DISPATCH=1.r   )	r   r:   rL   rN   r   r	   SGLANG_DEEPEP_BF16_DISPATCHgetapply_deepep_llr   rE   rE   rc   r   J  s   z'DeepEPMoE.forward_cutlass_w4afp8_maskedc                 C  sd  | j d usJ | jjdksJ ddlm} ddlm} tj}d}|	|rpt
r.t|ts.J |\}}}}}	tj|	tj|jd}
| jjtjkrP|| |||
|}|S td}|sct| j tsct|\}}| j | ||||
|}|S ||rt
r~t|ts~J |\}}}}}
}|
tj}
| jjtjkr|| |||
|}|S | j | ||||
|}|S td|j )	Nr,   r   )*npu_fused_moe_without_routing_weights_bf16ru   r@   )rC   rB   DEEP_NORMAL_MODE_USE_INT8_QUANTzNot Supported DeepEP format )rN   r   r:   Asglang.srt.hardware_backend.npu.quantization.fused_moe_method_npur   rx   rv   rQ   bfloat16r|   r   rL   r$   tensorint64rB   r   rC   r    r   	torch_npunpu_dynamic_quantr   r~   r#   r   
ValueErrorformat)r_   rt   r   rv   output_dtypegroup_list_typerd   r   r   num_recv_tokens_per_expert
group_listinput_quantri   rh   rE   rE   rc   r{   X  s   

1&

zDeepEPMoE.forward_npur   NNr+   r,   Nr-   r.   r/   r.   r0   r.   r1   r.   r2   r.   r3   r.   r4   r5   r6   r7   r8   r9   r:   r9   r;   r<   rd   re   rf   r   )rt   r%   )N)rd   re   ri   re   rh   re   r   r   )rt   r   )rt   r#   )rt   r$   )__name__
__module____qualname____doc___has_printedrG   rm   rk   rq   rr   rs   rz   r   r}   r   r{   __classcell__rE   rE   ra   rc   r*   A   s*    	
T

3

"

r*   c                      s^   e Zd Z						d.d/ fddZ			d0d1ddZd2d d!Zd3d4d&d'Zd5d,d-Z  ZS )6NpuFuseEPMoEr   Nr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   c                   s8   t  jd|||||||||	|
|d| | j| j_d S )Nr=   rE   )rF   rG   _process_weights_after_loadingrN   process_weights_after_loadingr^   ra   rE   rc   rG     s"   
zNpuFuseEPMoE.__init__Frd   re   rf   r   c                 C  s"   | j j||| j| j| j| jdjS )N)rd   rf   gmm1_permuted_weightgmm1_permuted_weight_scalegmm2_weightgmm2_weight_scale)rp   rq   r   w13_weight_scaler   w2_weight_scalehidden_state)r_   rd   rf   forward_shared_experts
alt_streamdisable_sborE   rE   rc   rm     s   zNpuFuseEPMoE.forwardwtile_nc                 C  s   |d dkrt d| |j^ }}|| dkr#t d| d| |jg |d|| |d R  }ttt|g d }||}|jg ||R  S )N   r   ztile_n must be even, got zLast dimension z must be divisible by tile_n )rD   )r   r   reshapelistrangelenpermute)r_   r   r   dimsn
w_reshaped
perm_order
w_permutedrE   rE   rc   permute_w13_weight_scale  s   $
z%NpuFuseEPMoE.permute_w13_weight_scale@   weightdim
chunk_sizec                 C  s   |j }|dk r|t|7 }|| d|  dkr(td| d||  dd|  g |d | d|| d|  |||d d  R }||}|||d  }|jg |d | d||d d  R  S )Nr   r   z
Dimension z size z must be divisible by r@   rD   )r   r   r   view	transpose
contiguous)r_   r   r   r   original_shape	new_shaperE   rE   rc   reshape_w13_weight  s*   

,zNpuFuseEPMoE.reshape_w13_weightr   torch.nn.ModulereturnNonec                 C  s  |j dd }| |d }t|}tjj|dd|_ t|j	}tjj|dd|_	|j
jd }| |d}tjj|tjdd|_
|jjd }tjj|tjdd|_t|drstjj|jjd dd|_t|drtjj|jjd dd|_d S d S )	Nr@   r   rD   F)requires_grad   w13_weight_offsetw2_weight_offset)r   r   cpur   npur
   rQ   nn	Parameterr   r   datasqueezer   r   r   float32r   hasattrr   r   )r_   r   cpu_w13w13w2	w13_scaler   rE   rE   rc   r     s4   


z+NpuFuseEPMoE._process_weights_after_loadingr   r   NNFr   )r   re   r   r.   )r   )r   re   r   r.   r   r.   )r   r   r   r   )	r   r   r   rG   rm   r   r   r   r   rE   rE   ra   rc   r     s    &
r   c                      s>   e Zd Z						dd fddZ			d d!ddZ  ZS )"	MoriEPMoEr   Nr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   c                   sz   t  jd|||||||||	|
|d| tsJ dtj| jtj tjd| _	| j
| j }|| j }d| j	||< d S )Nr=   z2Mori need to be used together with aiter as of nowrA   r@   rE   )rF   rG   rH   rQ   rY   r-   r[   r\   r   r]   moe_ep_rankrZ   )r_   r-   r/   r0   r1   r2   r3   r4   r6   r8   r:   r;   r`   expert_start_idxexpert_end_idxra   rE   rc   rG   '  s0   
zMoriEPMoE.__init__Frd   re   rf   r   c                 C  sf  |j d }|j}d }t| jt}	t| jt}
| j||}|\}}}}}| j	}| j
}d }d }tj}|	s@|d ur@t||||}d }|
rkttdrU| j	tj}| j
tj}| j}| j}tj}t| j	drjd|_d|_n|	rt| dru| j}t| dr}| j}tj}t|||||||||| jjdkrtjntj| j||d}t }|||j!|j"d	}| j#|}|d | S )
Nr   float4_e2m1fn_x2is_shuffledTr   r   r,   )rd   w1r   r   r   a1_scaletopk_weightri   r   r:   r]   num_local_tokensrC   rw   )$r   rC   rL   rN   r   schemer   rp   rq   r   r   r(   Nor   r   rQ   r   r   r   r   per_1x32r   r   r   r   r)   r   r:   r'   r   r   r]   r   ri   rh   rs   )r_   rd   rf   r   r   r   	num_tokenr   scaleis_fp8_quantis_quark_w4a4rt   dispatch_a1dispatch_scaledispatch_idsdispatch_weightsdispatch_recv_token_numr   r   r   r   r   r   ro   resultrE   rE   rc   rm   O  s   




zMoriEPMoE.forwardr   r   r   r   )r   r   r   rG   rm   r   rE   rE   ra   rc   r   &  s    ,r   r6   r7   c                 C  s   t   rtS t   st   rtS t   rtS t 	 rI| d ur1| 
 dkr1ddlm} |S | d u sG| 
 dksG| 
 dksG| 
 dkrItS t  rPtS tS )Nr?   r   )FlashInferFP4MoEfp8modelopt_fp8compressed_tensors)r   is_morir   	is_deepepis_mooncaker*   is_ascend_fuseepr   r   is_flashinfer_trtllmrX   ,sglang.srt.layers.moe.fused_moe_triton.layerr  r   is_flashinfer_cutlassr   )r6   r  rE   rE   rc   get_moe_impl_class  s$   



r  )r6   r7   )K
__future__r   loggingtypingr   r   r   r   r   rQ   0sglang.srt.compilation.piecewise_context_managerr   sglang.srt.environr	   %sglang.srt.hardware_backend.npu.utilsr
   sglang.srt.layersr   sglang.srt.layers.moer   r   r   r  r   r   r   $sglang.srt.layers.moe.rocm_moe_utilsr   -sglang.srt.layers.moe.token_dispatcher.deepepr   r   -sglang.srt.layers.moe.token_dispatcher.moriepr   sglang.srt.layers.moe.topkr   r   *sglang.srt.layers.quantization.base_configr   9sglang.srt.layers.quantization.compressed_tensors.schemesr   "sglang.srt.layers.quantization.fp8r   r   )sglang.srt.layers.quantization.fp8_kernelr   ,sglang.srt.layers.quantization.quark.schemesr   %sglang.srt.layers.quantization.w4afp8r   r   sglang.srt.utilsr    r!   r"   rx   r#   r$   r%   _is_hiprI   _is_fp8_fnuzrH   aiterr'   r(   aiter.fused_moer)   r   	getLoggerr   loggerr*   r   r   r  rE   rE   rE   rc   <module>   sT    
  k| 	