o
    پi)                     @  s  U d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZ e	r7d dlmZ eeZG d	d
 d
eZG dd deZG dd deZdaded< daded< daded< daded< daded< daded< daded< da ded< da!ded< da"ded< da#ded< dDd"d#Z$dEd%d&Z%dFd'd(Z&dFd)d*Z'dEd+d,Z(dGd-d.Z)dHd0d1Z*dId3d4Z+dId5d6Z,dJd8d9Z-d:d; Z.d<d= Z/ed>d? Z0ed@dA Z1G dBdC dCeZ2dS )K    )annotationsN)contextmanager)EnumIntEnum)TYPE_CHECKINGOptional)"get_moe_expert_parallel_world_size)get_attention_dp_sizeis_dp_attention_enabled)
ServerArgsc                   @  s`   e Zd ZdZdZdZdZdZdZe	dd Z
d	d
 Zdd Zdd Zdd Zdd Zdd ZdS )MoeA2ABackendnonedeepepmooncakemoriascend_fuseep
flashinferc                 C  s@   |d u r| j S | D ]}||jkr|  S q	td| j d| )NzNo z member for value )NONEvalue
ValueError__name__)clsr   member r   O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/utils.py	_missing_   s   
zMoeA2ABackend._missing_c                 C  
   | t jkS N)r   r   selfr   r   r   is_none&      
zMoeA2ABackend.is_nonec                 C  r   r   )r   DEEPEPr   r   r   r   	is_deepep)   r!   zMoeA2ABackend.is_deepepc                 C  r   r   )r   MOONCAKEr   r   r   r   is_mooncake,   r!   zMoeA2ABackend.is_mooncakec                 C  r   r   )r   
FLASHINFERr   r   r   r   is_flashinfer/   r!   zMoeA2ABackend.is_flashinferc                 C  r   r   )r   ASCEND_FUSEEPr   r   r   r   is_ascend_fuseep2   r!   zMoeA2ABackend.is_ascend_fuseepc                 C  r   r   )r   MORIr   r   r   r   is_mori5   r!   zMoeA2ABackend.is_moriN)r   
__module____qualname__r   r"   r$   r*   r(   r&   classmethodr   r    r#   r%   r'   r)   r+   r   r   r   r   r      s    
r   c                   @  s   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd ZdS ) MoeRunnerBackendauto	deep_gemmtritontriton_kernelflashinfer_trtllmflashinfer_cutlassflashinfer_mxfp4flashinfer_cutedslcutlassmarlinc                 C  r   r   )r/   AUTOr   r   r   r   is_autoF   r!   zMoeRunnerBackend.is_autoc                 C  r   r   )r/   	DEEP_GEMMr   r   r   r   is_deep_gemmI   r!   zMoeRunnerBackend.is_deep_gemmc                 C  r   r   )r/   TRITONr   r   r   r   	is_tritonL   r!   zMoeRunnerBackend.is_tritonc                 C  r   r   )r/   TRITON_KERNELSr   r   r   r   is_triton_kernelsO   r!   z"MoeRunnerBackend.is_triton_kernelsc                 C  r   r   )r/   FLASHINFER_TRTLLMr   r   r   r   is_flashinfer_trtllmR   r!   z%MoeRunnerBackend.is_flashinfer_trtllmc                 C  r   r   )r/   FLASHINFER_CUTLASSr   r   r   r   is_flashinfer_cutlassU   r!   z&MoeRunnerBackend.is_flashinfer_cutlassc                 C  r   r   )r/   FLASHINFER_CUTEDSLr   r   r   r   is_flashinfer_cutedslX   r!   z&MoeRunnerBackend.is_flashinfer_cutedslc                 C  r   r   )r/   FLASHINFER_MXFP4r   r   r   r   is_flashinfer_mxfp4[   r!   z$MoeRunnerBackend.is_flashinfer_mxfp4c                 C  r   r   )r/   CUTLASSr   r   r   r   
is_cutlass^   r!   zMoeRunnerBackend.is_cutlassc                 C  r   r   )r/   MARLINr   r   r   r   	is_marlina   r!   zMoeRunnerBackend.is_marlinN)r   r,   r-   r:   r<   r>   r@   rB   rD   rH   rF   rJ   rL   r;   r=   r?   rA   rC   rE   rG   rI   rK   rM   r   r   r   r   r/   9   s*    r/   c                   @  sT   e Zd ZdZdZdZdddZddd	ZdddZdddZ	dddZ
dddZdS )
DeepEPModenormallow_latencyr0   returnboolc                 C     | t jt jfv S r   )rN   NORMALr:   r   r   r   r   enable_normalk      zDeepEPMode.enable_normalc                 C  rS   r   )rN   LOW_LATENCYr:   r   r   r   r   enable_low_latencyn   rV   zDeepEPMode.enable_low_latencyis_extend_in_batchc                 C  s   | t jkr| S |rt jS t jS r   )rN   r:   rT   rW   )r   rY   r   r   r   resolveq   s
   
zDeepEPMode.resolvec                 C  r   r   )rN   rT   r   r   r   r   	is_normalz   r!   zDeepEPMode.is_normalc                 C  r   r   )rN   rW   r   r   r   r   is_low_latency}   r!   zDeepEPMode.is_low_latencyc                 C  r   r   )rN   r:   r   r   r   r   r;      r!   zDeepEPMode.is_autoNrQ   rR   )rY   rR   rQ   rN   )r   r,   r-   rT   rW   r:   rU   rX   rZ   r[   r\   r;   r   r   r   r   rN   e   s    



	
rN   zOptional[MoeA2ABackend]MOE_A2A_BACKENDzOptional[MoeRunnerBackend]MOE_RUNNER_BACKENDSPECULATIVE_MOE_RUNNER_BACKENDSPECULATIVE_MOE_A2A_BACKENDzOptional[DeepEPMode]DEEPEP_MODEzOptional[bool]IS_TBO_ENABLEDIS_SBO_ENABLEDzOptional[float] TBO_TOKEN_DISTRIBUTION_THRESHOLDzOptional[str]DEEPEP_CONFIG,DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHERMOE_QUANTIZATIONserver_argsr   c                 C  sz   t | jat| ja| jd urt| jnta| jd ur t | jnta	t
| ja| jp+da| ja| ja| ja| ja| jad S )N )r   moe_a2a_backendr^   r/   moe_runner_backendr_   speculative_moe_runner_backendr`   speculative_moe_a2a_backendra   rN   deepep_moderb   deepep_configrf   enable_two_batch_overlaprc   enable_single_batch_overlaprd    tbo_token_distribution_thresholdre   ,disable_flashinfer_cutlass_moe_fp4_allgatherrg   quantizationrh   )ri   r   r   r   initialize_moe_config   s$   








rv   rQ   c                   C     t d u rtja t S r   )r^   r   r   r   r   r   r   get_moe_a2a_backend      rx   c                   C  rw   r   )r_   r/   r:   r   r   r   r   get_moe_runner_backend   ry   rz   c                   C     t d u rtd tja t S )NzESPECULATIVE_MOE_RUNNER_BACKEND is not initialized, using auto backend)r`   loggerwarningr/   r:   r   r   r   r   "get_speculative_moe_runner_backend      r~   c                   C  r{   )NzBSPECULATIVE_MOE_A2A_BACKEND is not initialized, using none backend)ra   r|   r}   r   r   r   r   r   r   get_speculative_moe_a2a_backend   r   r   c                   C  r{   )Nz/DEEPEP_MODE is not initialized, using auto mode)rb   r|   r}   rN   r:   r   r   r   r   get_deepep_mode   s   
r   strc                   C     t d u rtd da t S )Nz6DEEPEP_CONFIG is not initialized, using default configrj   )rf   r|   r}   r   r   r   r   get_deepep_config   s   
r   rR   c                   C     t d u rda t S NF)rc   r   r   r   r   is_tbo_enabled      r   c                   C  r   r   )rd   r   r   r   r   is_sbo_enabled   r   r   floatc                   C  r   )Nz?TBO_TOKEN_DISTRIBUTION_THRESHOLD is not initialized, using 0.48gQ?)re   r|   r}   r   r   r   r   $get_tbo_token_distribution_threshold   s   r   c                 C  s*   t |dd o|jjdko|jjd |kS )zK
    Filter out for MoE expert parameters that requires global expert.
    _sglang_require_global_expertsFr   )getattrdatandimshape)namexnum_local_expertsr   r   r   %filter_moe_weight_param_global_expert   s
   
r   c                   C  s4   t  ot  ot  ot otdkot t kS )z
    Perform FP4 quantize before all-gather for flashinfer cutlass moe to reduce communication cost for high-throughput serving.
    modelopt_fp4)	rg   rx   r    rz   rE   r
   rh   r   r	   r   r   r   r   /should_use_flashinfer_cutlass_moe_fp4_allgather
  s   
r   c                  c  s$    t } zt a dV  W | a dS | a w )z
    Context manager to temporarily use the speculative MoE backend for draft model operations.
    This ensures that draft models in speculative decoding use the configured speculative backend.
    N)r_   r~   )original_backendr   r   r   speculative_moe_backend_context  s   r   c                  c  s4    t } t}zt a dadV  W | a |adS | a |aw )z
    Context manager to temporarily use the speculative MoE A2A backend for draft model operations.
    This ensures that draft models in speculative decoding use the configured speculative A2A backend.
    TN)r^   rg   r   )r   5original_disable_flashinfer_cutlass_moe_fp4_allgatherr   r   r   #speculative_moe_a2a_backend_context'  s   r   c                   @  s(   e Zd ZdZdZdZdZdZdZdZ	dS )	RoutingMethodType)r   )   )   )   )   )      N)
r   r,   r-   DefaultRenormalize
DeepSeekV3Llama4RenormalizeNaiveTopKUnspecifiedr   r   r   r   r   A  s    r   )ri   r   )rQ   r   )rQ   r/   )rQ   rN   )rQ   r   r]   )rQ   r   )3
__future__r   logging
contextlibr   enumr   r   typingr   r   %sglang.srt.distributed.parallel_stater   sglang.srt.layers.dp_attentionr	   r
   sglang.srt.server_argsr   	getLoggerr   r|   r   r/   rN   r^   __annotations__r_   r`   ra   rb   rc   rd   re   rf   rg   rh   rv   rx   rz   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sR    
%,

$












