o
    پip                     @  s  d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z* e) Z+er{d dl,m-Z- zd dl.m/Z/m0Z0 e+sd dl1m2Z2 dZ3W n e4y   dZ3Y nw d dl5m6Z6m7Z7m8Z8 d dl9Z9d dl:m;Z< e&doe( Z=e>e?Z@G dd deZAG dd de
ZBG dd de
ZCeDeBesJ eDeCesJ G dd de
ZEG dd de
ZFeDeEesJ eDeFesJ G dd  d e7ZGG d!d" d"ZHG d#d$ d$eZIG d%d& d&ZJG d'd( d(eJZKG d)d* d*eJZLeG d+d, d,e6ZMG d-d. d.eZNdS )/    )annotationsN)nullcontext)	dataclass)TYPE_CHECKINGList
NamedTupleOptionalTupleUnion)envs)'get_global_expert_distribution_recorder)deep_gemm_wrapper)get_is_extend_in_batch)BaseDispatcherBaseDispatcherConfigCombineInputCombineInputFormatDispatcherBaseHooksDispatchOutputDispatchOutputFormat)
TopKOutput)
DeepEPModeget_deepep_configget_moe_runner_backendis_tbo_enabled)get_bool_env_varis_blackwellis_hipis_npuload_json_config)CombineOverlapArgs)BufferConfig) sglang_per_token_group_quant_fp8TF)EnumIntEnumautoSGLANG_USE_AITERc                   @  s   e Zd ZdddZdS )DeepEPPDispatchHooks
dispatcherr   c                 C  s   | j  D ]}|| qd S N)	hook_dictvalues)selfr)   hook_fun r/   a/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/token_dispatcher/deepep.py__call__A   s   
zDeepEPPDispatchHooks.__call__N)r)   r   )__name__
__module____qualname__r1   r/   r/   r/   r0   r(   ?   s    r(   c                   @  sH   e Zd ZU dZded< ded< ded< ded< ded	< edddZdS )DeepEPNormalDispatchOutputzDeepEP normal dispatch output.torch.Tensorhidden_statesOptional[torch.Tensor]hidden_states_scaletopk_idstopk_weightsz	List[int]num_recv_tokens_per_expertreturnr   c                 C     t jS r*   )r   DEEPEP_NORMALr-   r/   r/   r0   formatO      z!DeepEPNormalDispatchOutput.formatNr=   r   r2   r3   r4   __doc____annotations__propertyrA   r/   r/   r/   r0   r5   F   s   
 r5   c                   @  sP   e Zd ZU dZded< ded< ded< ded< ded< d	ed
< edddZdS )DeepEPLLDispatchOutputz#DeepEP low latency dispatch output.r6   r7   r8   r9   r:   r;   masked_mint
expected_mr=   r   c                 C  r>   r*   )r   	DEEPEP_LLr@   r/   r/   r0   rA   ^   rB   zDeepEPLLDispatchOutput.formatNrC   rD   r/   r/   r/   r0   rH   T   s   
 rH   c                   @  8   e Zd ZU dZded< ded< ded< eddd	Zd
S )DeepEPNormalCombineInputzDeepEP normal combine input.r6   r7   r:   r;   r=   r   c                 C  r>   r*   )r   r?   r@   r/   r/   r0   rA   n   rB   zDeepEPNormalCombineInput.formatNr=   r   rD   r/   r/   r/   r0   rN   g      
 rN   c                   @  rM   )DeepEPLLCombineInputz!DeepEP low latency combine input.r6   r7   r:   r;   r=   r   c                 C  r>   r*   )r   rL   r@   r/   r/   r0   rA   z   rB   zDeepEPLLCombineInput.formatNrO   rD   r/   r/   r/   r0   rQ   s   rP   rQ   c                   @  s   e Zd Ze Ze ZdS )DeepEPDispatchModeN)r2   r3   r4   r&   NORMALLOW_LATENCYr/   r/   r/   r0   rR      s    
rR   c                   @  s   e Zd ZU dZdZded< dZded< dZded< dZded< e			ddddZ
e	dd Ze	dd Ze	dd Ze	dddZdS ) DeepEPBufferNzOptional[DeepEPDispatchMode]_dispatch_modezOptional[int]_hidden_size!_num_max_dispatch_tokens_per_rank_num_expertsgroupdist.ProcessGrouphidden_sizerJ   param_bytesdeepep_moder    num_max_dispatch_tokens_per_ranknum_expertsc                 C  s  | j d ur| j S || _|| _|| _d\}}| rN|| }	t jp(t	|
 t jp3t|
 fD ]}
t|
|	|
 |}t|
|	|
 |}q5| rs|dksXJ |dkrd||
  dksfJ tt|||
 ||}|tjkr~t j}n |tjkr||
  }n|tjkrtt j||
  }nttstjjddj}|tjkrt st j|d k rtdt j d t|||| |d	d
| _ | j S )N)r   r   rZ   r   cuda)device   z	Only use z SMs for DeepEP communication. This may result in highly suboptimal performance. Consider using --deepep-config to change the behavior.T)low_latency_modenum_qps_per_rankallow_mnnvl) _bufferrW   rX   rY   enable_normalDeepEPConfigget_instancenormal_dispatch_configr!   get_dispatch_configsizenormal_combine_configget_combine_configmaxget_nvl_buffer_size_hintget_rdma_buffer_size_hintenable_low_latencyget_low_latency_rdma_size_hintr   rS   num_smsrT   AUTONotImplementedError_is_nputorchrb   get_device_propertiesmulti_processor_countr   loggerwarning)clsr[   r]   r^   r_   r`   ra   num_nvl_bytesnum_rdma_byteshidden_bytesconfigrf   total_num_smsr/   r/   r0   get_deepep_buffer   s   







	zDeepEPBuffer.get_deepep_bufferc                 C  s&   | j jsd S | j | j| j| j d S r*   )rh   re   clean_low_latency_bufferrX   rW   rY   r   r/   r/   r0   clean_buffer   s   zDeepEPBuffer.clean_bufferc                 C  s   t j| _d S r*   )rR   rS   rV   r   r/   r/   r0   set_dispatch_mode_as_normal   s   z(DeepEPBuffer.set_dispatch_mode_as_normalc                 C  s    | j tjkr
|   tj| _ d S r*   )rV   rR   rS   r   rT   r   r/   r/   r0    set_dispatch_mode_as_low_latency   s   z-DeepEPBuffer.set_dispatch_mode_as_low_latencymodec                 C  s0   |  r
|   d S | r|   d S td)Nzunsupported mode)is_low_latencyr   	is_normalr   	Exception)r   r   r/   r/   r0   set_dispatch_mode   s
   zDeepEPBuffer.set_dispatch_mode)rZ   rZ   )r[   r\   r]   rJ   r^   rJ   r_   r   r`   rJ   ra   rJ   )r   r   )r2   r3   r4   rh   rV   rF   rW   rX   rY   classmethodr   r   r   r   r   r/   r/   r/   r0   rU      s$   
 X
	

rU   c                   @  s$   e Zd ZdZdd Zedd ZdS )rj   Nc                 C  s   t  }|rAt|}tj dkrtd|  |d }|d }tdi || _tdi || _	|d |d ks:J |d | _
d S d | _d | _	tj
| _
d S )Nr   zUse DeepEP Config: normal_dispatchnormal_combinerv   r/   )r   r   rz   distributedget_rankr}   infor"   rl   ro   rv   r!   )r-   
config_strconfig_parsedconfig_dispatchconfig_combiner/   r/   r0   __init__	  s   zDeepEPConfig.__init__c                 C  s   | j d u r	t | _ | j S r*   )	_instancerj   r   r/   r/   r0   rk     s   
zDeepEPConfig.get_instance)r2   r3   r4   r   r   r   rk   r/   r/   r/   r0   rj     s
    rj   c                   @  s`   e Zd Zd.ddZd/ddZdd Zd0ddZdd Zdd Zd1d$d%Z	d2d)d*Z
d3d+d,Zd-S )4_DeepEPDispatcherImplBaser[   torch.distributed.ProcessGrouprouter_topkrJ   permute_fusionboolra   num_local_expertsr]   params_dtypetorch.dtyper_   r   c	           	      C  sx   t std|| _|| _|| _|| _|| _|| _|| _|| _	d| _
tj | _| jdks.J d | _d | _d | _d | _d S )NzbDeepEP is not installed. Please install DeepEP package from https://github.com/deepseek-ai/deepep.rd   i   )
use_deepepImportErrorr[   r   r   ra   r   r]   r   r_   params_bytesr   .SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANKgetr`   handlequant_configoverlap_argsmeta_overlap_args)	r-   r[   r   r   ra   r   r]   r   r_   r/   r/   r0   r   $  s(   
z"_DeepEPDispatcherImplBase.__init__r7   r6   topk_outputr   c                 C     t r*   rx   )r-   r7   r   r/   r/   r0   
dispatch_aN  s   z$_DeepEPDispatcherImplBase.dispatch_ac                 O  r   r*   r   r-   argskwargsr/   r/   r0   
dispatch_bU     z$_DeepEPDispatcherImplBase.dispatch_br:   r;   c                 C  r   r*   r   )r-   r7   r:   r;   r/   r/   r0   	combine_aX  s   z#_DeepEPDispatcherImplBase.combine_ac                 O  r   r*   r   r   r/   r/   r0   	combine_b`  r   z#_DeepEPDispatcherImplBase.combine_bc                 C  r   r*   r   r@   r/   r/   r0   _get_bufferc  r   z%_DeepEPDispatcherImplBase._get_bufferr   dictr=   Nonec                 C  s
   || _ d S r*   )r   r-   r   r/   r/   r0   set_quant_configf  s   
z*_DeepEPDispatcherImplBase.set_quant_configcombine_overlap_argsr    r   c                 C  s   || _ || _d S r*   r   r   r-   r   r   r/   r/   r0   set_overlap_argsi  s   
z*_DeepEPDispatcherImplBase.set_overlap_argsc                 C  s   d | _ d | _d S r*   r   r@   r/   r/   r0   clear_overlap_argso  s   
z,_DeepEPDispatcherImplBase.clear_overlap_argsN)r[   r   r   rJ   r   r   ra   rJ   r   rJ   r]   rJ   r   r   r_   r   r7   r6   r   r   r7   r6   r:   r6   r;   r6   )r   r   r=   r   )r   r    r   r   r=   r   )r=   r   )r2   r3   r4   r   r   r   r   r   r   r   r   r   r/   r/   r/   r0   r   #  s    

*


r   c                      s^   e Zd Zd fddZdd	d
Zdd ZdddZdddZdd ZdddZ	dd Z
  ZS ) _DeepEPDispatcherImplNormalasync_finishr   c                   s(   t  jdi | || _d | _i | _d S Nr/   )superr   r   src2dstr   )r-   r   r   	__class__r/   r0   r   u  s   
z$_DeepEPDispatcherImplNormal.__init__r7   r6   r   r   c                 C  sj   |j |j}}|tj}tjr&t  s&t	j
 s&t|dtjtjtjd}| jr-t nd }||||fS )N   )column_major_scalesscale_tma_alignedscale_ue8m0)r;   r:   torz   int64r   ENABLE_JIT_DEEPGEMMr   
is_cutlassr   SGLANG_DEEPEP_BF16_DISPATCHr   r#   DEEPGEMM_SCALE_UE8M0r   r!   capture)r-   r7   r   r;   r:   previous_eventr/   r/   r0   r   |  s"   z&_DeepEPDispatcherImplNormal.dispatch_ac                 C  sT   |  ||||\}}}}}| jr| nd t|tr |\}}nd }t|||||S r   )_dispatch_corer   current_stream_wait
isinstancetupler5   )r-   r7   r:   r;   r   r<   eventr9   r/   r/   r0   r     s&   

z&_DeepEPDispatcherImplNormal.dispatch_bx6Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]r:   r;   c                 C  s   |   }|j|| j|| j|d ud\}}}}	}|j||||||	||| j|d uo)| jtjr.dndt j	d\}
}}}| _
}t j||||d |
||||fS )N)r   r   allocate_on_comm_streamr      )topk_idxr;   num_tokens_per_ranknum_tokens_per_rdma_rankis_token_in_ranknum_tokens_per_expertr   r   r   expert_alignmentr   )r   r   r   )r   get_dispatch_layoutra   r   dispatchr   r   rj   rk   rl   r   r   on_deepep_dispatch_normal)r-   r   r:   r;   r   bufferr   r   r   r   recv_xrecv_topk_idsrecv_topk_weightsr<   r   r/   r/   r0   r     s^   z*_DeepEPDispatcherImplNormal._dispatch_corec                 C  s4   t jststr
|}nt | jrt nd }||fS r*   )r   r   
_use_aiterry   rx   r   r!   r   )r-   r7   r:   r;   outputr   r/   r/   r0   r     s
   z%_DeepEPDispatcherImplNormal.combine_ac                 C  sD   |  ||\}}| jr| n
d d | _d | _|S  d | _d | _|S r   )_combine_corer   r   r   r   )r-   r   r   r7   r   r/   r/   r0   r     s   z%_DeepEPDispatcherImplNormal.combine_bc                 C  s:   |   }|j|| j| j||d ut jd\}}}||fS )N)r   r   r   r   )r   combiner   r   rj   rk   ro   )r-   r   r   r   
combined_x_r   r/   r/   r0   r     s   z)_DeepEPDispatcherImplNormal._combine_corec                 C  (   t   t | j| j| j| j| j| jS r*   )	rU   r   r   r[   r]   r   r_   r`   ra   r@   r/   r/   r0   r   	  s   z'_DeepEPDispatcherImplNormal._get_buffer)r   r   r   )r   r   r:   r6   r;   r6   r   )r   r6   r2   r3   r4   r   r   r   r   r   r   r   r   __classcell__r/   r/   r   r0   r   t  s    


=
r   c                      s^   e Zd Zd fddZdd	d
Zdd ZdddZdddZdd ZdddZ	dd Z
  ZS )_DeepEPDispatcherImplLowLatencyreturn_recv_hookr   c                   s.   t  jdi | 	 || _t | _i | _d S r   )r   r   r   rz   get_device_moduledevice_moduler   )r-   r   r   r   r/   r0   r     s
   

z(_DeepEPDispatcherImplLowLatency.__init__r7   r6   r   r   c           
      C  sn   |   }|j|j}}|tj}|jd |j |jd  | j | j }| 	||\}}}}	|||||||	fS )Nr   r   )
r   r;   r:   r   rz   r   shape
group_sizera   r   )
r-   r7   r   r   r;   r:   rK   rI   r   hookr/   r/   r0   r   "  s(   z*_DeepEPDispatcherImplLowLatency.dispatch_ac           
      C  sN   | j r| n|  t | t|tr|\}}nd }t||||||}	|	S r*   )r   r   r   on_deepep_dispatch_low_latencyr   r   rH   )
r-   r7   r:   r;   rI   rK   r   r   r9   deepep_outputr/   r/   r0   r   <  s"   


z*_DeepEPDispatcherImplLowLatency.dispatch_br:   c           
      C  s   d }}| j dd }|d urd}ntj sd}|  }|j||| j| jfd|i|r0tddnt |d ur<t|dnt | j	 | j	t
joIt
jt
joNt
jd\}| _| _}}	|| j||	fS )NFinput_global_scaleTuse_fp8)	use_nvfp4)x_global_scale)r   r   round_scale	use_ue8m0)r   r   r   r   r   low_latency_dispatchr`   ra   r   r   r   r   DEEPGEMM_BLACKWELLpacked_recv_countr   )
r-   r7   r:   r  r   r   r   packed_recv_hiddenr   r   r/   r/   r0   r   [  s<   
	
z._DeepEPDispatcherImplLowLatency._dispatch_corer;   c                 C  s   |  |||\}}}|||fS r*   )r   )r-   r7   r:   r;   r   r   r/   r/   r0   r     s   

z)_DeepEPDispatcherImplLowLatency.combine_ac                 C  sR   | j }|d ur|j| j  | jr| n|  |d ur'| j |j |S r*   )r   streamwait_streamr   current_streamr   r   )r-   r7   r   r   r   r/   r/   r0   r     s   z)_DeepEPDispatcherImplLowLatency.combine_bc              
   C  s   |   }| j}| j}t }|d ur@|j|j tj|j}t r-t	|j
|j|jd}nt	|j
| j|j|d |d |jd}ni }| |jd|||| j| j | jd|\}	}
}W d    n1 sew   Y  d  | _| _|	|
|fS )N)overlapsrc_signalssrc_signal_expect_valueblock_m	threshold)r  r  comp_signalr  r  rv   )r   r   r;   r   r   r   r/   )r   r   r   r   r	  
wait_eventrz   rb   r   r   r  signalr  r  rv   low_latency_combiner   r   )r-   r7   r:   r;   r   r   r   ctxoverlap_args_dictcombined_hidden_statesr   r   r/   r/   r0   r     sH   	
z-_DeepEPDispatcherImplLowLatency._combine_corec                 C  r   r*   )	rU   r   r   r[   r]   r   r_   r`   ra   r@   r/   r/   r0   r     s   z+_DeepEPDispatcherImplLowLatency._get_buffer)r   r   r   )r7   r6   r:   r6   r   r   r/   r/   r   r0   r     s    


$
/r   c                   @  s$   e Zd Ze Ze Ze Ze ZdS )_StageN)r2   r3   r4   r&   INITIALAFTER_DISPATCH_AAFTER_DISPATCH_BAFTER_COMBINE_Ar/   r/   r/   r0   r    s
    
r  c                      s   e Zd Zdddddejddfd: fddZd;ddZd<ddZdd Zd=d"d#Z	d>d$d%Z
d&d' Zd?d)d*Zd+d, Zd@ fd/d0ZdA fd4d5Z fd6d7Zd8d9 Z  ZS )BDeepEPDispatcherFNr[   r   r   rJ   r   r   ra   r   r]   r   r   r_   r   r   r   c              
     sx   t    || _t||||||||d}| j r#tdd|
i|| _| j r2tdd|	i|| _	t
j| _t | _d S )N)r[   r   r   ra   r   r]   r   r_   r   r   r/   )r   r   r_   r   rt   r   _low_latency_dispatcherri   r   _normal_dispatcherr  r  _stager(   _deepep_dispatch_hooks)r-   r[   r   r   ra   r   r]   r   r_   r   r   common_kwargsr   r/   r0   r     s4   


zDeepEPDispatcher.__init__r7   r6   r   r   r=   r   c                 C  s,   |  || | jd ur| |  |  }|S r*   )r   r!  r   )r-   r7   r   retr/   r/   r0   r     s
   

zDeepEPDispatcher.dispatchc                 C  s,   |  tjtj |  j||d}|| _d S )N)r7   r   )_update_stager  r  r  	_get_implr   _dispatch_intermediate_state)r-   r7   r   inner_stater/   r/   r0   r     s   
zDeepEPDispatcher.dispatch_ac                 C  (   |  tjtj | j}| `|  j| S r*   )r$  r  r  r  r&  r%  r   r-   r'  r/   r/   r0   r        zDeepEPDispatcher.dispatch_bcombine_inputr   c                 C  s   |  | |  }|S r*   )r   r   )r-   r+  r#  r/   r/   r0   r   #  s   
zDeepEPDispatcher.combinec                 C  s8   |\}}}|  tjtj |  j|||d}|| _d S )N)r7   r:   r;   )r$  r  r  r  r%  r   _combine_intermediate_state)r-   r+  r7   r:   r;   r'  r/   r/   r0   r   +  s   

zDeepEPDispatcher.combine_ac                 C  r(  r*   )r$  r  r  r  r,  r%  r   r)  r/   r/   r0   r   8  r*  zDeepEPDispatcher.combine_br   c                 C  sB   t  }| j|}|tjkr| jS |tjkr| jS td| j )NzInvalid deepep_mode: )	r   r_   resolver   rS   r  rT   r  
ValueError)r-   is_extend_in_batchresolved_deepep_moder/   r/   r0   r%  >  s   

zDeepEPDispatcher._get_implc                 C  s   | j |ksJ || _ d S r*   )r   )r-   	old_stage	new_stager/   r/   r0   r$  H  s   
zDeepEPDispatcher._update_stager   r   c                   s@   t  | | j r| j| | j r| j| d S d S r*   )r   r   r_   rt   r  ri   r  r   r   r/   r0   r   L  s   

z!DeepEPDispatcher.set_quant_configr   r    r   c                   sF   t  || | j r| j|| | j r!| j|| d S d S r*   )r   r   r_   rt   r  ri   r  r   r   r/   r0   r   S  s   

z!DeepEPDispatcher.set_overlap_argsc                   s:   t    | j r| j  | j r| j  d S d S r*   )r   r   r_   rt   r  ri   r  r@   r   r/   r0   r   `  s   



z#DeepEPDispatcher.clear_overlap_argsc                 C  s   | j |S r*   )r!  register_hook)r-   r   r/   r/   r0   register_deepep_dispatch_hookg  s   z.DeepEPDispatcher.register_deepep_dispatch_hook)r[   r   r   rJ   r   r   ra   rJ   r   rJ   r]   rJ   r   r   r_   r   r   r   r   r   )r7   r6   r   r   r=   r   r   )r+  r   r=   r6   )r+  r   )r=   r   )r   r   )r   r    r   r   )r2   r3   r4   r   rw   r   r   r   r   r   r   r   r%  r$  r   r   r   r4  r   r/   r/   r   r0   r    s,    
*




r  )O
__future__r   logging
contextlibr   dataclassesr   typingr   r   r   r   r	   r
   sglang.srt.environr   #sglang.srt.eplb.expert_distributionr   sglang.srt.layersr   sglang.srt.layers.dp_attentionr   +sglang.srt.layers.moe.token_dispatcher.baser   r   r   r   r   r   r   sglang.srt.layers.moe.topkr   sglang.srt.layers.moe.utilsr   r   r   r   sglang.srt.utilsr   r   r   r   r   ry   -sglang.srt.batch_overlap.single_batch_overlapr    deep_epr!   r"   )sglang.srt.layers.quantization.fp8_kernelr#   r   r   enumr$   r%   r&   rz   torch.distributedr   distr   	getLoggerr2   r}   r(   r5   rH   r   rN   rQ   rR   rU   rj   r   r   r   r  r  r/   r/   r/   r0   <module>   sb     $	
~Q # >