o
    پi.                     @  sB  d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZ d dlmZ d dlm Z  e!e"Z#G dd de	Z$e%e$espJ G dd de	Z&e%e&esJ G dd dZ'G dd dZ(eG dd deZ)G dd deZ*dS )    )annotationsN)	dataclass)Enumauto)
NamedTupleOptional)ElasticEPStateManager)'get_global_expert_distribution_recorder)get_is_extend_in_batch)BaseDispatcherCombineInputCombineInputFormatDispatchOutputDispatchOutputFormat)
TopKOutput)
DeepEPMode)get_int_env_varc                   @  sP   e Zd ZU dZded< ded< ded< ded< ded< d	ed
< edddZdS )MooncakeDispatchOutputzMooncake EP dispatch output.torch.Tensorhidden_stateszOptional[torch.Tensor]hidden_states_scaletopk_idstopk_weightsmasked_mint
expected_mreturnr   c                 C     t jS N)r   	DEEPEP_LLself r"   c/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/token_dispatcher/mooncake.pyformat&      zMooncakeDispatchOutput.formatN)r   r   )__name__
__module____qualname____doc____annotations__propertyr$   r"   r"   r"   r#   r      s   
 r   c                   @  s    e Zd ZdZ	 edddZdS )MooncakeCombineInputzMooncake EP combine input.r   r   c                 C  r   r   )r   r   r    r"   r"   r#   r$   3   r%   zMooncakeCombineInput.formatN)r   r   )r&   r'   r(   r)   r+   r$   r"   r"   r"   r#   r,   .   s
    r,   c                   @  sJ   e Zd ZU dZdZded< dZded< dZded< e		ddddZ	dS )EPBufferNzOptional[int]_hidden_size!_num_max_dispatch_tokens_per_rank_num_expertsgroupdist.ProcessGrouphidden_sizer   param_bytesdeepep_moder    num_max_dispatch_tokens_per_ranknum_expertsc           	      C  s   | j d ur| j S ddlm} || _|| _|| _d}| r!td| rC|dks+J |dkr7||	  dks9J |
|||	 |}|||| _ | j S )Nr   Bufferz1Normal mode is not supported for Mooncake EP yet.r1   )_buffermooncake.mooncake_ep_bufferr:   r.   r/   r0   enable_normalNotImplementedErrorenable_low_latencysizeget_ep_buffer_size_hint)	clsr2   r4   r5   r6   r7   r8   r:   num_ep_buffer_bytesr"   r"   r#   get_ep_bufferA   s,   

zEPBuffer.get_ep_buffer)r1   r1   )r2   r3   r4   r   r5   r   r6   r   r7   r   r8   r   )
r&   r'   r(   r;   r.   r*   r/   r0   classmethodrD   r"   r"   r"   r#   r-   ;   s   
 r-   c                   @  sZ   e Zd Zd(ddZd)ddZdd Z	d*d+ddZd,dd Zd!d" Zd,d#d$Z	d%d& Z
d'S )-_MooncakeEPDispatcherImplr2   torch.distributed.ProcessGrouprouter_topkr   permute_fusionboolr8   num_local_expertsr4   params_dtypetorch.dtypereturn_recv_hookr6   r   c
                 C  s   zddl m}
 W n ty   tdw || _|| _|| _|| _|| _|| _|| _	|| _
|	| _d| _tdd| _| jdks?J d| _d	| _d | _d S )
Nr   r9   zMooncake EP is not installed. Please install Mooncake package at https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md with EP support to run SGLang with Mooncake EP.   3SGLANG_MOONCAKE_EP_NUM_MAX_DISPATCH_TOKENS_PER_RANK   i   Ti )r<   r:   ImportErrorr2   rH   rI   r8   rK   r4   rL   rN   r6   params_bytesr   r7   first_execution
timeout_ushandle)r!   r2   rH   rI   r8   rK   r4   rL   rN   r6   r:   r"   r"   r#   __init__i   s0   
z"_MooncakeEPDispatcherImpl.__init__r   r   topk_outputr   c           
      C  sr   |j |j}}|  }|tj}|jd |j |jd  | j | j }| j	||dd\}}}}	|||||||	fS )Nr      T)use_fp8)
r   r   _get_buffertotorchint64shape
group_sizer8   _dispatch_core)
r!   r   rX   r   r   bufferr   r   eventhookr"   r"   r#   
dispatch_a   s*   z$_MooncakeEPDispatcherImpl.dispatch_ac           	      C  sJ   | j r| n|  t | t|tr|\}}nd }t||||||S r   )rN   current_stream_waitr	   on_deepep_dispatch_low_latency
isinstancetupler   )	r!   r   r   r   r   r   rc   rd   r   r"   r"   r#   
dispatch_b   s    


z$_MooncakeEPDispatcherImpl.dispatch_bFr   rZ   c           
      C  s\   |   }t j}|j|||| j| j| jrdn| j|| j	 | j	d	\}}| _
}}	||||	fS )Nr1   )rZ   async_finishrN   )r[   r   instanceactive_ranksdispatchr7   r8   rT   rU   rN   rV   )
r!   r   r   rZ   rb   rm   packed_recv_hiddenpacked_recv_countrc   rd   r"   r"   r#   ra      s   
z(_MooncakeEPDispatcherImpl._dispatch_corer   c                 C  s   |  |||\}}}|||fS r   )_combine_core)r!   r   r   r   rc   rd   r"   r"   r#   	combine_a   s   

z#_MooncakeEPDispatcherImpl.combine_ac                 C  s   | j r|  |S |  |S r   )rN   rf   )r!   r   rc   rd   r"   r"   r#   	combine_b   s   z#_MooncakeEPDispatcherImpl.combine_bc           	   
   C  s\   |   }t j}|j||||| jrdn| j| j| j | jd\}}}d| _d | _|||fS )Nr1   )rk   rN   F)	r[   r   rl   rm   combinerT   rU   rV   rN   )	r!   r   r   r   rb   rm   combined_hidden_statesrc   rd   r"   r"   r#   rq      s   


z'_MooncakeEPDispatcherImpl._combine_corec                 C  s    t | j| j| j| j| j| jS r   )r-   rD   r2   r4   rS   r6   r7   r8   r    r"   r"   r#   r[     s   z%_MooncakeEPDispatcherImpl._get_bufferN)r2   rG   rH   r   rI   rJ   r8   r   rK   r   r4   r   rL   rM   rN   rJ   r6   r   r   r   rX   r   )F)r   r   r   r   rZ   rJ   )r   r   r   r   r   r   )r&   r'   r(   rW   re   rj   ra   rr   rs   rq   r[   r"   r"   r"   r#   rF   h   s    

+"

rF   c                   @  s$   e Zd Ze Ze Ze Ze ZdS )_StageN)r&   r'   r(   r   INITIALAFTER_DISPATCH_AAFTER_DISPATCH_BAFTER_COMBINE_Ar"   r"   r"   r#   rw     s
    
rw   c                      s|   e Zd Zdddddejddfd- fddZd.ddZd/ddZdd Zd0d"d#Z	d1d$d%Z
d&d' Zd2d)d*Zd+d, Z  ZS )3MooncakeEPDispatcherFNr2   rG   rH   r   rI   rJ   r8   rK   r4   rL   rM   r6   r   rk   rN   c                   sP   t    || _| j rt||||||||
|d	| _| j r"ttj	| _
d S )N)	r2   rH   rI   r8   rK   r4   rL   rN   r6   )superrW   r6   r?   rF   _low_latency_dispatcherr=   r>   rw   rx   _stage)r!   r2   rH   rI   r8   rK   r4   rL   r6   rk   rN   	__class__r"   r#   rW     s"   


zMooncakeEPDispatcher.__init__r   r   rX   r   r   r   c                 C  s   |  || |  }|S r   )re   rj   )r!   r   rX   retr"   r"   r#   rn   A  s   zMooncakeEPDispatcher.dispatchc                 C  s,   |  tjtj |  j||d}|| _d S )N)r   rX   )_update_stagerw   rx   ry   	_get_implre   _dispatch_intermediate_state)r!   r   rX   inner_stater"   r"   r#   re   J  s   
zMooncakeEPDispatcher.dispatch_ac                 C  (   |  tjtj | j}| `|  j| S r   )r   rw   ry   rz   r   r   rj   r!   r   r"   r"   r#   rj   V     zMooncakeEPDispatcher.dispatch_bcombine_inputr   c                 C  s   |  | |  }|S r   )rr   rs   )r!   r   r   r"   r"   r#   rt   \  s   
zMooncakeEPDispatcher.combinec                 C  s8   |\}}}|  tjtj |  j|||d}|| _d S )N)r   r   r   )r   rw   rz   r{   r   rr   _combine_intermediate_state)r!   r   r   r   r   r   r"   r"   r#   rr   d  s   

zMooncakeEPDispatcher.combine_ac                 C  r   r   )r   rw   r{   rx   r   r   rs   r   r"   r"   r#   rs   q  r   zMooncakeEPDispatcher.combine_brF   c                 C  s@   t  }| j|}|tjkrt|tjkr| jS td| j )NzInvalid deepep_mode: )	r
   r6   resolver   NORMALr>   LOW_LATENCYr~   
ValueError)r!   is_extend_in_batchresolved_deepep_moder"   r"   r#   r   w  s   

zMooncakeEPDispatcher._get_implc                 C  s   | j |ksJ || _ d S r   )r   )r!   	old_stage	new_stager"   r"   r#   r     s   
z"MooncakeEPDispatcher._update_stage)r2   rG   rH   r   rI   rJ   r8   r   rK   r   r4   r   rL   rM   r6   r   rk   rJ   rN   rJ   )r   r   rX   r   r   r   rv   )r   r   r   r   )r   r   )r   rF   )r&   r'   r(   r   AUTOrW   rn   re   rj   rt   rr   rs   r   r   __classcell__r"   r"   r   r#   r|     s$    
"
	



r|   )+
__future__r   loggingdataclassesr   enumr   r   typingr   r   r]   torch.distributeddistributeddist sglang.srt.elastic_ep.elastic_epr   #sglang.srt.eplb.expert_distributionr	   sglang.srt.layers.dp_attentionr
   +sglang.srt.layers.moe.token_dispatcher.baser   r   r   r   r   sglang.srt.layers.moe.topkr   sglang.srt.layers.moe.utilsr   sglang.srt.utilsr   	getLoggerr&   loggerr   rh   r,   r-   rF   rw   r|   r"   r"   r"   r#   <module>   s2    

- /