o
    پiC&                     @  sh  d dl mZ d dlZd dlmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ z d dlmZm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' dZ(W n e)y   dZ(Y nw e*e+Z,ej-. Z/G dd deZ0e1e0esJ G dd deZ2e1e2esJ G dd deZ3dS )    )annotationsN)
NamedTupleOptional)envs)get_dp_global_num_tokens)BaseDispatcherCombineInputCombineInputFormatDispatchOutputDispatchOutputFormat)TorchDistributedCommBackend)StandardTopKOutput
TopKOutput)get_moe_runner_backend)get_global_server_args)SpeculativeAlgorithm)get_int_env_var)fp4_quantizenvfp4_block_scale_interleave)MoeAlltoAll#moe_a2a_get_workspace_size_per_rank)Mapping)MnnvlConfigTFc                   @  sD   e Zd ZU dZded< ded< ded< dZded	< edddZdS )FlashinferDispatchOutputzFlashinfer EP dispatch output.torch.Tensorhidden_stateszOptional[torch.Tensor]hidden_states_scaler   topk_outputN
moe_outputreturnr   c                 C     t jS N)r   
FLASHINFERself r%   e/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/token_dispatcher/flashinfer.pyformat2      zFlashinferDispatchOutput.format)r   r   )__name__
__module____qualname____doc____annotations__r   propertyr'   r%   r%   r%   r&   r   )   s   
 r   c                   @  s(   e Zd ZU dZded< ed	ddZdS )
FlashinferCombineInputzFlashinfer combine input.r   r   r   r	   c                 C  r    r!   )r	   r"   r#   r%   r%   r&   r'   ?   r(   zFlashinferCombineInput.formatN)r   r	   )r)   r*   r+   r,   r-   r.   r'   r%   r%   r%   r&   r/   :   s
   
 r/   c                      s@   e Zd ZdZ				dd fddZdddZdddZ  ZS )FlashinferDispatcherz1Main dispatcher class for Flashinfer A2A backend.Ngrouptorch.distributed.ProcessGrouprouter_topkintnum_expertsnum_local_expertshidden_sizeparams_dtypetorch.dtypec           
   	     s  t    tstd| | _| | _|| _|| _	|| _
|| _t  | _tdd| j | _tt j}trP| sP|d |d  | jd  | jd  }n|d | jd  | jd  }|d }	t| j| j||	d| _t| j| j| j| jtj ddd	| _t| j| j| j| j
| jtt |d
d| _!tj"d|ftj#dd| _$tj%d| jfdtj&dd| _'tj%d| jf| j| j tj&dd| _(tj)d| jftj*dd| _+d S )NzjFlashinfer is not installed or does not support A2A. Please install the appropriate version of Flashinfer.2SGLANG_FLASHINFER_NUM_MAX_DISPATCH_TOKENS_PER_RANKi            )ep_sizemax_num_tokens%total_dispatch_payload_size_per_tokencombine_payload_size_per_token   )ranktp_sizemoe_ep_size
world_sizegpus_per_nodepp_sizecp_size)comm_backend)mappingr?   top_kr5   workspace_size_per_rankmnnvl_configcudadtypedevice),super__init__use_flashinferImportErrorsizer>   rC   ep_rankr3   r7   r5   r6   r   is_flashinfer_cutlasspayload_in_workspacer   r?   r   from_stringr   speculative_algorithmMOE_NVFP4_DISPATCHis_eagler   workspace_sizer   torchrO   device_countrK   r   r   r   moe_a2aemptybfloat16dummy_xfullint32dummy_topk_idsdummy_topk_ids_current_rankzerosfloat32dummy_topk_weights)
r$   r1   r3   r5   r6   r7   r8   speculative_algor@   rA   	__class__r%   r&   rU   J   s   
	

	

zFlashinferDispatcher.__init__r   r   r   r   r   r   c                 C  s  |j }|}d }|j}|j}d| _|jd dkr(td d| _| j}| j}| j	}| j
dd }|d ur`|jd dkrDt||dd\}}ntjd| jd tj|jd}tjd| jd	 tj|jd}g }	|	| |d urs|	| d}
nd
}
|	| |	| t d urtt n|jd | _| jj| jr| jn||	| j|
d}|d ur|\}}}}|d|jd }t|}n|\}}}|d|jd }|d|jd }|d|jd }d }| jr| j| j| j|d| j}t||t|||j|S )NFr   z.No tokens on this DP worker, using dummy tokenTinput_global_scale)is_sf_swizzled_layoutr;   rP   r<   rB   )expert_id_payload_indexrS   ) rQ   topk_idstopk_weightshas_dummy_tokenshapeloggerwarningrf   ri   rm   quant_configgetr   ra   rk   r7   uint8rR   appendr   maxruntime_max_tokens_per_rankrc   dispatchrj   viewr   r[   'get_combine_payload_tensor_in_workspacer   r   router_logits)r$   r   r   output_dtypexx_sfrt   ru   global_scalepayloadsrs   recv_tensorsx_recv	x_sf_recvtopk_ids_recvtopk_weights_recvr   r%   r%   r&   r      sz   










zFlashinferDispatcher.dispatchcombine_inputr/   c                 C  sZ   |j }|jd }| jj|| j| j|| j| jd}| jr'|dd d d f }| `| `|S )NrS   )r[   rB   )	r   rw   rc   combiner   r>   r   r[   rv   )r$   r   r   output_hidden_sizer%   r%   r&   r      s   

	zFlashinferDispatcher.combine)NNNN)r1   r2   r3   r4   r5   r4   r6   r4   r7   r4   r8   r9   )r   r   r   r   r   r   )r   r/   r   r   )r)   r*   r+   r,   rU   r   r   __classcell__r%   r%   ro   r&   r0   G   s    
`Lr0   )4
__future__r   loggingtypingr   r   ra   sglang.srt.environr   sglang.srt.layers.dp_attentionr   &sglang.srt.layers.moe.token_dispatcherr   r   r	   r
   r   7sglang.srt.layers.moe.token_dispatcher.flashinfer_utilsr   sglang.srt.layers.moe.topkr   r   sglang.srt.layers.moe.utilsr   sglang.srt.server_argsr    sglang.srt.speculative.spec_infor   sglang.srt.utilsr   
flashinferr   r   flashinfer.commr   r   flashinfer.comm.mappingr   flashinfer.comm.mnnvlr   rV   rW   	getLoggerr)   rx   SGLANG_MOE_NVFP4_DISPATCHr{   r^   r   
isinstancer/   r0   r%   r%   r%   r&   <module>   s:    


