o
    پiF                     @   sb  d dl Z d dlmZ d dlmZ d dlmZmZ d dlm	Z	 d dl
mZmZmZmZmZ d dlZd dlmZmZmZmZ d dlmZ d d	lmZmZ d d
lmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; e5 Z<e6 Z=e<oe: Z>e<oe; Z?e4doe8 Z@e7 ZAe9 ZBe@reArd dlCmDZD d dlEmFZF neBrd dlGmHZH dZIdeJfddZKG dd deZLG dd dZMG dd dZNeN ZOdd ZPeG d d! d!ZQeG d"d# d#ZRd$d% ZSG d&d' d'ZTeG d(d) d)ZUG d*d+ d+ZVG d,d- d-ZWG d.d/ d/ZXdS )0    N)contextmanager)	dataclass)Enumauto)partial)CallableDictListOptionalTuple)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeget_tp_group tensor_model_parallel_all_reduce)use_symmetric_memory)is_nsa_enable_prefill_cpnsa_use_prefill_cp)attn_tp_all_gather_into_tensorattn_tp_reduce_scatter_tensordp_gather_partialdp_reduce_scatter_tensor
dp_scatterget_attention_cp_rankget_attention_cp_sizeget_attention_dp_sizeget_attention_tp_rankget_attention_tp_sizeget_global_dp_bufferget_local_dp_bufferis_allocation_symmetricis_dp_attention_enabled)get_moe_a2a_backend/should_use_flashinfer_cutlass_moe_fp4_allgather)ForwardBatch)get_global_server_args)SpeculativeAlgorithm)get_bool_env_varis_cudais_flashinfer_availableis_gfx95_supportedis_hipis_npuis_sm90_supportedis_sm100_supportedSGLANG_USE_AITER)fused_rms_fp8_group_quant)fused_rms_mxfp4_quant)prepare_weight_cachei   
batch_sizec                 C   s,   t stoto| dko| tkot  ot jS Nr   )_is_sm90_supported_is_sm100_supported_is_flashinfer_availableFUSE_ALLREDUCE_MAX_BATCH_SIZEr    r$   "enable_flashinfer_allreduce_fusion)r2    r9   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/communicator.py!apply_flashinfer_allreduce_fusion[   s   r;   c                   @   s.   e Zd ZdZe Ze Ze Zedd Z	dS )ScatterModeaG  
    Suppose we have TP=4, DP=2, enable-dp-attention, and the system handles seq a,b,c,d
    Model input/output: [ab, ab, cd, cd] for four ranks respectively
    SCATTERED: [a, b, c, d]
    TP_ATTN_FULL: [ab, ab, cd, cd], i.e. all ranks inside a TP attn group have full data of the group
    FULL: [abcd, abcd, abcd, abcd]
    c                   C   s   t  rtjS tjS )z=The scatter mode for model forward pass input and output data)r   r<   	SCATTEREDTP_ATTN_FULLr9   r9   r9   r:   model_input_outputu   s   zScatterMode.model_input_outputN)
__name__
__module____qualname____doc__r   r=   r>   FULLstaticmethodr?   r9   r9   r9   r:   r<   h   s    r<   c                   @   s<   e Zd ZdejdedefddZdd Zdd	 Z	d
d Z
dS )AttentionInputshidden_statesforward_batchqkv_latent_funcc                 C   s"   || _ || _|| _d | _d | _d S N)hidden_states_localrH   rI   hidden_states_qkv_latent_)selfrG   rH   rI   r9   r9   r:   __init__   s
   
zAttentionInputs.__init__c                 C   s2   |j jd }|||jd f}t || |S )Nr   )	input_idsshape	new_emptyr   all_gather_into_tensor)rN   rG   rH   total_tokensoutputr9   r9   r:   tp_all_gather_hidden_states   s   z+AttentionInputs.tp_all_gather_hidden_statesc                 C   sP   | j d ur| j S | jd usJ | | j| j| _ t jr%| | j | j| _ | j S rJ   )rM   rI   rK   rH   get_attn_tp_contextinput_scatteredrW   rN   r9   r9   r:   fetch_qkv_latent   s   
z AttentionInputs.fetch_qkv_latentc                 C   s8   | j d ur| j S | j| _ t jr| | j | j| _ | j S rJ   )rL   rK   rX   rY   rW   rH   rZ   r9   r9   r:   fetch_hidden_states   s   
z#AttentionInputs.fetch_hidden_statesN)r@   rA   rB   torchTensorr#   r   rO   rW   r[   r\   r9   r9   r9   r:   rF   }   s    
rF   c                   @   sf   e Zd Zdd Zdd ZdefddZedd	 Zd
e	fddZ
dd Zdd ZedefddZdS )AttnTpContextc                 C   s   d| _ d| _d | _d S )NF)allow_input_scatteredinput_scattered_attn_inputs_rZ   r9   r9   r:   rO      s   
zAttnTpContext.__init__c                 C   s   t  jo)to)|d uo)| o)t dko)t  o)t  o)t  o)t  j o)t  j	dk| _
t  jr@| j
s9td d S td d S d S )N   EAGLE3zIattn_tp_input_scattered is not enabled while other conditions are not metz"attn_tp_input_scattered is enabled)r$   enable_attn_tp_input_scattered_is_cudar   r    r!   is_noneenable_moe_dense_fully_dpenable_piecewise_cuda_graphspeculative_algorithmr`   logginginfo)rN   q_lora_rankis_nsar9   r9   r:   init_context   s4   
	zAttnTpContext.init_contextrH   c                 C   s:   | j o|j o|j  o|j  o|jd uo|j S rJ   )r`   forward_mode	is_extendis_target_verifyis_draft_extendrQ   can_run_tborN   rH   r9   r9   r:   use_input_scattered   s   

z!AttnTpContext.use_input_scatteredc                 C   s   | j S rJ   )ra   rZ   r9   r9   r:   rY      s   zAttnTpContext.input_scatteredattn_inputsc                 C   s
   || _ d S rJ   )rb   )rN   rw   r9   r9   r:   set_attn_inputs   s   
zAttnTpContext.set_attn_inputsc                 C      | j d usJ | j  S rJ   )rb   r[   rZ   r9   r9   r:   r[         
zAttnTpContext.fetch_qkv_latentc                 C   ry   rJ   )rb   r\   rZ   r9   r9   r:   r\      rz   z!AttnTpContext.fetch_hidden_statesc                 c   s.    |  |}| j}|| _d V  || _d | _d S rJ   )rv   rY   ra   rb   )rN   rH   flagold_flagr9   r9   r:   maybe_input_scattered   s   

z#AttnTpContext.maybe_input_scatteredN)r@   rA   rB   rO   ro   r#   rv   propertyrY   rF   rx   r[   r\   r   r}   r9   r9   r9   r:   r_      s    

r_   c                   C   s   t S rJ   )ATTN_TP_CONTEXTr9   r9   r9   r:   rX      s   rX   c                   @   sF   e Zd ZU eed< eed< eed< ee ed< ee ed< dd ZdS )	_LayerModeComputationContext
num_layerslayer_idis_layer_sparseis_previous_layer_sparseis_next_layer_sparsec                 C   s,   | j d usJ t| j| jd | j d | jdS )Nrc   )r   r   r   r   r   )r   r   r   r   r   rZ   r9   r9   r:   previous_layer   s   z+_LayerModeComputationContext.previous_layerN)r@   rA   rB   int__annotations__boolr
   r   r9   r9   r9   r:   r      s   
 r   c                   @   s   e Zd ZU eed< eed< eed< eed< eed< edd Zedefd	d
ZedefddZ	edefddZ
edefddZedefddZdS )LayerScatterModeslayer_input_mode	attn_modemlp_modemiddle_residual_modelayer_output_modec                 K   s:   t di |}| | |tj| || || |dS )N)r   r   r   r   r   r9   )r   _compute_layer_input_moder<   r>   _compute_mlp_mode_compute_middle_residual_mode_compute_layer_output_mode)clskwargscontextr9   r9   r:   init_new
  s   zLayerScatterModes.init_newr   c                 C   s    |j dkr	t S | | S r3   )r   r<   r?   r   r   r   r   r9   r9   r:   r     s   
z+LayerScatterModes._compute_layer_input_modec                 C   s4   |j rt  rt rtjS tjS t rtjS tjS rJ   )r   r!   rg   r"   r<   r=   rD   rh   r   r9   r9   r:   r     s   z#LayerScatterModes._compute_mlp_modec                 C   s   |j  o|jot ot jS rJ   )r   r   rh   r$   enable_two_batch_overlapr   r9   r9   r:   _should_gather_for_tbo.  s   z(LayerScatterModes._should_gather_for_tboc                 C   s.   |  |}|tjkrtjS |tjkrtjS trJ   )r   r<   r=   rD   r>   NotImplementedErrorr   r   r   r9   r9   r:   r   7  s   


z/LayerScatterModes._compute_middle_residual_modec                 C   sV   |  |}|j|jd krt S |tjkr!| |rtjS tjS |tjkr)tjS t	Nrc   )
r   r   r   r<   r?   r=   r   r>   rD   r   r   r9   r9   r:   r   @  s   



z,LayerScatterModes._compute_layer_output_modeN)r@   rA   rB   r<   r   classmethodr   r   r   r   r   r   r   r9   r9   r9   r:   r     s$   
 

r   c                   C   s   t  jdkS r   )r$   moe_dense_tp_sizer9   r9   r9   r:   rh   N  s   rh   c                   @   s.  e Zd Z			d#dedejjdejjdededee	 fd	d
Z
dd Z		d$dejdejdedeeej  deej f
ddZ		d%dejdejdededeej f
ddZdejdejdeejejf fddZ	d&dejdejdefddZdejdejdefddZdefdd Zdedefd!d"ZdS )'LayerCommunicatorFNlayer_scatter_modesinput_layernormpost_attention_layernormallow_reduce_scatteris_last_layerrI   c                 C   sJ   || _ || _|| _|| _|| _|| _t | _| 	  t
t j| _d S rJ   )r   r   r   r   r   rI   CommunicateContextr   _context_post_init_communicater%   from_stringr$   rj   _speculative_algo)rN   r   r   r   r   r   rI   r9   r9   r:   rO   S  s   


zLayerCommunicator.__init__c                 C   sj   t j| jj| jj| jd| _tj| jj| jj| jj| jj	| jd| _
tj| jj| jj	| jj| jd| _d S )N
input_modeoutput_moder   hidden_states_input_moderesidual_input_modehidden_states_output_moderesidual_output_moder   r   r   r   r   )CommunicateSimpleFnget_fnr   r   r   r   _communicate_simple_fn&CommunicateWithAllReduceAndLayerNormFnr   r   ._communicate_with_all_reduce_and_layer_norm_fnCommunicateSummableTensorPairFnr   $_communicate_summable_tensor_pair_fnrZ   r9   r9   r:   r   j  s(   
z(LayerCommunicator._post_init_communicaterG   residualrH   captured_last_layer_outputspost_residual_additionc                 C   sR   | j ||||d\}}|d ur%| j||| jd}||u r | }|| ||fS )N)r   rG   rH   r   )prepare_attnr   r   cloneappend)rN   rG   r   rH   r   r   gathered_last_layer_outputr9   r9   r:   +prepare_attn_and_capture_last_layer_outputs  s    

z=LayerCommunicator.prepare_attn_and_capture_last_layer_outputs quant_formatc           	      C   s  t  jr| ||\}}|jd dkr|}n|d ur,t|dr,|jr,| j||\}}n|d u rs|}trMt	rMd|v rMt
|| jj| jjd d d d ^}}}njtrmt	rmd|v rmt|| jj| jjd d d dtjd dd
\}}}}nJ| |}nDtrt	rd|v rt
|| jj| jjd d d |^}}}n)trt	rd|v rt|| jj| jjd d d dtj|dd
\}}}}n	| |||\}}| j||| jd}| jd urt||| j}t  | ||fS )	Nr   _sglang_needs_allreduce_fusionmxfp4fp8   F)inp2inp2_weightinp2_epsilon
group_sizedtype_quantres1output_unquantized_inp1r   )rX   rY   _tp_reduce_scatterrR   hasattrr   r   forward_with_allreduce_fusion
_use_aiter_is_gfx95_supportedr0   weightvariance_epsilonr/   r]   float8_e4m3fnr   r   rI   rF   rx   )	rN   rG   r   rH   r   r   __resrw   r9   r9   r:   r     s   		
zLayerCommunicator.prepare_attnreturnc                 C   s   |j d dkr||fS |j d | jj dks&J d|j d  d| jj d|j d | jj }|j|g|j dd  R  }t || |d urS|| jj| jj }||fS )Nr   zExpected total tokens z % tp_size z to be 0rc   )rR   r   tp_sizerS   r   reduce_scatter_tensortensor_splittp_rank)rN   rG   r   local_tokensrV   r9   r9   r:   r     s   z$LayerCommunicator._tp_reduce_scatterc                 C   s(   |d ur|| j _| j|||| j| j dS )NrG   r   rH   	layernormr   )r   cacher   r   )rN   rG   r   rH   r   r9   r9   r:   prepare_mlp  s   zLayerCommunicator.prepare_mlpc                 C   s   | j |||| j| jdS )N)rG   r   rH   r   r   )r   r   r   )rN   rG   r   rH   r9   r9   r:   postprocess_layer*  s   z#LayerCommunicator.postprocess_layerc                 C   sF   | j sdS | jtju r|j rdS t|rdS t jr!| j	s!dS dS )NFT)
r   r   r   _scatter_hidden_statesdp_padding_mode
is_max_lenr   rX   rY   r   ru   r9   r9   r:   should_use_reduce_scatter8  s   z+LayerCommunicator.should_use_reduce_scatterc                 C   s`   t  r| jd ur| j rdS t jrdS t|dr |jjd nd}t|o/| j	 o/| j
jdkS )NFrQ   r   rc   )r    r   is_eaglerX   rY   r   rQ   rR   r;   r   r   r   )rN   rH   r2   r9   r9   r:   )should_fuse_mlp_allreduce_with_next_layerH  s"   

z;LayerCommunicator.should_fuse_mlp_allreduce_with_next_layer)FFN)NN)r   NrJ   )r@   rA   rB   r   r]   nnModuler   r
   r   rO   r   r^   r#   r	   r   strr   r   r   r   r   r   r   r9   r9   r9   r:   r   R  s    


i


r   c                   @   sx   e Zd ZU eeef ed< eed< eed< eed< eed< eed< eed< dZeed	< d
edefddZe	dd Z
dS )r   process_group_sizesattn_tp_rankattn_tp_sizeattn_dp_sizeattn_cp_rankattn_cp_sizer   Nr   abc                 C   s   | j | | j | kS rJ   )r   )rN   r   r   r9   r9   r:   is_same_group_sizen  s   z%CommunicateContext.is_same_group_sizec           	   
   C   sX   t  }t }t }t }t }t }t }tjdtj	|tj
|i}| ||||||||dS )Nrc   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r<   r=   r>   rD   )	r   r   r   r   r   r   r   r   r   r9   r9   r:   r   q  s*   zCommunicateContext.init_new)r@   rA   rB   r   r<   r   r   r   r   r   r   r9   r9   r9   r:   r   b  s   
 r   c                	   @   sj   e Zd ZedededefddZedejde	dedejfd	d
Z
edejde	dedejfddZdS )r   r   r   r   c                 C   s@   | | |r	tjS | tjkr|tjkrtjS td| d|)Nzinput_mode= output_mode=)r   r   _trivialr<   r=   r>   _scattered_to_tp_attn_fullr   r   r9   r9   r:   r     s   

zCommunicateSimpleFn.get_fnrG   rH   r   c                 C   s   | S rJ   r9   r   r9   r9   r:   r     s   zCommunicateSimpleFn._trivialc                 C   s   t  | } }t| | | S rJ   r   r   )rG   rH   r   local_hidden_statesr9   r9   r:   r     s   z.CommunicateSimpleFn._scattered_to_tp_attn_fullN)r@   rA   rB   rE   r<   r   r   r]   r^   r#   r   r   r9   r9   r9   r:   r     s:    r   c                   @   s   e Zd ZdZedededededef
ddZed	ej	d
ej	de
dejjdef
ddZed	ej	d
ej	de
dejjdef
ddZed	ej	d
ej	de
dejjdef
ddZed	ej	d
ej	dejjdefddZdS )r   zpBesides communication, needs to
    1. All reduce in tp_attn_group on hidden_states
    2. Apply layer norm
    r   r   r   r   r   c              	   C   s   | | |r| ||r|jdkrtjS | tjkr2|tjtjfv r2|tjkr2|tjkr2ttj	|dS | tjkrP|tjtjfv rP|tjkrP|tjkrPttj
|dS td| d|d|d|)Nrc   )r   hidden_states_input_mode= residual_input_mode=z hidden_states_output_mode=z residual_output_mode=)r   r   r   _simpler<   r>   r=   rD   r   "_gather_hidden_states_and_residual#_scatter_hidden_states_and_residualr   r   r9   r9   r:   r     s6   








z-CommunicateWithAllReduceAndLayerNormFn.get_fnrG   r   rH   r   c                 C   s$   | j d dkr|| |\} }| |fS r3   )rR   r   r9   r9   r:   r     s   	z.CommunicateWithAllReduceAndLayerNormFn._simplec          
      C   sf  t  jrt| |||S |tjkr!|jdkr!t |}}t|| |j	dkr|j
dkr/| |7 } |jdk}|r\| jd dkr\| }tt t  d || } W d    n1 sWw   Y  t | } }t| || |s{t|| | | jd dkr{|| } | |fS t| jd rt|dr|| |\} }| |fS t| } tr|jd urt| |j}	|| |\} }| |fS )Nrc   r   )disabledr   )rX   rY   r   &_tp_all_reduce_with_scattered_residualr<   r=   r   r   r   r   r   rR   r   r   r   r   r   r   r;   r   r   r   _is_npur   r1   )
rG   r   rH   r   r   r   local_residualuse_layer_norm_before_gatherr   r   r9   r9   r:   r     sZ   





zICommunicateWithAllReduceAndLayerNormFn._gather_hidden_states_and_residualc                C   s`   | }|  |j|j } t| | |tjkr| |j|j }| jd dkr,|| |\} }| |fS r3   )r   r   r   r   r<   r>   rR   )rG   r   rH   r   r   r   input_hidden_statesr9   r9   r:   r  7  s   



zJCommunicateWithAllReduceAndLayerNormFn._scatter_hidden_states_and_residualc                 C   sH   | j d dkr| | fS | |j|j }||7 }t| }||} | |fS r3   )rR   r   r   r   r   )rG   r   r   r   scattered_statesr9   r9   r:   r  L  s   zMCommunicateWithAllReduceAndLayerNormFn._tp_all_reduce_with_scattered_residualN)r@   rA   rB   rC   rE   r<   r   r   r]   r^   r#   r   r   r   r   r  r  r9   r9   r9   r:   r     sx    /=r   c                   @   s   e Zd ZdZedd Zededededefdd	Z	ed
e
jde
jdedefddZe	dd
e
jde
jdededef
ddZed
e
jde
jdedefddZed
e
jde
jdedefddZdS )r   z^It is allowed to make (hidden_states, residual) := (hidden_states + residual, None) if needed.c                 K   s    | j ||||ddd|i|S )Nr   r   r9   )r   )r   r   r   r   r   r   r9   r9   r:   execute`  s   	z'CommunicateSummableTensorPairFn.executer   r   r   r   c                 C   s   | | |r| ||rtjS | tjkr!|tjkr!|tjkr!tjS | tjkr3|tjkr3|tjkr3tjS | tjkrE|tjkrE|tjkrEtj	S t
d| d|d|)Nr   r   r   )r   r   r   r<   rD   r>   r   r=   _gather_scatterr   r   r9   r9   r:   r   p  s*   









z&CommunicateSummableTensorPairFn.get_fnrG   r   rH   c                 K   s   | |fS rJ   r9   )rG   r   rH   r   r   r9   r9   r:   r     s   z(CommunicateSummableTensorPairFn._trivialFr   c                 C   s@   t  | } }|r|j rt| | | |fS t| || | |fS rJ   )r   r   r   r   r   )rG   r   rH   r   r   global_hidden_statesr9   r9   r:   r     s   	
z6CommunicateSummableTensorPairFn._scatter_hidden_statesc                 K   s*   | |7 } d }t  | } }t| | | |fS rJ   r   )rG   r   rH   r   r   r   r9   r9   r:   r
    s   z'CommunicateSummableTensorPairFn._gatherc                 C   s2   |d u sJ dt | |j}||j } | |fS )Nznot yet handled residual!=None)listr   r   r   )rG   r   rH   r   tensor_listr9   r9   r:   r    s   
z(CommunicateSummableTensorPairFn._scatterN)F)r@   rA   rB   rC   r   r	  rE   r<   r   r   r]   r^   r#   r   r   r   r
  r  r9   r9   r9   r:   r   ]  sr    
$	r   )Yrk   
contextlibr   dataclassesr   enumr   r   	functoolsr   typingr   r   r	   r
   r   r]   sglang.srt.distributedr   r   r   r   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   %sglang.srt.layers.attention.nsa.utilsr   r   sglang.srt.layers.dp_attentionr   r   r   r   r   r   r   r   r   r   r   r   r   r    sglang.srt.layers.moer!   r"   ,sglang.srt.model_executor.forward_batch_infor#   sglang.srt.server_argsr$    sglang.srt.speculative.spec_infor%   sglang.srt.utilsr&   r'   r(   r)   r*   r+   r,   r-   rf   r6   r4   r5   r   r   r   aiter.ops.triton.fused_fp8_quantr/   /sglang.srt.layers.quantization.rocm_mxfp4_utilsr0   #sglang.srt.hardware_backend.npu.cmor1   r7   r   r;   r<   rF   r_   r   rX   r   r   rh   r   r   r   r   r   r9   r9   r9   r:   <module>   sb   @(

,>L  )* (