o
    پiE                     @  s0  U d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZmZmZmZmZmZmZmZmZmZmZ d dlm Z  d dl!m"Z"m#Z# e
rmd d	l$m%Z% d d
l&m'Z' e(e)Z*e
rzd dl+m,Z, da-de.d< da/de.d< da0de.d< da1de.d< da2de.d< e# Z3e3oe"dZ4G dd deZ5G dd dZ6	dddd Z7dd#d$Z8dd%d&Z9dd'd(Z:dd)d*Z;dd,d-Z<dd.d/Z=dd1d2Z>dd4d5Z?dd7d8Z@dd9d:ZAdd;d<ZB	=ddd?d@ZCdAdB ZDddGdHZEddIdJZFddKdLZGddNdOZHddPdQZIddRdSZJddTdUZKddVdWZLddXdYZMddZd[ZNdd\d]ZOdd^d_ZPdd`daZQedbdc ZRddgdhZSejTddldmZUdndo ZVdpdq ZWddudvZXddwdxZYddydzZZdd{d|Z[dd}d~Z\dddZ]dddZ^dddZ_dddZ`dddZadddZbdddZcdddZddS )    )annotationsN)contextmanager)IntEnumauto)TYPE_CHECKINGListOptionalTuple)GroupCoordinator$get_attn_context_model_parallel_rank*get_attn_context_model_parallel_world_sizeget_attn_cp_group#get_attn_tensor_model_parallel_rank)get_attn_tensor_model_parallel_world_sizeget_attn_tp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeget_tp_group tensor_model_parallel_all_reduce)use_symmetric_memory)get_bool_env_varis_hip)ModelConfig)
ServerArgs)ForwardBatchzOptional[int]_ATTN_DP_RANK_ATTN_DP_SIZE_LOCAL_ATTN_DP_SIZE_LOCAL_ATTN_DP_RANKFbool_ENABLE_DP_ATTENTION_FLAGSGLANG_USE_ROCM700Ac                   @  sD   e Zd Ze Ze Zdd Zdd Zeddd	Z	edd
dZ
dS )DpPaddingModec                 C  
   | t jkS N)r"   MAX_LENself r(   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/dp_attention.py
is_max_len9      
zDpPaddingMode.is_max_lenc                 C  r#   r$   )r"   SUM_LENr&   r(   r(   r)   
is_sum_len<   r+   zDpPaddingMode.is_sum_lenglobal_num_tokens	List[int]returnc                 C  s8   |rt jS t|}t|}|d |t  kr| jS | jS )N   )r"   r,   maxsumget_attention_dp_sizer%   )clsis_extend_in_batchr.   max_lensum_lenr(   r(   r)   get_dp_padding_mode?   s   z!DpPaddingMode.get_dp_padding_modec                 C  s   t r| jS | jS r$   )_USE_ROCM700A_WAr,   r%   r5   r(   r(   r)   get_default_mode_in_cuda_graphN   s   z,DpPaddingMode.get_default_mode_in_cuda_graphN)r.   r/   r0   r"   )r0   r"   )__name__
__module____qualname__r   r%   r,   r*   r-   classmethodr9   r<   r(   r(   r(   r)   r"   2   s    r"   c                   @  s  e Zd ZU ded< ded< ded< ded< ded< d	ed
< ded< d	ed< ed4ddZe	d5d6ddZed7ddZed7ddZed8d d!Z	ed8d"d#Z
ed9d%d&Zed8d'd(Zed:d)d*Zed;d+d,Zed<d.d/Zed=d0d1Zed=d2d3ZdS )>_DpGatheredBufferWrapperint_hidden_sizetorch.dtype_dtypetorch.device_device_global_dp_buffer_len_local_dp_buffer_lenr   _dp_max_paddingOptional[List[int]]_global_num_tokens_is_extend_in_batchhidden_sizedtypedevicec                 C  s   || _ || _|| _d S r$   )rC   rE   rG   )r5   rN   rO   rP   r(   r(   r)   set_metadatac   s   
z%_DpGatheredBufferWrapper.set_metadataNglobal_dp_buffer_lenlocal_dp_buffer_lendp_max_paddingr.   c                 C  s   || _ || _|| _|| _d S r$   )rH   rI   rJ   rL   )r5   rR   rS   rT   r.   r(   r(   r)   set_dp_buffer_leni   s   
z*_DpGatheredBufferWrapper.set_dp_buffer_lenr0   torch.Tensorc                 C  T   t t | j d tj| j| jf| j| jd}W d    |S 1 s#w   Y  |S N)disabled)rO   rP   )	r   r   rJ   torchemptyrH   rC   rE   rG   r5   bufferr(   r(   r)   get_global_dp_bufferv      

z-_DpGatheredBufferWrapper.get_global_dp_bufferc                 C  rW   rX   )	r   r   rJ   rZ   r[   rI   rC   rE   rG   r\   r(   r(   r)   get_local_dp_buffer   r_   z,_DpGatheredBufferWrapper.get_local_dp_bufferc                 C     | j S r$   )rH   r;   r(   r(   r)   get_global_dp_buffer_len      z1_DpGatheredBufferWrapper.get_global_dp_buffer_lenc                 C  ra   r$   )rI   r;   r(   r(   r)   get_local_dp_buffer_len   rc   z0_DpGatheredBufferWrapper.get_local_dp_buffer_lenr/   c                 C  ra   r$   )rL   r;   r(   r(   r)   get_dp_global_num_tokens   rc   z1_DpGatheredBufferWrapper.get_dp_global_num_tokensc                 C  ra   r$   )rC   r;   r(   r(   r)   get_dp_hidden_size   rc   z+_DpGatheredBufferWrapper.get_dp_hidden_sizec                 C  ra   r$   )rE   r;   r(   r(   r)   get_dp_dtype   rc   z%_DpGatheredBufferWrapper.get_dp_dtypec                 C  ra   r$   )rG   r;   r(   r(   r)   get_dp_device   rc   z&_DpGatheredBufferWrapper.get_dp_devicer6   c                 C  s
   || _ d S r$   rM   )r5   r6   r(   r(   r)   set_is_extend_in_batch   s   
z/_DpGatheredBufferWrapper.set_is_extend_in_batchc                 C  ra   r$   ri   r;   r(   r(   r)   get_is_extend_in_batch   rc   z/_DpGatheredBufferWrapper.get_is_extend_in_batchc                 C  ra   r$   )rJ   r;   r(   r(   r)   is_dp_max_padding   rc   z*_DpGatheredBufferWrapper.is_dp_max_padding)rN   rB   rO   rD   rP   rF   r$   rR   rB   rS   rB   rT   r   r.   rK   r0   rV   r0   rB   r0   r/   r0   rD   r0   rF   r6   r   r0   r   )r=   r>   r?   __annotations__r@   rQ   rU   r^   r`   rb   rd   re   rf   rg   rh   rj   rk   rl   r(   r(   r(   r)   rA   X   sH   
 		rA   rR   rB   rS   rT   r.   rK   c                 C  s   t | ||| d S r$   )rA   rU   )rR   rS   rT   r.   r(   r(   r)   rU      s   rU   r0   rV   c                   C     t  S r$   )rA   r^   r(   r(   r(   r)   r^         r^   c                   C  rv   r$   )rA   r`   r(   r(   r(   r)   r`      rw   r`   c                   C  rv   r$   )rA   rb   r(   r(   r(   r)   rb      rw   rb   c                   C  rv   r$   )rA   rd   r(   r(   r(   r)   rd      rw   rd   r/   c                   C  rv   r$   )rA   re   r(   r(   r(   r)   re      rw   re   c                   C  rv   r$   )rA   rf   r(   r(   r(   r)   rf      rw   rf   rD   c                   C  rv   r$   )rA   rg   r(   r(   r(   r)   rg      rw   rg   rF   c                   C  rv   r$   )rA   rh   r(   r(   r(   r)   rh      rw   rh   r6   c                 C  s   t |  d S r$   )rA   rj   )r6   r(   r(   r)   rj         rj   c                   C  rv   r$   )rA   rk   r(   r(   r(   r)   rk      rw   rk   c                   C  rv   r$   )rA   rl   r(   r(   r(   r)   rl      rw   rl      attn_cp_sizec           	      C  s@   | r|nd}|| | }|| }| sd}n|||  }|||fS Nry   r   r(   )	enable_dp_attentiontp_ranktp_sizedp_sizerz   attn_dp_sizeattn_tp_sizeattn_tp_rankattn_dp_rankr(   r(   r)   compute_dp_attention_world_info   s   
r   c                 C  sV   | s||dfS |r|n|}|| }t d|||  }|| }|| }	|| }
|
||	fS )Nr   ry   )r2   )r|   r}   r~   r   moe_dense_tp_sizelocal_tp_sizelocal_tp_ranklocal_dp_sizelocal_attn_tp_sizelocal_attn_dp_ranklocal_attn_tp_rankr(   r(   r)   compute_dp_attention_local_info   s   

r   server_argsr   model_configr   c           	      C  s   | j }| j}| j}| j}|at }t }t|||||\}}at	|||||\}}a
|r?|a|d u r5tantd|||  andadatj|j|jt| jd d S )Nry   )rN   rO   rP   )r|   r   r   rz   r    r   r   r   r   r   r   r   r   r2   rA   rQ   rN   rO   rZ   rP   )	r   r   r|   r   r   rz   r}   r~   _r(   r(   r)   initialize_dp_attention  s2   





r   c                   C  s   t S r$   )r    r(   r(   r(   r)   is_dp_attention_enabled0  s   r   c                   C  s   t   pt S r$   )r   rl   r(   r(   r(   r)   is_allocation_symmetric4  rx   r   r
   c                   C     t  S r$   )r   r(   r(   r(   r)   get_attention_tp_group8     r   c                   C  r   r$   )r   r(   r(   r(   r)   get_attention_tp_rank<  r   r   c                   C  r   r$   )r   r(   r(   r(   r)   get_attention_tp_size@  r   r   c                   C  r   r$   )r   r(   r(   r(   r)   get_attention_cp_groupD  r   r   c                   C  r   r$   )r   r(   r(   r(   r)   get_attention_cp_rankH  r   r   c                   C  r   r$   )r   r(   r(   r(   r)   get_attention_cp_sizeL  r   r   c                   C     t d usJ dt S Ndp attention not initialized!)r   r(   r(   r(   r)   get_attention_dp_rankP     r   c                   C  r   r   r   r(   r(   r(   r)   r4   U  r   r4   c                   C  r   r   )r   r(   r(   r(   r)   get_local_attention_dp_rankZ  r   r   c                   C  r   r   )r   r(   r(   r(   r)   get_local_attention_dp_size_  r   r   c                  c  s2    t dus	J dt } da zdV  W | a dS | a w )a  Patch the tp group temporarily until this function ends.

    This method is for draft workers of speculative decoding to run draft model
    with different tp degree from that of target model workers.

    Args:
        tp_group (GroupCoordinator): the tp group coordinator
    Nr   ry   r   )old_dp_sizer(   r(   r)   disable_dp_sized  s   r   forward_batchr   !Tuple[torch.Tensor, torch.Tensor]c                 C  sf   t  }| jd u r-tj| jdd}|dkrt|d }n||d  }| j| }|| _|| _| j| jfS )Nr   )dimry   )r   dp_local_start_posrZ   cumsumglobal_num_tokens_gpu
zeros_likedp_local_num_tokens)r   dp_rank	cumtokenslocal_start_poslocal_num_tokensr(   r(   r)   get_dp_local_infoy  s   

r   
offset_srctl.constexpr
BLOCK_SIZEc                 C  s   t jddt j}t |t j| }t |t j| }	|| }
t d|}|
| |	k }|rMt j|| |
 | |d}t j| |
 | ||d d S t j||
 | |d}t j| | |
 | ||d d S )Nr   )axis)mask)tl
program_idtoint64loadarangestore)dst_ptrsrc_ptr
offset_ptrsz_ptrr   
chunk_sizer   pidoffsetszstart_indexoffsr   datar(   r(   r)   memcpy_triton_kernel  s   
 r   c                 C  s   t dd | dS )Nc                 S  s   | | S r$   r(   )abr(   r(   r)   <lambda>  s    zprod.<locals>.<lambda>ry   )	functoolsreduce)xr(   r(   r)   prod  s   r   c           
      C  s   t | |  }|dksJ d|jdd  | jdd  ks#J dt|jdd  }d}t||f}	t|	 | |||||| d S )Nr   zdim != 0 unsupportedry   z src and dst must have same shapei    )minnumelshaper   tritoncdivr   )
dstsrcr   r   r   r   max_sizer   r   gridr(   r(   r)   memcpy_triton  s   $r   global_tokenslocal_tokens
is_partialc                 C  s   t |\}}| d | sJ |  sJ |jd dkr:|s%t dkr:| |  us1J dt| |d||d d}|jjsUt	 |krUddl
m} || t jd d S t| | d d < d S )Nr   z;aliasing between global_tokens and local_tokens not allowedF   )inplace_all_reduce)
group_name)r   fill_is_contiguousr   r   untyped_storager   rO   is_floating_pointr   %sglang.srt.distributed.parallel_stater   r   unique_namer   )r   r   r   r   r   r   NUM_GPUS_PER_NODEr   r(   r(   r)   _dp_gather_via_all_reduce  s$   

r   c                 C  sf   t  dkrt | | d S |st dkr|d |t  t  }t || t | | d S r{   )r   r   all_gather_into_tensorr   r   tensor_splitr   reduce_scatter_tensor)r   r   r   r   scattered_local_tokensr(   r(   r)   _dp_gather_via_all_gather  s   



r   c                 C  s.   |j  rt| ||| d S t| ||| d S r$   )dp_padding_moder*   r   r   )r   r   r   r   r(   r(   r)   
_dp_gather  s   
r   c                 C     t | ||dd d S )NTr   r   r   r   r   r(   r(   r)   dp_gather_partial     r   c                 C  r   )NFr   r   r   r(   r(   r)   dp_gather_replicate  r   r   c                 C  sn   t |\}}| d |  sJ | sJ | jd dkr5|  | us*J dt| |d||d d S d S )Nr   z;aliasing between local_tokens and global_tokens not allowedT)r   r   r   r   r   r   )r   r   r   r   r   r(   r(   r)   
dp_scatter  s   
r   outputinputc                 C  sP   t  t krt | | d S |t  t  }t || t | | d S r$   )r   r4   r   r   r   r   r   r   )r   r   r   r(   r(   r)   dp_reduce_scatter_tensor!  s   r   c                 C     t  | |S r$   )r   r   r   r   r(   r(   r)   attn_tp_reduce_scatter_tensor,  rx   r   c                 C  r   r$   )r   r   r   r(   r(   r)   attn_cp_reduce_scatter_tensor0  rx   r   c                 C  s   t  | S r$   )r   
all_reduce)r   r(   r(   r)   attn_tp_all_reduce4  s   r   c                 C  r   r$   )r   r   r   r(   r(   r)   attn_tp_all_gather_into_tensor8  rx   r  c                 C  r   r$   )r   r   r   r(   r(   r)   attn_cp_all_gather_into_tensor<  rx   r  output_listList[torch.Tensor]c                 C  s   t  j|| dS )N)output_tensor_list)r   
all_gather)r  r   r(   r(   r)   attn_tp_all_gather@  s   r  r$   rm   rn   ro   rp   rq   rr   rs   rt   )ry   )rz   rB   )r   r   r   r   )r0   r
   )r   r   r0   r   )r   r   r   r   )r   rV   r   rV   r   r   r   r   )r   rV   r   rV   r   r   )r   rV   r   rV   r   r   )r   rV   r   rV   )r   rV   )r  r  r   rV   )e
__future__r   r   logging
contextlibr   enumr   r   typingr   r   r   r	   rZ   r   triton.languagelanguager   sglang.srt.distributedr
   r   r   r   r   r   r   r   r   r   r   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   sglang.srt.utilsr   r   sglang.srt.configs.model_configr   sglang.srt.server_argsr   	getLoggerr=   logger,sglang.srt.model_executor.forward_batch_infor   r   ru   r   r   r   r    _is_hipr:   r"   rA   rU   r^   r`   rb   rd   re   rf   rg   rh   rj   rk   rl   r   r   r   r   r   r   r   r   r   r   r   r   r4   r   r   r   r   jitr   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r(   r(   r(   r)   <module>   s    4
&[












(














#









