o
    پicT                     @   s  d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	 ddl
Z
ddlZddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ eeZedZe	dZe Zejj d de!de!fddZ"deeef deeef fddZ#G dd deZ$G dd de$Z%G dd de$Z&dZ'zze(  dZ'W n e)y   dZ'Y nw W e're*  ne're*  w w e're%ne&Z+zddl,m-Z- e.ee-se+/  W dS W dS  e0y   e+/  Y dS w )z~Code inside this file can safely assume cuda platform, e.g. importing
pynvml. However, it should not initialize cuda context.
    N)Callable)	lru_cachewraps)AnyTypeVar)	ParamSpec)envs)AttentionBackendEnumDeviceCapabilityPlatformPlatformEnum)init_logger)import_pynvml_P_RF	device_idreturnc                 C   sD   dt jv r t jd d}|dgkrd}t|||  }t|S | S )NCUDA_VISIBLE_DEVICES, a
  CUDA_VISIBLE_DEVICES is set to empty string, which means GPU support is disabled. If you are using ray, please unset the environment variable `CUDA_VISIBLE_DEVICES` inside the worker/actor. Check https://github.com/vllm-project/vllm/issues/8402 for more information.)osenvironsplitRuntimeErrorint)r   
device_idsmsgphysical_device_id r   `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/platforms/cuda.pydevice_id_to_physical_device_id(   s   

r    fnc                    s*   t  dtjdtjdtf fdd}|S )Nargskwargsr   c                     s,   t   z | i |W t   S t   w N)pynvmlnvmlInitnvmlShutdown)r"   r#   r!   r   r   wrapper<   s   z"with_nvml_context.<locals>.wrapper)r   r   r"   r#   r   )r!   r)   r   r(   r   with_nvml_context;   s    r*   c                   @   st  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< edejfd	d
Zed/dededB fddZed/dedefddZeeddd/dedefddZededB defddZedee defddZed0ddZe	d1dejjdB defdd Ze		!	"	d2ded#ed$ed%edef
d&d'Z ed(e!dB d)ed*ej"defd+d,Z#edefd-d.Z$dS )3CudaPlatformBasecudadevice_namedevice_typeCUDAdispatch_keyr   device_control_env_varr   c                 C   s   t dtj S )Nzcuda:)torchdevicer   
LOCAL_RANKclsr   r   r   get_local_torch_deviceN   s   z'CudaPlatformBase.get_local_torch_devicer   r   Nc                 C      t r$   NotImplementedErrorr6   r   r   r   r   get_device_capabilityR      z&CudaPlatformBase.get_device_capabilityc                 C   r8   r$   r9   r;   r   r   r   get_device_nameV   r=   z CudaPlatformBase.get_device_name   maxsizec                 C   r8   r$   r9   r;   r   r   r   get_device_total_memoryZ   s   z(CudaPlatformBase.get_device_total_memoryenforce_eagerc                 C   s   |r	t d dS dS )NzTo see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be usedFT)loggerwarning)r6   rC   r   r   r   is_async_output_supported_   s   z*CudaPlatformBase.is_async_output_supportedr   c                 C   r8   r$   r9   )r6   r   r   r   r   is_full_nvlinkj   r=   zCudaPlatformBase.is_full_nvlinkc                 C   s   d S r$   r   r5   r   r   r   log_warningsn   r=   zCudaPlatformBase.log_warningsr3   c                 C   s   t j| tt j|S r$   )r2   r,   reset_peak_memory_statsfloatmax_memory_allocated)r6   r3   r   r   r   get_current_memory_usager   s   z)CudaPlatformBase.get_current_memory_usageFTdistributedempty_cache	cpu_groupc           
      C   s   |rt j  t j rt j }t j|}|jr t	 j
}nt j|\}}|rIdd lm} t j|t jdd}	|j|	|jj|d t|	 }|d S )Nr   r,   )dtyper3   )opgroupi   @)r2   r,   rN   rM   is_initializedget_rankget_device_propertiesis_integratedpsutilvirtual_memory	availablemem_get_infotorch.distributedtensorfloat32
all_reduceReduceOpMINrJ   item)
r6   r   rM   rN   rO   device_propsfree_gpu_memory_distr\   r   r   r   get_available_gpu_memoryy   s   


z)CudaPlatformBase.get_available_gpu_memoryselected_backend	head_sizerP   c              
   C   s  d }|t jkr5zddlm} ddlm} td W dS  ty4 } zt	dt
| td|d }~ww |t jkrozddlm} dd	lm}	 td
 W dS  tyn } zt| td t j}W Y d }~nd }~ww |t jkrzddlm}
 td W dS  ty } zt| td t j}W Y d }~nRd }~ww |t jkrzddlm} ddlm} td W dS  ty } zt	dt
| td|d }~ww |t jkrz"ddlm}m} ddlm}m }m!}m"} ddl#m$} td W dS  ty } zt	dt
| td|d }~ww |t j%krNzddl&m'} ddl(m)} td  W d!S  tyM } zt	d"t
| td#|d }~ww |t j*kr[td$ d%S |t jkrhtd& d'S |t j+krutd( d)S |t j,krtd* d+S |t j-krdd,l.m/} td- d.S |t jfv r| 0 rtd/ t j}nF| 1 rdd0l2m3} |d1 t j}n3t j}n/|rt4d2| j5 | 0 rtd3 t j}n| 1 rdd0l2m3} |d1 t j}nt j}|d u rt j}|t jkr| 1 rdd0l2m3} |d1 | 6d4std5 t j}n|t7j8t7j9fvr-td6 t j}|t jkr`zdd7l2m:} |; }||vrLtd8| t j}W n ty_   td9 t j}Y nw |t jkrmtd& d'S td: d;S )<Nr   )sliding_tile_attention)SlidingTileAttentionBackendz$Using Sliding Tile Attention backendzesglang.multimodal_gen.runtime.layers.attention.backends.sliding_tile_attn.SlidingTileAttentionBackendz3Failed to import Sliding Tile Attention backend: %sz1Sliding Tile Attention backend is not installed. )sageattn)SageAttentionBackendzUsing Sage Attention backendzVsglang.multimodal_gen.runtime.layers.attention.backends.sage_attn.SageAttentionBackendzSage Attention backend is not installed (To install it, run `pip install sageattention==2.2.0 --no-build-isolation`). Falling back to Flash Attention.)SageAttention3BackendzUsing Sage Attention 3 backendzXsglang.multimodal_gen.runtime.layers.attention.backends.sage_attn3.SageAttention3BackendzSage Attention 3 backend is not installed (To install it, see https://github.com/thu-ml/SageAttention/tree/main/sageattention3_blackwell#installation). Falling back to Torch SDPA.)block_sparse_attn)VideoSparseAttentionBackendz$Using Video Sparse Attention backendzesglang.multimodal_gen.runtime.layers.attention.backends.video_sparse_attn.VideoSparseAttentionBackendz3Failed to import Video Sparse Attention backend: %sz0Video Sparse Attention backend is not installed.) apply_inverse_permutation_tritonpermute_tensor_by_labels_triton)batch_kmeans_Eucliddensity_calculation#dynamic_block_sparse_fwd_flashinferidentify_dynamic_map)SparseVideoGen2AttentionBackendz0Using Sparse Video Gen 2 (SAP) Attention backendzosglang.multimodal_gen.runtime.layers.attention.backends.sparse_video_gen_2_attn.SparseVideoGen2AttentionBackendz?Failed to import Sparse Video Gen 2 (SAP) Attention backend: %szSparse Video Gen 2 (SAP) Attention backend is not installed. Please install it by following the instructions at https://github.com/svg-project/Sparse-VideoGen)moba_attn_varlen)VMOBAAttentionBackendz"Using Video MOBA Attention backendzSsglang.multimodal_gen.runtime.layers.attention.backends.vmoba.VMOBAAttentionBackendz1Failed to import Video MoBA Attention backend: %sz/Video MoBA Attention backend is not installed. zUsing AITer backendzJsglang.multimodal_gen.runtime.layers.attention.backends.aiter.AITerBackendzUsing Torch SDPA backendzHsglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackendz%Using Sparse Linear Attention backendzgsglang.multimodal_gen.runtime.layers.attention.backends.sparse_linear_attn.SparseLinearAttentionBackendz*Using Sage Sparse Linear Attention backendzksglang.multimodal_gen.runtime.layers.attention.backends.sparse_linear_attn.SageSparseLinearAttentionBackend)FlashAttention2BackendzUsing FlashAttention2 backendz[sglang.multimodal_gen.runtime.layers.attention.backends.flash_attn_2.FlashAttention2BackendzTFlashAttention is not supported on SM12.x in this build; falling back to Torch SDPA.)
set_fa_ver   zInvalid attention backend for z*Defaulting to Torch SDPA backend on SM12.xP   z<Cannot use FlashAttention backend for Volta and Turing GPUs.zWCannot use FlashAttention backend for dtype other than torch.float16 or torch.bfloat16.)FlashAttentionBackendz3Cannot use FlashAttention backend for head size %d.zCannot use FlashAttention backend because the flash_attn package is not found. Make sure that flash_attn was built and installed (on by default).z@Using FlashAttention (FA3 for hopper, FA4 for blackwell) backendzXsglang.multimodal_gen.runtime.layers.attention.backends.flash_attn.FlashAttentionBackend)<r	   SLIDING_TILE_ATTNst_attnri   Isglang.multimodal_gen.runtime.layers.attention.backends.sliding_tile_attnrj   rD   infoImportErrorerrorstr	SAGE_ATTNsageattentionrk   Asglang.multimodal_gen.runtime.layers.attention.backends.sage_attnrl   FASAGE_ATTN_3Bsglang.multimodal_gen.runtime.layers.attention.backends.sage_attn3rm   
TORCH_SDPAVIDEO_SPARSE_ATTNvsarn   Isglang.multimodal_gen.runtime.layers.attention.backends.video_sparse_attnro   SPARSE_VIDEO_GEN_2_ATTNsvg.kernels.triton.permuterp   rq   svg.kmeans_utilsrr   rs   rt   ru   Osglang.multimodal_gen.runtime.layers.attention.backends.sparse_video_gen_2_attnrv   
VMOBA_ATTNkernel.attn.vmoba_attn.vmobarw   =sglang.multimodal_gen.runtime.layers.attention.backends.vmobarx   AITERSLA_ATTNSAGE_SLA_ATTNFA2Dsglang.multimodal_gen.runtime.layers.attention.backends.flash_attn_2ry   is_sm120is_blackwellBsglang.multimodal_gen.runtime.layers.attention.backends.flash_attnrz   
ValueErrorr-   has_device_capabilityr2   float16bfloat16r}   get_supported_head_sizes)r6   rg   rh   rP   target_backendri   rj   erk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r}   supported_sizesr   r   r   get_attn_backend_cls_str   sX  


























	

z)CudaPlatformBase.get_attn_backend_cls_strc                 C   s   dS )Nzasglang.multimodal_gen.runtime.distributed.device_communicators.cuda_communicator.CudaCommunicatorr   r5   r   r   r   get_device_communicator_cls}  r=   z,CudaPlatformBase.get_device_communicator_clsr   r   Nr$   )r   FTN)%__name__
__module____qualname__r   r/   _enumr-   r   __annotations__r.   r0   r1   classmethodr2   r3   r7   r   r
   r<   r>   r   rB   boolrF   listrG   rH   typesDevicerJ   rL   r   rf   r	   rP   r   r   r   r   r   r   r+   G   sr   
 

 gr+   c                
       s0  e Zd ZeeddeddededB fddZeedde	dd	e	eef eB dede
f fd
dZeeddeddedefddZeeddeddedefddZeeddeddedefddZeedee de
fddZeddedefddZeedddZ  ZS )NvmlCudaPlatform   r@   r   r   r   Nc                 C   sD   zt |}t|}t|\}}t||dW S  ty!   Y d S w N)majorminor)r    r%   nvmlDeviceGetHandleByIndex"nvmlDeviceGetCudaComputeCapabilityr
   r   )r6   r   r   handler   r   r   r   r   r<     s   
z&NvmlCudaPlatform.get_device_capability
capabilityc                    s*   z
t t ||W S  ty   Y dS w )NF)r   superr   r   )r6   r   r   	__class__r   r   r     s
   z&NvmlCudaPlatform.has_device_capabilityc                 C   s   t |}| |S r$   )r    _get_physical_device_name)r6   r   r   r   r   r   r>     s   
z NvmlCudaPlatform.get_device_namec                 C   s    t |}t|}tt|S r$   )r    r%   r   r   nvmlDeviceGetUUIDr6   r   r   r   r   r   r   get_device_uuid  s   
z NvmlCudaPlatform.get_device_uuidc                 C   s"   t |}t|}tt|jS r$   )r    r%   r   r   nvmlDeviceGetMemoryInfototalr   r   r   r   rB     s   
z(NvmlCudaPlatform.get_device_total_memoryphysical_device_idsc              
   C   s   dd |D }t |D ]8\}}t |D ]/\}}||k rBzt||tj}|tjkr.W   dS W q tjyA   td Y   dS w qqdS )zP
        query if the set of gpus are fully connected by nvlink (1 hop)
        c                 S   s   g | ]}t |qS r   )r%   r   .0ir   r   r   
<listcomp>      z3NvmlCudaPlatform.is_full_nvlink.<locals>.<listcomp>FzONVLink detection failed. This is normal if your machine has no NVLink equipped.T)	enumerater%   nvmlDeviceGetP2PStatusNVML_P2P_CAPS_INDEX_NVLINKNVML_P2P_STATUS_OK	NVMLErrorrD   	exception)r6   r   handlesr   r   jpeer_handle
p2p_statusr   r   r   rG     s,   


zNvmlCudaPlatform.is_full_nvlinkc                 C   s   t |}tt |S r$   )r%   r   r   nvmlDeviceGetName)r6   r   r   r   r   r   r     s   
z*NvmlCudaPlatform._get_physical_device_namec                    sh   t  }|dkr. fddt|D }tt|dkr0tjddkr2t	dd
| d S d S d S d S )Nr?   c                    s   g | ]}  |qS r   )r   r   r5   r   r   r     r   z1NvmlCudaPlatform.log_warnings.<locals>.<listcomp>CUDA_DEVICE_ORDER
PCI_BUS_IDzDetected different devices in the system: %s. Please make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to avoid unexpected behavior.z, )r%   nvmlDeviceGetCountrangelensetr   r   getrD   rE   join)r6   r   device_namesr   r5   r   rH     s   zNvmlCudaPlatform.log_warningsr   r   )r   r   r   r   r   r*   r   r
   r<   tupler   r   r   r>   r   rB   r   rG   r   rH   __classcell__r   r   r   r   r     sH    	
r   c                   @   sx   e Zd ZeddedefddZeddedefddZee	dd	ddedefd
dZ
edee defddZdS )NonNvmlCudaPlatformr   r   r   c                 C   s   t j|\}}t||dS r   )r2   r,   r<   r
   )r6   r   r   r   r   r   r   r<     s   z)NonNvmlCudaPlatform.get_device_capabilityc                 C   s   t tj|S r$   )r   r2   r,   r>   r;   r   r   r   r>     s   z#NonNvmlCudaPlatform.get_device_namer?   r@   c                 C   s   t j|}t|jS r$   )r2   r,   rU   r   total_memory)r6   r   rb   r   r   r   rB     s   
z+NonNvmlCudaPlatform.get_device_total_memoryr   c                 C   s   t d dS )Nz^NVLink detection not possible, as context support was not found. Assuming no NVLink available.F)rD   r   )r6   r   r   r   r   rG     s   z"NonNvmlCudaPlatform.is_full_nvlinkNr   )r   r   r   r   r   r
   r<   r   r>   r   rB   r   r   rG   r   r   r   r   r     s    r   T)_MockModule)1__doc__r   collections.abcr   	functoolsr   r   typingr   r   rW   r2   typing_extensionsr   sglang.multimodal_genr   1sglang.multimodal_gen.runtime.platforms.interfacer	   r
   r   r   1sglang.multimodal_gen.runtime.utils.logging_utilsr   sglang.multimodal_gen.utilsr   r   rD   r   r   r%   backendsr,   enable_cudnn_sdpr   r    r*   r+   r   r   nvml_availabler&   	Exceptionr'   CudaPlatformsphinx.ext.autodoc.mockr   
isinstancerH   ModuleNotFoundErrorr   r   r   r   <module>   s`   "  Ab

