o
    i8[                     @   s  d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	 ddl
Z
ddlmZ ddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ eraddlmZ ddlmZ ddlm Z  ndZdZee!Z"edZ#e	dZ$e Z%e
j&j'(d ede)dede*e fddZ+dee#e$f dee#e$f fddZ,G dd deZ-G dd de-Z.G dd  d e-Z/dZ0zze%1  d!Z0W n e2y   dZ0Y nw W e0re%3  ne0re%3  w w e0re.ne/Z4e45  dS )"z~Code inside this file can safely assume cuda platform, e.g. importing
pynvml. However, it should not initialize cuda context.
    N)Callable)cachewraps)TYPE_CHECKINGTypeVar)	ParamSpec)init_logger)import_pynvmlcuda_device_count_statelessAttentionBackendEnum   )DeviceCapabilityPlatformPlatformEnum)
VllmConfig)
CacheDType)AttentionSelectorConfig_P_RFuse_mladevice_capabilityreturnc                 C   st   | r!|j dkrtjtjtjtjtjtjgS tjtjtjtjtjgS |j dkr0tjtj	tj
tjgS tj	tjtj
tjgS )zEGet backend priorities with lazy import to avoid circular dependency.
   )majorr   FLASHINFER_MLACUTLASS_MLAFLASH_ATTN_MLAFLASHMLA
TRITON_MLAFLASHMLA_SPARSE
FLASHINFER
FLASH_ATTNTRITON_ATTNFLEX_ATTENTION)r   r    r&   I/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/platforms/cuda.py_get_backend_priorities,   s4   


r(   fnc                    s*   t  dtjdtjdtf fdd}|S )Nargskwargsr   c                     s,   t   z | i |W t   S t   w N)pynvmlnvmlInitnvmlShutdown)r*   r+   r)   r&   r'   wrapperV   s   z"with_nvml_context.<locals>.wrapper)r   r   r*   r+   r   )r)   r1   r&   r0   r'   with_nvml_contextU   s    r2   c                   @   s  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< dZeed	< d
Zeed< dgZee ed< edeej fddZedejddfddZedUdededB fddZedUdedefddZedUdedefddZedee defddZed d! ZedVd$d%Ze	dWdej j!dB de"fd&d'Z#ed(ed)d*de$ee$d+ef  e%d+ee f f fd,d-Z&ed.d+d)d*defd/d0Z'eded+ fd1d2Z(e	dWd3ed4ejd5d6dd+fd7d8Z)edefd9d:Z*edefd;d<Z+edefd=d>Z,edefd?d@Z-edefdAdBZ.edefdCdDZ/edefdEdFZ0ed4ejfdGdHZ1edIej2dJej2dKej2dLej2ddf
dMdNZ3edIej2dJej2dKej2dLej2ddf
dOdPZ4edefdQdRZ5edefdSdTZ6dS )XCudaPlatformBasecudadevice_namedevice_typeCUDAdispatch_keyGPUray_device_keynccldist_backendCUDA_VISIBLE_DEVICESdevice_control_env_var+RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICESray_noset_device_env_varsr   c                 C   s8   |  drtjtjtjgS |  drtjtjgS tjgS )NP   <   )has_device_capabilitytorchbfloat16float16float32)selfr&   r&   r'   supported_dtypesm   s
   

z!CudaPlatformBase.supported_dtypesdeviceNc                 C   s   t j| t jd|d}dS )z:
        Set the device for the current platform.
        r   )rJ   N)rD   r4   
set_devicezeros)clsrJ   _r&   r&   r'   rK   y   s   zCudaPlatformBase.set_devicer   	device_idc                 C      t r,   NotImplementedErrorrM   rO   r&   r&   r'   get_device_capability      z&CudaPlatformBase.get_device_capabilityc                 C   rP   r,   rQ   rS   r&   r&   r'   get_device_name   rU   z CudaPlatformBase.get_device_namec                 C   rP   r,   rQ   rS   r&   r&   r'   get_device_total_memory   rU   z(CudaPlatformBase.get_device_total_memory
device_idsc                 C   rP   r,   rQ   )rM   rX   r&   r&   r'   is_fully_connected   rU   z#CudaPlatformBase.is_fully_connectedc                 C   s   d S r,   r&   rM   r&   r&   r'   log_warnings   rU   zCudaPlatformBase.log_warningsvllm_configr   c                 C   s  ddl m} |j}|j}|jdkrd|_|j}|r!|jd u r!d|_|d ur|jr|jd urt|jj	d}d}d}d}	ddl
m}
 |jjd u rv|j}t|d	d
}| drb|sb|dkrbd}	|j|j_n'| drl|sld}n|
 d rtd}n	 n|jj}||jk}||jk}||jk}	|r|
 d r|jd dkrd|_td |r|jd dkrd|_td |	r|jdkr|jd dkrd|_td |r|jdkrd|_td |j}|d ur|jr|jr|jstd d|_d S d S d S d S d S )Nr   r   autoz vllm.v1.worker.gpu_worker.Worker   
index_topkF)is_flashmla_dense_supportedqk_nope_head_dimr   d      T@   z7Forcing kv cache block size to 64 for FlashMLA backend.z;Forcing kv cache block size to 128 for CUTLASS_MLA backend.    z<Forcing kv cache block size to 64 for FlashInferMLA backend.z=Forcing kv cache block size to 64 for FlashMLASparse backend.zVForcing --disable_chunked_mm_input for models with multimodal-bidirectional attention.)#vllm.v1.attention.backends.registryr   parallel_configmodel_config
worker_clscache_config
block_sizer   hasattr	hf_configvllm.v1.attention.ops.flashmlar`   attention_configbackendhf_text_configgetattris_device_capability_familyr   r   r   loggerinfoscheduler_configis_mm_prefix_lmis_multimodal_modeldisable_chunked_mm_inputwarning)rM   r\   r   rg   rh   rj   
use_sparseuse_flashmlause_cutlass_mlause_flashinfer_mlar`   rq   ra   rp   rv   r&   r&   r'   check_and_update_config   s   
	







z(CudaPlatformBase.check_and_update_configc                 C   s"   t j  t j| t j|S r,   )rD   r4   empty_cachereset_peak_memory_statsmax_memory_allocated)rM   rJ   r&   r&   r'   get_current_memory_usage
  s   
z)CudaPlatformBase.get_current_memory_usager   attn_selector_configr   r   c           
   	   C   s   g }i }t |j|}t|D ]1\}}z| }|jdd|i| }	W n ty0   dg}	Y nw |	r8|	||< q|||f q||fS )Nr   ImportErrorr&   )r(   r   	enumerate	get_classvalidate_configuration_asdictr   append)
rM   r   r   valid_backends_prioritiesinvalid_reasonsbackend_prioritiespriorityrp   backend_classinvalid_reasons_ir&   r&   r'   get_valid_backends  s(   	


z#CudaPlatformBase.get_valid_backendsselected_backendc           
   	      sz  |   }|d us
J |jd d}|d urIz| }|jdd|i| }W n ty2   dg}Y nw |r?td| d| td| |	 S | j
||d\ }dd	d
d | D  d }| }td| j d| d| d t dkrtd| j d| d| dttt  fddd}|d }	 |	 d }tjd|jdd	dd  D  d dd |	 S )N)rk   r   r   zSelected backend z. is not valid for this configuration. Reason: zUsing %s backend.)r   r   {, c                 s   s,    | ]\}}|j  d d| dV  qdS )z: [r   ]N)namejoin).0rp   reasonsr&   r&   r'   	<genexpr>V  s
    
z8CudaPlatformBase.get_attn_backend_cls.<locals>.<genexpr>}z*Some attention backends are not valid for z with z. Reasons: .r   z%No valid attention backend found for c                    s    |  d S )Nr   r&   )ir   r&   r'   <lambda>k  s    z7CudaPlatformBase.get_attn_backend_cls.<locals>.<lambda>)keyz9Using %s attention backend out of potential backends: %s.[c                 s   s"    | ]}d |d j  d V  qdS )'r   N)r   )r   br&   r&   r'   r   r  s     r   local)scoper&   )rT   _replacer   r   r   r   
ValueErrorrt   ru   get_pathr   r   items__repr__
debug_oncer5   lensortedrange	info_oncer   )
rM   r   r   r   r   r   reasons_str
config_strsorted_indicesselected_indexr&   r   r'   get_attn_backend_cls1  s~   







z%CudaPlatformBase.get_attn_backend_clsc                 C   s   t jt jgS r,   )r   
TORCH_SDPAr#   rZ   r&   r&   r'   get_supported_vit_attn_backendsx  s   z0CudaPlatformBase.get_supported_vit_attn_backends	head_sizedtyperp   zAttentionBackendEnum | Nonec                 C   s   |d ur!||   v sJ d| d|    td| d |S |   }rS|jdkrSztj }||r@|	|rDtjW S W tjS W tjS  t
yR   Y tjS w tjS )NzBackend z= is not supported for vit attention. Supported backends are: zUsing backend z for vit attention   )r   rt   r   rT   r   r   r#   r   supports_head_sizesupports_dtyper   r   )rM   r   r   rp   ccr   r&   r&   r'   get_vit_attn_backend  s4   
z%CudaPlatformBase.get_vit_attn_backendc                 C      dS )Nz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPUr&   rZ   r&   r&   r'   get_punica_wrapper  rU   z#CudaPlatformBase.get_punica_wrapperc                 C   s   	 dS )NzHvllm.distributed.device_communicators.cuda_communicator.CudaCommunicatorr&   rZ   r&   r&   r'   get_device_communicator_cls  s   z,CudaPlatformBase.get_device_communicator_clsc                 C   s
   |  dS )NY   )rC   rZ   r&   r&   r'   supports_fp8  s   
zCudaPlatformBase.supports_fp8c                 C   r   NTr&   rZ   r&   r&   r'   use_custom_allreduce  rU   z%CudaPlatformBase.use_custom_allreducec                 C   r   r   r&   rZ   r&   r&   r'   opaque_attention_op  rU   z$CudaPlatformBase.opaque_attention_opc                 C   r   )Nz,vllm.compilation.cuda_graph.CUDAGraphWrapperr&   rZ   r&   r&   r'   get_static_graph_wrapper_cls  rU   z-CudaPlatformBase.get_static_graph_wrapper_clsc                 C   s   t  S r,   r
   rZ   r&   r&   r'   device_count  s   zCudaPlatformBase.device_countc                 C   sb   |t jkr-| ds/|  }|  }|d u rd}n	| }d| }td| d| dd S d S )NrA   z"does not have a compute capabilityzhas compute capability zQBfloat16 is only supported on GPUs with compute capability of at least 8.0. Your z GPU zg. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.)rD   rE   rC   rT   rV   as_version_strr   )rM   r   
capabilitygpu_namecompute_strversion_strr&   r&   r'   check_if_supports_dtype  s"   


z(CudaPlatformBase.check_if_supports_dtype	src_cache	dst_cachesrc_block_indicesdst_block_indicesc                 C   s,   |dd|f }| |j|dd|f< dS )z/Copy blocks from src_cache to dst_cache on GPU.N)torJ   rM   r   r   r   r   
_src_cacher&   r&   r'   insert_blocks_to_device  s   	z(CudaPlatformBase.insert_blocks_to_devicec                 C   s(   |dd|f }|  |dd|f< dS )z#Copy blocks from GPU to host (CPU).N)cpur   r&   r&   r'   swap_out_blocks_to_host  s   	z(CudaPlatformBase.swap_out_blocks_to_hostc                 C   r   r   r&   rZ   r&   r&   r'   support_hybrid_kv_cache  rU   z(CudaPlatformBase.support_hybrid_kv_cachec                 C   r   r   r&   rZ   r&   r&   r'   support_static_graph_mode  rU   z*CudaPlatformBase.support_static_graph_moder   )r\   r   r   Nr,   )7__name__
__module____qualname__r   r7   _enumr5   str__annotations__r6   r8   r:   r<   r>   r@   listpropertyrD   r   rI   classmethodrJ   rK   intr   rT   rV   rW   boolrY   r[   r   typesDevicefloatr   tupledictr   r   r   r   r   r   r   r   r   r   r   r   Tensorr   r   r   r   r&   r&   r&   r'   r3   a   s   
 

q
Fr3   c                	       s   e Zd ZeeeddededB fddZee	dde	eef eB dede
f fdd	Zeeddedefd
dZeeddedefddZeeddedefddZeedee de
fddZeddedefddZeedd Z  ZS )NvmlCudaPlatformr   rO   r   Nc                 C   sF   z|  |}t|}t|\}}t||dW S  ty"   Y d S w N)r   minor)device_id_to_physical_device_idr-   nvmlDeviceGetHandleByIndex"nvmlDeviceGetCudaComputeCapabilityr   RuntimeError)rM   rO   physical_device_idhandler   r   r&   r&   r'   rT     s   

z&NvmlCudaPlatform.get_device_capabilityr   c                    s&   zt  ||W S  ty   Y dS w )NF)superrC   r   )rM   r   rO   	__class__r&   r'   rC      s
   z&NvmlCudaPlatform.has_device_capabilityc                 C   s   |  |}| |S r,   )r   _get_physical_device_name)rM   rO   r   r&   r&   r'   rV     s   

z NvmlCudaPlatform.get_device_namec                 C   s   |  |}t|}t|S r,   )r   r-   r   nvmlDeviceGetUUIDrM   rO   r   r   r&   r&   r'   get_device_uuid  s   


z NvmlCudaPlatform.get_device_uuidc                 C   s$   |  |}t|}tt|jS r,   )r   r-   r   r   nvmlDeviceGetMemoryInfototalr   r&   r&   r'   rW     s   

z(NvmlCudaPlatform.get_device_total_memoryphysical_device_idsc              
   C   s   dd |D }t |D ]8\}}t |D ]/\}}||k rBzt||tj}|tjkr.W   dS W q tjyA   td Y   dS w qqdS )zP
        query if the set of gpus are fully connected by nvlink (1 hop)
        c                 S   s   g | ]}t |qS r&   )r-   r   r   r   r&   r&   r'   
<listcomp>&      z7NvmlCudaPlatform.is_fully_connected.<locals>.<listcomp>FzONVLink detection failed. This is normal if your machine has no NVLink equipped.T)r   r-   nvmlDeviceGetP2PStatusNVML_P2P_CAPS_INDEX_NVLINKNVML_P2P_STATUS_OK	NVMLErrorrt   	exception)rM   r   handlesr   r   jpeer_handle
p2p_statusr&   r&   r'   rY      s,   


z#NvmlCudaPlatform.is_fully_connectedc                 C   s   t |}t |S r,   )r-   r   nvmlDeviceGetName)rM   rO   r   r&   r&   r'   r   :  s   

z*NvmlCudaPlatform._get_physical_device_namec                    sh   t  }|dkr. fddt|D }tt|dkr0tjddkr2t	dd
| d S d S d S d S )Nr   c                    s   g | ]}  |qS r&   )r   r   rZ   r&   r'   r   D  r  z1NvmlCudaPlatform.log_warnings.<locals>.<listcomp>CUDA_DEVICE_ORDER
PCI_BUS_IDzDetected different devices in the system: %s. Please make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to avoid unexpected behavior.r   )r-   nvmlDeviceGetCountr   r   setosenvirongetrt   rz   r   )rM   rX   device_namesr&   rZ   r'   r[   ?  s   zNvmlCudaPlatform.log_warningsr   )r   r   r   r   r   r2   r   r   rT   r   r   rC   r   rV   r   rW   r   rY   r   r[   __classcell__r&   r&   r   r'   r     s@    	
r   c                   @   sr   e Zd ZeeddedefddZeddedefddZ	eddedefdd	Z
ed
ee defddZdS )NonNvmlCudaPlatformr   rO   r   c                 C   s   t j|\}}t||dS r   )rD   r4   rT   r   )rM   rO   r   r   r&   r&   r'   rT   R  s   z)NonNvmlCudaPlatform.get_device_capabilityc                 C   s   t j|S r,   )rD   r4   rV   rS   r&   r&   r'   rV   X  s   z#NonNvmlCudaPlatform.get_device_namec                 C   s   t j|}|jS r,   )rD   r4   get_device_propertiestotal_memory)rM   rO   device_propsr&   r&   r'   rW   \  s   z+NonNvmlCudaPlatform.get_device_total_memoryr   c                 C   s   t d dS )Nz^NVLink detection not possible, as context support was not found. Assuming no NVLink available.F)rt   r  )rM   r   r&   r&   r'   rY   a  s   z&NonNvmlCudaPlatform.is_fully_connectedNr   )r   r   r   r   r   r   r   rT   r   rV   rW   r   r   rY   r&   r&   r&   r'   r  Q  s    r  T)6__doc__r  collections.abcr   	functoolsr   r   typingr   r   rD   typing_extensionsr   vllm._Cvllmvllm.loggerr   vllm.utils.import_utilsr	   vllm.utils.torch_utilsr   rf   r   	interfacer   r   r   vllm.configr   vllm.config.cacher   vllm.v1.attention.selectorr   r   rt   r   r   r-   backendsr4   enable_cudnn_sdpr   r   r(   r2   r3   r   r  nvml_availabler.   	Exceptionr/   CudaPlatformr[   r&   r&   r&   r'   <module>   sp   "(   ^
