o
    -iZ                     @   s  d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	m
Z
 ddlZddlmZ ddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ercddlmZ ddlmZ ddl m!Z! ndZdZee"Z#edZ$e
dZ%e Z&ej'j()d ede*dede+e fddZ,dee$e%f dee$e%f fddZ-G dd deZ.G dd de.Z/G dd  d e.Z0dZ1zze&2  d!Z1W n e3y   dZ1Y nw W e1re&4  ne1re&4  w w e1re/ne0Z5e56  dS )"z~Code inside this file can safely assume cuda platform, e.g. importing
pynvml. However, it should not initialize cuda context.
    N)Callable)cachewraps)TYPE_CHECKINGOptionalTypeVar)	ParamSpec)init_logger)import_pynvmlcuda_device_count_statelessAttentionBackendEnum   )DeviceCapabilityPlatformPlatformEnum)
VllmConfig)
CacheDType)AttentionSelectorConfig_P_RFuse_mladevice_capabilityreturnc                 C   st   | r!|j dkrtjtjtjtjtjtjgS tjtjtjtjtjgS |j dkr0tjtj	tj
tjgS tj	tjtj
tjgS )zEGet backend priorities with lazy import to avoid circular dependency.
   )majorr   FLASHINFER_MLACUTLASS_MLAFLASH_ATTN_MLAFLASHMLA
TRITON_MLAFLASHMLA_SPARSE
FLASHINFER
FLASH_ATTNTRITON_ATTNFLEX_ATTENTION)r   r    r'   P/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/platforms/cuda.py_get_backend_priorities,   s4   


r)   fnc                    s*   t  dtjdtjdtf fdd}|S )Nargskwargsr   c                     s,   t   z | i |W t   S t   w N)pynvmlnvmlInitnvmlShutdown)r+   r,   r*   r'   r(   wrapperV   s   z"with_nvml_context.<locals>.wrapper)r   r   r+   r,   r   )r*   r2   r'   r1   r(   with_nvml_contextU   s    r3   c                   @   s  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< dZeed	< d
Zeed< edeej fddZedejddfddZedRdededB fddZedRdedefddZedRdedefddZedee defddZedd ZedSd"d#Ze	dTdejj dB de!fd$d%Z"ed&ed'd(de#ee#d)ef  e$d)ee f f fd*d+Z%ed,d)d'd(defd-d.Z&eded) fd/d0Z'e	dTd1ed2ejd3e(d) dd)fd4d5Z)edefd6d7Z*edefd8d9Z+edefd:d;Z,edefd<d=Z-edefd>d?Z.edefd@dAZ/edefdBdCZ0ed2ejfdDdEZ1edFej2dGej2dHej2dIej2ddf
dJdKZ3edFej2dGej2dHej2dIej2ddf
dLdMZ4edefdNdOZ5edefdPdQZ6dS )UCudaPlatformBasecudadevice_namedevice_typeCUDAdispatch_keyGPUray_device_keynccldist_backendCUDA_VISIBLE_DEVICESdevice_control_env_varr   c                 C   s8   |  drtjtjtjgS |  drtjtjgS tjgS )NP   <   )has_device_capabilitytorchbfloat16float16float32)selfr'   r'   r(   supported_dtypesj   s
   

z!CudaPlatformBase.supported_dtypesdeviceNc                 C   s   t j| t jd|d}dS )z:
        Set the device for the current platform.
        r   )rI   N)rC   r5   
set_devicezeros)clsrI   _r'   r'   r(   rJ   v   s   zCudaPlatformBase.set_devicer   	device_idc                 C      t r-   NotImplementedErrorrL   rN   r'   r'   r(   get_device_capability      z&CudaPlatformBase.get_device_capabilityc                 C   rO   r-   rP   rR   r'   r'   r(   get_device_name   rT   z CudaPlatformBase.get_device_namec                 C   rO   r-   rP   rR   r'   r'   r(   get_device_total_memory   rT   z(CudaPlatformBase.get_device_total_memory
device_idsc                 C   rO   r-   rP   )rL   rW   r'   r'   r(   is_fully_connected   rT   z#CudaPlatformBase.is_fully_connectedc                 C   s   d S r-   r'   rL   r'   r'   r(   log_warnings   rT   zCudaPlatformBase.log_warningsvllm_configr   c                 C   s  ddl m} |j}|j}|jdkrd|_|j}|r!|jd u r!d|_|d ur|jr|jd urt|jj	d}d}d}d}	ddl
m}
 |jjd u rv|j}t|d	d
}| drb|sb|dkrbd}	|j|j_n'| drl|sld}n|
 d rtd}n	 n|jj}||jk}||jk}||jk}	|r|
 d r|jd dkrd|_td |r|jd dkrd|_td |	r|jdkr|jd dkrd|_td |r|jdkrd|_td |j}|d ur|jr|jr|jstd d|_d S d S d S d S d S )Nr   r   autoz vllm.v1.worker.gpu_worker.Worker   
index_topkF)is_flashmla_dense_supportedqk_nope_head_dimr   d      T@   z7Forcing kv cache block size to 64 for FlashMLA backend.z;Forcing kv cache block size to 128 for CUTLASS_MLA backend.    z<Forcing kv cache block size to 64 for FlashInferMLA backend.z=Forcing kv cache block size to 64 for FlashMLASparse backend.zVForcing --disable_chunked_mm_input for models with multimodal-bidirectional attention.)#vllm.v1.attention.backends.registryr   parallel_configmodel_config
worker_clscache_config
block_sizer   hasattr	hf_configvllm.v1.attention.ops.flashmlar_   attention_configbackendhf_text_configgetattris_device_capability_familyr   r    r   loggerinfoscheduler_configis_mm_prefix_lmis_multimodal_modeldisable_chunked_mm_inputwarning)rL   r[   r   rf   rg   ri   
use_sparseuse_flashmlause_cutlass_mlause_flashinfer_mlar_   rp   r`   ro   ru   r'   r'   r(   check_and_update_config   s   
	







z(CudaPlatformBase.check_and_update_configc                 C   s"   t j  t j| t j|S r-   )rC   r5   empty_cachereset_peak_memory_statsmax_memory_allocated)rL   rI   r'   r'   r(   get_current_memory_usage  s   
z)CudaPlatformBase.get_current_memory_usager   attn_selector_configr   r   c           
   	   C   s   g }i }t |j|}t|D ]1\}}z| }|jdd|i| }	W n ty0   dg}	Y nw |	r8|	||< q|||f q||fS )Nr   ImportErrorr'   )r)   r   	enumerate	get_classvalidate_configuration_asdictr   append)
rL   r   r   valid_backends_prioritiesinvalid_reasonsbackend_prioritiespriorityro   backend_classinvalid_reasons_ir'   r'   r(   get_valid_backends  s(   	


z#CudaPlatformBase.get_valid_backendsselected_backendc           
   	      sp  |   }|d us
J |jd d}|d urIz| }|jdd|i| }W n ty2   dg}Y nw |r?td| d| td| |	 S | j
||d\ }dd	d
d | D  d }| }td| j d| d| d t dkrtd| j d| d| dttt  fddd}|d }	 |	 d }tjd|jtdd  D dd |	 S )N)rj   r   r   zSelected backend z. is not valid for this configuration. Reason: zUsing %s backend.)r   r   {, c                 s   s,    | ]\}}|j  d d| dV  qdS )z: [r   ]N)namejoin).0ro   reasonsr'   r'   r(   	<genexpr>S  s
    
z8CudaPlatformBase.get_attn_backend_cls.<locals>.<genexpr>}z*Some attention backends are not valid for z with z. Reasons: .r   z%No valid attention backend found for c                    s    |  d S )Nr   r'   )ir   r'   r(   <lambda>h  s    z7CudaPlatformBase.get_attn_backend_cls.<locals>.<lambda>)keyz8Using %s attention backend out of potential backends: %sc                 s   s    | ]}|d  j V  qdS )r   N)r   )r   br'   r'   r(   r   o  s    local)scoper'   )rS   _replacer   r   r   r   
ValueErrorrs   rt   get_pathr   r   items__repr__
debug_oncer6   lensortedrange	info_oncer   tuple)
rL   r   r   r   r   r   reasons_str
config_strsorted_indicesselected_indexr'   r   r(   get_attn_backend_cls.  s~   







z%CudaPlatformBase.get_attn_backend_clsc                 C   s   t jt jgS r-   )r   
TORCH_SDPAr$   rY   r'   r'   r(   get_supported_vit_attn_backendsu  s   z0CudaPlatformBase.get_supported_vit_attn_backends	head_sizedtypero   c                 C   s   |d ur!||   v sJ d| d|    td| d |S |   }rS|jdkrSztj }||r@|	|rDtjW S W tjS W tjS  t
yR   Y tjS w tjS )NzBackend z= is not supported for vit attention. Supported backends are: zUsing backend z for vit attention   )r   rs   r   rS   r   r   r$   r   supports_head_sizesupports_dtyper   r   )rL   r   r   ro   ccr   r'   r'   r(   get_vit_attn_backend|  s4   
z%CudaPlatformBase.get_vit_attn_backendc                 C      dS )Nz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPUr'   rY   r'   r'   r(   get_punica_wrapper  rT   z#CudaPlatformBase.get_punica_wrapperc                 C   s   	 dS )NzHvllm.distributed.device_communicators.cuda_communicator.CudaCommunicatorr'   rY   r'   r'   r(   get_device_communicator_cls  s   z,CudaPlatformBase.get_device_communicator_clsc                 C   s
   |  dS )NY   )rB   rY   r'   r'   r(   supports_fp8  s   
zCudaPlatformBase.supports_fp8c                 C   r   NTr'   rY   r'   r'   r(   use_custom_allreduce  rT   z%CudaPlatformBase.use_custom_allreducec                 C   r   r   r'   rY   r'   r'   r(   opaque_attention_op  rT   z$CudaPlatformBase.opaque_attention_opc                 C   r   )Nz,vllm.compilation.cuda_graph.CUDAGraphWrapperr'   rY   r'   r'   r(   get_static_graph_wrapper_cls  rT   z-CudaPlatformBase.get_static_graph_wrapper_clsc                 C   s   t  S r-   r   rY   r'   r'   r(   device_count  s   zCudaPlatformBase.device_countc                 C   sb   |t jkr-| ds/|  }|  }|d u rd}n	| }d| }td| d| dd S d S )Nr@   z"does not have a compute capabilityzhas compute capability zQBfloat16 is only supported on GPUs with compute capability of at least 8.0. Your z GPU zg. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.)rC   rD   rB   rS   rU   as_version_strr   )rL   r   
capabilitygpu_namecompute_strversion_strr'   r'   r(   check_if_supports_dtype  s"   


z(CudaPlatformBase.check_if_supports_dtype	src_cache	dst_cachesrc_block_indicesdst_block_indicesc                 C   s,   |dd|f }| |j|dd|f< dS )z/Copy blocks from src_cache to dst_cache on GPU.N)torI   rL   r   r   r   r   
_src_cacher'   r'   r(   insert_blocks_to_device  s   	z(CudaPlatformBase.insert_blocks_to_devicec                 C   s(   |dd|f }|  |dd|f< dS )z#Copy blocks from GPU to host (CPU).N)cpur   r'   r'   r(   swap_out_blocks_to_host  s   	z(CudaPlatformBase.swap_out_blocks_to_hostc                 C   r   r   r'   rY   r'   r'   r(   support_hybrid_kv_cache  rT   z(CudaPlatformBase.support_hybrid_kv_cachec                 C   r   r   r'   rY   r'   r'   r(   support_static_graph_mode  rT   z*CudaPlatformBase.support_static_graph_moder   )r[   r   r   Nr-   )7__name__
__module____qualname__r   r8   _enumr6   str__annotations__r7   r9   r;   r=   r?   propertylistrC   r   rH   classmethodrI   rJ   intr   rS   rU   rV   boolrX   rZ   r~   typesDevicefloatr   r   dictr   r   r   r   r   r   r   r   r   r   r   r   r   Tensorr   r   r   r   r'   r'   r'   r(   r4   a   s   
 

q
Fr4   c                	       s   e Zd ZeeeddededB fddZee	dde	eef eB dede
f fdd	Zeeddedefd
dZeeddedefddZeeddedefddZeedee de
fddZeddedefddZeedd Z  ZS )NvmlCudaPlatformr   rN   r   Nc                 C   sF   z|  |}t|}t|\}}t||dW S  ty"   Y d S w N)r   minor)device_id_to_physical_device_idr.   nvmlDeviceGetHandleByIndex"nvmlDeviceGetCudaComputeCapabilityr   RuntimeError)rL   rN   physical_device_idhandler   r   r'   r'   r(   rS     s   

z&NvmlCudaPlatform.get_device_capabilityr   c                    s&   zt  ||W S  ty   Y dS w )NF)superrB   r   )rL   r   rN   	__class__r'   r(   rB     s
   z&NvmlCudaPlatform.has_device_capabilityc                 C   s   |  |}| |S r-   )r   _get_physical_device_name)rL   rN   r   r'   r'   r(   rU   	  s   

z NvmlCudaPlatform.get_device_namec                 C   s   |  |}t|}t|S r-   )r   r.   r   nvmlDeviceGetUUIDrL   rN   r   r   r'   r'   r(   get_device_uuid  s   


z NvmlCudaPlatform.get_device_uuidc                 C   s$   |  |}t|}tt|jS r-   )r   r.   r   r   nvmlDeviceGetMemoryInfototalr   r'   r'   r(   rV     s   

z(NvmlCudaPlatform.get_device_total_memoryphysical_device_idsc              
   C   s   dd |D }t |D ]8\}}t |D ]/\}}||k rBzt||tj}|tjkr.W   dS W q tjyA   td Y   dS w qqdS )zP
        query if the set of gpus are fully connected by nvlink (1 hop)
        c                 S   s   g | ]}t |qS r'   )r.   r   r   r   r'   r'   r(   
<listcomp>#      z7NvmlCudaPlatform.is_fully_connected.<locals>.<listcomp>FzONVLink detection failed. This is normal if your machine has no NVLink equipped.T)r   r.   nvmlDeviceGetP2PStatusNVML_P2P_CAPS_INDEX_NVLINKNVML_P2P_STATUS_OK	NVMLErrorrs   	exception)rL   r   handlesr   r   jpeer_handle
p2p_statusr'   r'   r(   rX     s,   


z#NvmlCudaPlatform.is_fully_connectedc                 C   s   t |}t |S r-   )r.   r   nvmlDeviceGetName)rL   rN   r   r'   r'   r(   r   7  s   

z*NvmlCudaPlatform._get_physical_device_namec                    sh   t  }|dkr. fddt|D }tt|dkr0tjddkr2t	dd
| d S d S d S d S )Nr   c                    s   g | ]}  |qS r'   )r   r   rY   r'   r(   r   A  r   z1NvmlCudaPlatform.log_warnings.<locals>.<listcomp>CUDA_DEVICE_ORDER
PCI_BUS_IDzDetected different devices in the system: %s. Please make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to avoid unexpected behavior.r   )r.   nvmlDeviceGetCountr   r   setosenvirongetrs   ry   r   )rL   rW   device_namesr'   rY   r(   rZ   <  s   zNvmlCudaPlatform.log_warningsr   )r   r   r   r   r   r3   r   r   rS   r   r   rB   r   rU   r   rV   r   rX   r   rZ   __classcell__r'   r'   r   r(   r     s@    	
r   c                   @   sr   e Zd ZeeddedefddZeddedefddZ	eddedefdd	Z
ed
ee defddZdS )NonNvmlCudaPlatformr   rN   r   c                 C   s   t j|\}}t||dS r   )rC   r5   rS   r   )rL   rN   r   r   r'   r'   r(   rS   O  s   z)NonNvmlCudaPlatform.get_device_capabilityc                 C   s   t j|S r-   )rC   r5   rU   rR   r'   r'   r(   rU   U  s   z#NonNvmlCudaPlatform.get_device_namec                 C   s   t j|}|jS r-   )rC   r5   get_device_propertiestotal_memory)rL   rN   device_propsr'   r'   r(   rV   Y  s   z+NonNvmlCudaPlatform.get_device_total_memoryr   c                 C   s   t d dS )Nz^NVLink detection not possible, as context support was not found. Assuming no NVLink available.F)rs   r  )rL   r   r'   r'   r(   rX   ^  s   z&NonNvmlCudaPlatform.is_fully_connectedNr   )r   r   r   r   r   r   r   rS   r   rU   rV   r   r   rX   r'   r'   r'   r(   r  N  s    r  T)7__doc__r  collections.abcr   	functoolsr   r   typingr   r   r   rC   typing_extensionsr   vllm._Cvllmvllm.loggerr	   vllm.utils.import_utilsr
   vllm.utils.torch_utilsr   re   r   	interfacer   r   r   vllm.configr   vllm.config.cacher   vllm.v1.attention.selectorr   r   rs   r   r   r.   backendsr5   enable_cudnn_sdpr   r   r)   r3   r4   r   r  nvml_availabler/   	Exceptionr0   CudaPlatformrZ   r'   r'   r'   r(   <module>   sp   "(   ^
