o
    i'                     @   s   d dl Z d dlZd dlmZ d dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ ddlmZmZmZ erCd dlmZ d dlmZ ndZe
eZG d	d
 d
eZdS )    N)TYPE_CHECKING)init_logger)AttentionBackendEnum   )DeviceCapabilityPlatformPlatformEnum)
VllmConfig)AttentionSelectorConfigc                   @   sn  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< dZeed	< d
Zeed< edLddZedddddefddZeded fddZe	dMdedejddddfddZedejddfdd Ze	!dNd"ededB fd#d$ZedNd"edefd%d&Zedefd'd(ZedNd"edefd)d*Zed+d, Zed-eddfd.d/Z ede!fd0d1Z"ede!fd2d3Z#ed4d5 Z$e	dMdej%j&dB de'fd6d7Z(edejfd8d9Z)ede!fd:d;Z*edefd<d=Z+edefd>d?Z,edejfd@dAZ-ede!fdBdCZ.edDej/dEej/dFej/dGej/ddf
dHdIZ0edDej/dEej/dFej/dGej/ddf
dJdKZ1dS )OXPUPlatformxpudevice_namedevice_typeXPUdispatch_keyGPUray_device_keyxccldist_backendZE_AFFINITY_MASKdevice_control_env_varreturnNc                 C   s8   t t dd l}W d    d S 1 sw   Y  d S )Nr   )
contextlibsuppressImportErrorvllm._moe_C)clsvllm r   H/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/platforms/xpu.pyimport_kernels(   s   
"zXPUPlatform.import_kernelsselected_backendr   attn_selector_configr
   c                 C   s   ddl m} |d td |j}|jrtd|jr&td t	j
 S |t	jkr5td t	j S |tjkrDtd t	j S |t	jkrStd	 t	j S |ratd
| j d|j td	 t	j S )Nr   )set_kv_cache_layoutNHDzeSetting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; only NHD layout is supported by XPU attention kernels.z)Sparse Attention is not supported on XPU.z&Using Triton MLA backend on V1 engine.zUsing Triton backend.z`Flash Attention on XPU does not support float32 dtype. Falling back to Triton Attention backend.zUsing Flash Attention backend.zInvalid attention backend for z, with use_mla: ) vllm.v1.attention.backends.utilsr#   loggerinfodtype
use_sparseNotImplementedErroruse_mla	info_oncer   
TRITON_MLAget_pathTRITON_ATTNtorchfloat32warning_once
FLASH_ATTN
ValueErrorr   )r   r!   r"   r#   r(   r   r   r   get_attn_backend_cls.   s<   












z XPUPlatform.get_attn_backend_clsc                 C   s   t jt jgS N)r   r3   
TORCH_SDPAr   r   r   r   get_supported_vit_attn_backendsW   s   z+XPUPlatform.get_supported_vit_attn_backends	head_sizer(   backendzAttentionBackendEnum | Nonec                 C   s^   |d ur"||   v sJ d| d|    dtd| d |S tdtj d tjS )NzBackend z= is not supported for vit attention. Supported backends are: .zUsing backend z for vit attention)r9   r&   r,   r   r3   )r   r:   r(   r;   r   r   r   get_vit_attn_backend^   s   z XPUPlatform.get_vit_attn_backenddevicec                 C   s   t j| dS )z:
        Set the device for the current platform.
        N)r0   r   
set_devicer   r>   r   r   r   r?   s   s   zXPUPlatform.set_devicer   	device_idc                 C   s   d S r6   r   r   rA   r   r   r   get_device_capabilityz   s   z!XPUPlatform.get_device_capabilityc                 C   s   t j|S r6   )r0   r   get_device_namerB   r   r   r   rD      s   zXPUPlatform.get_device_namec                 C   s   t dddk}|sdS dS )NXPU_USE_TRITON_KERNEL01z4vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPUz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU)osgetenv)r   xpu_use_triton_kernelr   r   r   get_punica_wrapper   s   zXPUPlatform.get_punica_wrapperc                 C   s   t j|}|jS r6   )r0   r   get_device_propertiestotal_memory)r   rA   device_propsr   r   r   get_device_total_memory   s   z#XPUPlatform.get_device_total_memoryc                 C   s   t  S r6   )r0   no_gradr8   r   r   r   inference_mode   s   zXPUPlatform.inference_modevllm_configc                 C   s   |j }|j}|r|jd u rd|_ddlm}m} |j}|jd u r#g |_|j|j	ks-J d|j
d ur6|j	|_|jd ur@dtjd< |j}|jdkrKd|_|jd urTd	|j_|ro|jrqtd
 d|j_t|jj|jj|j_d S d S d S )N@   r   )CompilationModeCUDAGraphModez%CUDA graph mode should be NONE on XPU16IGC_ForceOCLSIMDWidthautoz#vllm.v1.worker.xpu_worker.XPUWorkerTz`MLA is enabled on a non-GPU platform; forcing chunked prefill and prefix caching to be disabled.F)cache_configmodel_config
block_sizevllm.configrT   rU   compilation_configcompile_sizescudagraph_modeNONElora_configmodespeculative_configrH   environparallel_config
worker_clskv_transfer_configenable_permute_local_kvr+   r&   r'   scheduler_configenable_chunked_prefillmaxmax_model_lenDEFAULT_MAX_NUM_BATCHED_TOKENSmax_num_batched_tokens)r   rR   rY   rZ   rT   rU   r]   re   r   r   r   check_and_update_config   s<   






z#XPUPlatform.check_and_update_configc                 C      dS NTr   r8   r   r   r   support_hybrid_kv_cache      z#XPUPlatform.support_hybrid_kv_cachec                 C   rp   )NFr   r8   r   r   r   support_static_graph_mode   rs   z%XPUPlatform.support_static_graph_modec                 C   rp   rq   r   r8   r   r   r   is_pin_memory_available   rs   z#XPUPlatform.is_pin_memory_availablec                 C   s   t j| t j|S r6   )r0   r   reset_peak_memory_statsmax_memory_allocatedr@   r   r   r   get_current_memory_usage   s   z$XPUPlatform.get_current_memory_usagec                 C   s   t jS r6   )r0   float8_e4m3fnr8   r   r   r   	fp8_dtype   s   zXPUPlatform.fp8_dtypec                 C   s   |    }|ddkS )Nzdata center gpur   )rD   lowercount)r   r   r   r   r   is_data_center_gpu   s   zXPUPlatform.is_data_center_gpuc                 C   s    ddl m} | std dS )Nr   )supports_xcclzHxccl is not enabled in this torch build, communication is not available.zFvllm.distributed.device_communicators.xpu_communicator.XpuCommunicator)vllm.utils.torch_utilsr~   r&   warning)r   r~   r   r   r   get_device_communicator_cls   s   z'XPUPlatform.get_device_communicator_clsc                 C   s
   t j S r6   )r0   r   device_countr8   r   r   r   r      s   
zXPUPlatform.device_countc                 C   s4   |t jkr|   }|ddkrtdd S d S )Na770r   zIntel Arc A770 have bfloat16 accuracy known issue. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.)r0   bfloat16rD   r{   r|   r4   )r   r(   r   r   r   r   check_if_supports_dtype   s   
z#XPUPlatform.check_if_supports_dtypec                 C   rp   rq   r   r8   r   r   r   opaque_attention_op   rs   zXPUPlatform.opaque_attention_op	src_cache	dst_cachesrc_block_indicesdst_block_indicesc                 C   s,   |dd|f }| |j|dd|f< dS )z/Copy blocks from src_cache to dst_cache on XPU.N)tor>   r   r   r   r   r   
_src_cacher   r   r   insert_blocks_to_device   s   	z#XPUPlatform.insert_blocks_to_devicec                 C   s(   |dd|f }|  |dd|f< dS )z#Copy blocks from XPU to host (CPU).N)cpur   r   r   r   swap_out_blocks_to_host  s   	z#XPUPlatform.swap_out_blocks_to_host)r   Nr6   )r   )2__name__
__module____qualname__r   r   _enumr   str__annotations__r   r   r   r   r   classmethodr    r5   listr9   intr0   r(   r=   r>   r?   r   rC   rD   rK   rO   rQ   r	   ro   boolrr   rt   ru   typesDevicefloatrx   rz   r}   r   r   r   r   Tensorr   r   r   r   r   r   r      s   
 (
+


r   )r   rH   typingr   r0   vllm_xpu_kernels._Cvllm_xpu_kernelsvllm_xpu_kernels._moe_Cvllm_xpu_kernels._xpu_Cvllm.loggerr   #vllm.v1.attention.backends.registryr   	interfacer   r   r   r\   r	   vllm.v1.attention.selectorr
   r   r&   r   r   r   r   r   <module>   s    