import contextlib
import os
from typing import TYPE_CHECKING, Optional

import torch

from vllm.logger import init_logger
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from .interface import DeviceCapability, Platform, PlatformEnum

if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.v1.attention.selector import AttentionSelectorConfig
else:
    VllmConfig = None

logger = init_logger(__name__)


class XPUPlatform(Platform):
    _enum = PlatformEnum.XPU
    device_name: str = "xpu"
    device_type: str = "xpu"
    dispatch_key: str = "XPU"
    ray_device_key: str = "GPU"
    dist_backend: str = "ccl"
    device_control_env_var: str = "ZE_AFFINITY_MASK"

    @classmethod
    def import_kernels(cls) -> None:
        with contextlib.suppress(ImportError):
            import vllm._moe_C  # noqa: F401

    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: AttentionBackendEnum,
        attn_selector_config: "AttentionSelectorConfig",
    ) -> str:
        from vllm.v1.attention.backends.utils import set_kv_cache_layout

        # XPU attention kernels only support the NHD KV-cache layout.
        set_kv_cache_layout("NHD")
        logger.info(
            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; only NHD layout "
            "is supported by XPU attention kernels."
        )
        dtype = attn_selector_config.dtype
        if attn_selector_config.use_sparse:
            raise NotImplementedError("Sparse Attention is not supported on XPU.")
        if selected_backend == AttentionBackendEnum.TRITON_ATTN:
            logger.info_once("Using Triton backend.")
            return AttentionBackendEnum.TRITON_ATTN.get_path()
        if dtype == torch.float32:
            logger.warning_once(
                "Flash Attention on XPU does not support float32 dtype. "
                "Falling back to Triton Attention backend."
            )
            return AttentionBackendEnum.TRITON_ATTN.get_path()
        if selected_backend == AttentionBackendEnum.FLASH_ATTN:
            logger.info_once("Using Flash Attention backend.")
            return AttentionBackendEnum.FLASH_ATTN.get_path()
        if selected_backend:
            raise ValueError(
                f"Invalid attention backend for {cls.device_name}, "
                f"with use_mla: {attn_selector_config.use_mla}"
            )
        # Default when no backend was explicitly selected.
        logger.info_once("Using Flash Attention backend.")
        return AttentionBackendEnum.FLASH_ATTN.get_path()

    @classmethod
    def get_supported_vit_attn_backends(cls) -> list[AttentionBackendEnum]:
        return [AttentionBackendEnum.TRITON_ATTN]

    @classmethod
    def get_vit_attn_backend(
        cls,
        head_size: int,
        dtype: torch.dtype,
        backend: AttentionBackendEnum | None = None,
    ) -> AttentionBackendEnum:
        if backend is not None:
            assert backend in cls.get_supported_vit_attn_backends(), (
                f"Backend {backend} is not supported for vit attention. "
                f"Supported backends are: {cls.get_supported_vit_attn_backends()}."
            )
            logger.info_once(f"Using backend {backend} for vit attention")
            return backend
        logger.info_once(
            f"Using backend {AttentionBackendEnum.TRITON_ATTN} for vit attention"
        )
        return AttentionBackendEnum.TRITON_ATTN

    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """
        Set the device for the current platform.
        """
        torch.xpu.set_device(device)

    @classmethod
    def get_device_capability(
        cls, device_id: int = 0
    ) -> Optional[DeviceCapability]:
        return None

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.xpu.get_device_name(device_id)

    @classmethod
    def get_punica_wrapper(cls) -> str:
        xpu_use_triton_kernel = os.getenv("XPU_USE_TRITON_KERNEL", "0") == "1"
        if not xpu_use_triton_kernel:
            return "vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPU"
        return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        device_props = torch.xpu.get_device_properties(device_id)
        return device_props.total_memory

    @classmethod
    def inference_mode(cls):
        return torch.no_grad()
d ur6|j	|_|jd ur@dtjd< |j}|jdkrKd|_|jd urTd	|j_|ro|jrqtd
 d|j_t|jj|jj|j_d S d S d S )N@   r   )CompilationModeCUDAGraphModez%CUDA graph mode should be NONE on XPU16IGC_ForceOCLSIMDWidthautoz#vllm.v1.worker.xpu_worker.XPUWorkerTz`MLA is enabled on a non-GPU platform; forcing chunked prefill and prefix caching to be disabled.F)cache_configmodel_config
block_sizevllm.configrS   rT   compilation_configcompile_sizescudagraph_modeNONElora_configmodespeculative_configrG   environparallel_config
worker_clskv_transfer_configenable_permute_local_kvr4   r'   r(   scheduler_configenable_chunked_prefillmaxmax_model_lenDEFAULT_MAX_NUM_BATCHED_TOKENSmax_num_batched_tokens)r   rQ   rX   rY   rS   rT   r\   rd   r   r   r    check_and_update_config   s<   







    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        return True

    @classmethod
    def support_static_graph_mode(cls) -> bool:
        return False

    @classmethod
    def is_pin_memory_available(cls):
        return True

    @classmethod
    def get_current_memory_usage(
        cls, device: torch.types.Device | None = None
    ) -> float:
        torch.xpu.reset_peak_memory_stats(device)
        return torch.xpu.max_memory_allocated(device)

    @classmethod
    def fp8_dtype(cls) -> torch.dtype:
        return torch.float8_e5m2

    @classmethod
    def is_data_center_gpu(cls) -> bool:
        device_name = cls.get_device_name().lower()
        return device_name.count("data center gpu") > 0

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        return (
            "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator"
        )

    @classmethod
    def device_count(cls) -> int:
        return torch.xpu.device_count()

    @classmethod
    def check_if_supports_dtype(cls, dtype: torch.dtype):
        if dtype == torch.bfloat16:
            device_name = cls.get_device_name().lower()
            if device_name.count("a770") > 0:
                raise ValueError(
                    "Intel Arc A770 have bfloat16 accuracy known issue. "
                    "You can use float16 instead by explicitly setting the "
                    "`dtype` flag in CLI, for example: --dtype=half."
                )

    @classmethod
    def opaque_attention_op(cls) -> bool:
        return True

    @classmethod
    def insert_blocks_to_device(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from src_cache to dst_cache on XPU."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)

    @classmethod
    def swap_out_blocks_to_host(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from XPU to host (CPU)."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.cpu()