o
    -ik_                     @   s  U d dl Z d dlmZmZmZ d dlmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ ddlmZmZmZ erKd d	lmZ d d
lmZ eeZzd dlmZmZmZmZm Z m!Z! W n e"yy Z# ze$de# W Y dZ#[#ndZ#[#ww zd dl%Z&W n e"y Z# ze$de# W Y dZ#[#ndZ#[#ww zd dl'Z&W n e"y Z# ze$de# W Y dZ#[#ndZ#[#ww g Z(e)e* e+d< i Z,e-e*e*f e+d< dddddddddd	Z.e-e*e*f e+d< de j/v re j/d Z0e j/1dd Z2re0e2ksJ ne0e j/d< dd Z3ede4fddZ5ede4fd d!Z6ede4fd"d#Z7ede4fd$d%Z8ede4fd&d'Z9e		d7d(ej:d)e;d*e;d+e;d,e;d-e;d.e*d/ej<dB d0ej<dB de4fd1d2Z=ede4fd3d4Z>G d5d6 d6eZ?dS )8    N)cache	lru_cachewraps)TYPE_CHECKINGOptional)init_loggercuda_device_count_stateless)AttentionBackendEnum   )DeviceCapabilityPlatformPlatformEnum)
VllmConfig)AttentionSelectorConfig)AmdSmiExceptionamdsmi_get_gpu_asic_infoamdsmi_get_processor_handlesamdsmi_initamdsmi_shut_downamdsmi_topo_get_link_typez$Failed to import from amdsmi with %rz%Failed to import from vllm._C with %rz*Failed to import from vllm._rocm_C with %r_ROCM_UNSUPPORTED_MODELS _ROCM_PARTIALLY_SUPPORTED_MODELSAMD_Instinct_MI300AAMD_Instinct_MI300XAMD_Instinct_MI308XAMD_Instinct_MI325XAMD_Instinct_MI300X_HFAMD_Radeon_RX7900XTX)	0x74a00x74a10x74b50x74a20x74a50x74b90x74a90x74bd0x744c_ROCM_DEVICE_ID_NAME_MAPHIP_VISIBLE_DEVICESCUDA_VISIBLE_DEVICESc                    s   t   fdd}|S )Nc                     s&   t   z | i |W t  S t  w N)r   r   )argskwargsfn P/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/platforms/rocm.pywrapperO   s   z$with_amdsmi_context.<locals>.wrapper)r   )r/   r2   r0   r.   r1   with_amdsmi_contextN   s   r3   returnc                      $   t jdj t fdddD S )Ncudac                 3       | ]}| v V  qd S r+   r0   .0archGPU_ARCHr0   r1   	<genexpr>]       zon_gfx1x.<locals>.<genexpr>gfx11gfx12torchr6   get_device_propertiesgcnArchNameanyr0   r0   r;   r1   on_gfx1xZ      rG   c                      r5   )Nr6   c                 3   r7   r+   r0   r8   r;   r0   r1   r=   c   r>   zon_mi3xx.<locals>.<genexpr>)gfx942gfx950rB   r0   r0   r;   r1   on_mi3xx`   rH   rK   c                      r5   )Nr6   c                 3   r7   r+   r0   r8   r;   r0   r1   r=   i   r>   zon_gfx9.<locals>.<genexpr>gfx90arI   rJ   rB   r0   r0   r;   r1   on_gfx9f   rH   rN   c                      r5   )Nr6   c                 3   r7   r+   r0   r8   r;   r0   r1   r=   o   r>   zon_gfx942.<locals>.<genexpr>)rI   rB   r0   r0   r;   r1   	on_gfx942l   rH   rO   c                      r5   )Nr6   c                 3   r7   r+   r0   r8   r;   r0   r1   r=   u   r>   zon_gfx950.<locals>.<genexpr>)rJ   rB   r0   r0   r;   r1   	on_gfx950r   rH   rP   qtype	head_size
block_size	gqa_ratiomax_seq_lensliding_windowkv_cache_dtypealibi_slopessinksc	                    s  t jdj t fdddD }	t fdddD }
|	rT|dks'|dkoS| t jkp0| t jkoS|d	kp8|d
koS|dkp@|dkoS|dkoH|dkoS|dkoStjoS|d u S |
o|dkp]|dko| t jkpg| t jko|d
ko|dko|dkow|dko|dko|d u o|dkotjo|d u S )Nr6   c                 3   r7   r+   r0   r8   r;   r0   r1   r=      r>   z2use_rocm_custom_paged_attention.<locals>.<genexpr>rL   c                 3   r7   r+   r0   r8   r;   r0   r1   r=      r>   r?   r   )rZ   @             r   i      auto)	rC   r6   rD   rE   rF   halfbfloat16envsVLLM_ROCM_CUSTOM_PAGED_ATTN)rQ   rR   rS   rT   rU   rV   rW   rX   rY   ON_GFX9ON_GFX11_GFX12r0   r;   r1   use_rocm_custom_paged_attentionx   sP   	
rg   c                  C   sv   t  sdS z+ddlm}  | dd u rW dS | dd u rW dS tjddkr.td W dS W d	S  ty:   Y dS w )
NFr   	find_spec
flash_attnz flash_attn.flash_attn_triton_amd!FLASH_ATTENTION_TRITON_AMD_ENABLETRUEz\Set FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE to enable Flash Attention Triton backend on RDNA.T)	rG   importlib.utilri   osenvirongetlogger	info_onceImportErrorrh   r0   r0   r1   flash_attn_triton_available   s"   rt   c                
       s  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< d	Zeed
< dZeed< g dZee ed< e s>edg7 ZedY fddZedddddefddZeded fddZe	dZdedejded ddfdd Zed!ejddfd"d#Zeed$d%d[d'ededB fd(d)Zeed*ee de fd+d,Z!eeed$d%d[d'edefd-d.Z"ed[d'edefd/d0Z#ed\d3d4Z$ed5eddfd6d7Z%ed8eddf fd9d:Z&edefd;d<Z'e	dZd!ej(j)dB de*fd=d>Z+edefd?d@Z,ede fdAdBZ-ede fdCdDZ.ede fdEdFZ/edejfdGdHZ0ede fdIdJZ1ede fdKdLZ2ede fdMdNZ3edefdOdPZ4edefdQdRZ5edejfdSdTZ6ede fdUdVZ7ede fdWdXZ8  Z9S )]RocmPlatformrocmdevice_namer6   device_typeCUDAdispatch_keyGPUray_device_keynccldist_backendr*   device_control_env_var)awq
awq_marlingptqgptq_marlinfp8zcompressed-tensors
fbgemm_fp8ggufquarkptpc_fp8mxfp4petit_nvfp4torchaosupported_quantizationbitsandbytesr4   Nc                    sJ   t    ddl}|t ddl}W d   dS 1 sw   Y  dS )zImport ROCm-specific kernels.r   N)superimport_kernels
contextlibsuppressrs   vllm._rocm_C)clsr   vllm	__class__r0   r1   r      s
   

"zRocmPlatform.import_kernelsselected_backendr
   attn_selector_configr   c                 C   s  ddl m} |j}|j}|jr,|r|drtd|dks"J dtd t	j
 S |jr|d u rA| s;|dkr>t	jnt	j}|t	jkr`|dkrTtd t	j S td	|j d
| d|t	jkrotd t	j S |t	jkr~td t	j S td	|j d|t	jkrtd t	j S |t	jkrtd t	j S |t	jkrtd t	j S |t	jkrt rtd t	j S td|j d|t	jkrtd t	j S |d u rHtjrtjrtd t	j S tjr	tjr	t r	td t	j S ddlm} | }|d ur&|jj r&td t	j S tjr>t r>tjdur>td t	j S td t	j S t!d|j d)Nr   rocm_aiter_opsr   z=ROCMAiterMLASparseBackend doesn't support fp8 kv_cache_dtype.r   z>Sparse MLA backend on ROCm only supports block size 1 for now.zUsing Sparse MLA backend.zUsing Triton MLA backend.z The selected backend, z,does not support block size .zUsing AITER MLA backend.zUsing AITER TRITON MLA backend.z1,is not MLA type while requested for MLA backend.zUsing FlexAttention backend.zUsing Triton Attention backend.zUsing Rocm Attention backend.z$Using Aiter Flash Attention backend.zThe selected backend, z*, is only supported on gfx9 architectures.z&Using Aiter Unified Attention backend.)get_current_vllm_config_or_noneFzAttention backend zM is not supported on ROCm. Note that V0 attention backends have been removed.)"vllm._aiter_opsr   rS   rW   
use_sparse
startswith
ValueErrorrq   rr   r
   ROCM_AITER_MLA_SPARSEget_pathuse_mlais_mla_enabledROCM_AITER_MLA
TRITON_MLAnameinfoROCM_AITER_TRITON_MLAFLEX_ATTENTIONTRITON_ATTN	ROCM_ATTNROCM_AITER_FArN   ROCM_AITER_UNIFIED_ATTNrc   VLLM_ROCM_USE_AITER%VLLM_ROCM_USE_AITER_UNIFIED_ATTENTIONVLLM_ROCM_USE_AITER_MHAvllm.configr   attention_configuse_prefill_decode_attentionRuntimeError)r   r   r   r   rS   rW   r   vllm_configr0   r0   r1   get_attn_backend_cls   s   







































z!RocmPlatform.get_attn_backend_clsc                 C   s   t jt jt jgS r+   )r
   
FLASH_ATTNr   
TORCH_SDPAr   r0   r0   r1   get_supported_vit_attn_backendsZ  s   z,RocmPlatform.get_supported_vit_attn_backendsrR   dtypebackendc                 C   s   |d ur!||   v sJ d| d|    td| d |S ddlm} ddlm} | r<t r<td t	j
S t rW|d	d urW|tjksO|tjkrWtd
 t	jS t rot ro|tjksg|tjkrotd t	jS td t	jS )NzBackend z= is not supported for vit attention. Supported backends are: zUsing backend z for vit attentionr   rh   r   z2Using AITER Flash Attention backend for ViT model.rj   z,Using Flash Attention backend for ViT model.z=Using Flash Attention (Triton backend) for ViT model on RDNA.z'Using Torch SDPA backend for ViT model.)r   rq   rr   rm   ri   r   r   
is_enabledrN   r
   r   rC   float16rb   r   rG   rt   r   )r   rR   r   r   ri   r   r0   r0   r1   get_vit_attn_backendb  s<   


z!RocmPlatform.get_vit_attn_backenddevicec                 C   s   t j| dS )z:
        Set the device for the current platform.
        N)rC   r6   
set_devicer   r   r0   r0   r1   r     s   zRocmPlatform.set_device   )maxsizer   	device_idc                 C   s   t j|\}}t||dS )N)majorminor)rC   r6   get_device_capabilityr   )r   r   r   r   r0   r0   r1   r     s   z"RocmPlatform.get_device_capabilityphysical_device_idsc           	      C   s   dd |D }t |D ]F\}}t |D ]=\}}||k rPzt||}|d dks-|d dkr2W   dS W q tyO } ztjd|d	 W Y d
}~  dS d
}~ww qqdS )zN
        Query if the set of gpus are fully connected by xgmi (1 hop)
        c                 S   s   g | ]}t  | qS r0   )r   )r9   ir0   r0   r1   
<listcomp>  s    z3RocmPlatform.is_fully_connected.<locals>.<listcomp>hopsr   type   Fz AMD 1 hop XGMI detection failed.)exc_infoNT)	enumerater   r   rq   error)	r   r   handlesr   handlejpeer_handle	link_typer   r0   r0   r1   is_fully_connected  s"   


zRocmPlatform.is_fully_connectedc                 C   s<   |  |}t | }t|}|d }|tv rt| S |d S )Nr   market_name)device_id_to_physical_device_idr   r   r(   )r   r   physical_device_idr   	asic_inforw   r0   r0   r1   get_device_name  s   

zRocmPlatform.get_device_namec                 C   s   t j|}|jS r+   )rC   r6   rD   total_memory)r   r   device_propsr0   r0   r1   get_device_total_memory  s   z$RocmPlatform.get_device_total_memoryr   r   c                 C   sj  ddl m} ddlm} |j}|j}|j}||jk}| }|	 }	|
 }
| }|j rL|jdkr>td |j|_n|jdkrLtd |j|_|re|jd u retjrbtjrbd|_td nd	|_|jd
krmd|_|	r||s|d|jvr||jd |
rd|jvr|jd |rd|jv rtd |jd |rd|jvrd|jvr|jd |jd d S )Nr   r   )CUDAGraphModer   z~Decode context parallel (DCP) is enabled, which is incompatible with full CUDA graphs. Overriding cudagraph_mode to PIECEWISE.zPrefill context parallel (PCP) is enabled, which is incompatible with full CUDA graphs. Overriding cudagraph_mode to PIECEWISE.r[   z=[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64.r]   r`   z vllm.v1.worker.gpu_worker.Workerz	-rms_normz	+rms_normz
-quant_fp8z
+quant_fp8z-grouped_topkzVLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled, which requires the 'grouped_topk' custom op. Overriding the user-provided '-grouped_topk'.z+grouped_topkz+sparse_attn_indexer)r   r   vllm.config.compilationr   cache_configcompilation_configparallel_configNONEis_fused_moe_enabledis_rmsnorm_enabledis_linear_fp8_enabled$is_fusion_moe_shared_experts_enabledcudagraph_modehas_full_cudagraphsdecode_context_parallel_sizerq   warning_once	PIECEWISEprefill_context_parallel_sizerS   rc   r   r   warning
worker_cls
custom_opsappendremove)r   r   r   r   r   r   r   is_eager_executionuse_aiter_fused_moeuse_aiter_rms_normuse_aiter_fp8_linearuse_aiter_fused_ser0   r0   r1   check_and_update_config  sh   




	



z$RocmPlatform.check_and_update_config
model_archc                 C   s>   |t v rtd| d|tv rt| }td|| d S d S )NzModel architecture 'z#' is not supported by ROCm for now.z:Model architecture '%s' is partially supported by ROCm: %s)r   r   r   rq   r   )r   r   msgr0   r0   r1   verify_model_arch  s   
zRocmPlatform.verify_model_archquantc                    s2   t  | |dkrtjstd dtjd< d S )Nr   zcUsing AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ is not set, enabling VLLM_USE_TRITON_AWQ.1VLLM_USE_TRITON_AWQ)r   verify_quantizationrc   r  rq   r   rn   ro   )r   r   r   r0   r1   r  !  s   z RocmPlatform.verify_quantizationc                 C      dS )Nz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPUr0   r   r0   r0   r1   get_punica_wrapper+     zRocmPlatform.get_punica_wrapperc                 C   s,   t j| t j|d t j|d  S )Nr   r   )rC   r6   reset_peak_memory_statsmem_get_infor   r0   r0   r1   get_current_memory_usage/  s    z%RocmPlatform.get_current_memory_usagec                 C   s   	 dS )NzHvllm.distributed.device_communicators.cuda_communicator.CudaCommunicatorr0   r   r0   r0   r1   get_device_communicator_cls6  s   z(RocmPlatform.get_device_communicator_clsc                    r5   )Nr   c                 3   r7   r+   r0   r9   gfxgcn_archr0   r1   r=   ?  r>   z+RocmPlatform.supports_mx.<locals>.<genexpr>)gfx95rB   r   r0   r  r1   supports_mx<  rH   zRocmPlatform.supports_mxc                    r5   )Nr   c                 3   r7   r+   r0   r
  r  r0   r1   r=   D  r>   z,RocmPlatform.supports_fp8.<locals>.<genexpr>)gfx94r  rA   rB   r   r0   r  r1   supports_fp8A  rH   zRocmPlatform.supports_fp8c                 C      dt jdjv S )Nr  r   rC   r6   rD   rE   r   r0   r0   r1   is_fp8_fnuzF  s   zRocmPlatform.is_fp8_fnuzc                 C   s   |   rtjS tjS r+   )r  rC   float8_e4m3fnuzfloat8_e4m3fnr   r0   r0   r1   	fp8_dtypeK  s   zRocmPlatform.fp8_dtypec                    s,   t jdj ddg}t fdd|D S )Nr   r  r  c                 3   r7   r+   r0   r
  r  r0   r1   r=   W  r>   z4RocmPlatform.use_custom_allreduce.<locals>.<genexpr>rB   )r   supported_archsr0   r  r1   use_custom_allreduceR  s   z!RocmPlatform.use_custom_allreducec                 C   r  NTr0   r   r0   r0   r1   opaque_attention_opY  r  z RocmPlatform.opaque_attention_opc                 C   r  )Ngfx1r   r  r   r0   r0   r1   is_navi]  s   zRocmPlatform.is_navic                 C   r  )Nz,vllm.compilation.cuda_graph.CUDAGraphWrapperr0   r   r0   r0   r1   get_static_graph_wrapper_clsa  r  z)RocmPlatform.get_static_graph_wrapper_clsc                 C   s   t  S r+   r   r   r0   r0   r1   device_counte  s   zRocmPlatform.device_countc                 C   sb   |t jkr-| ds/|  }|  }|d u rd}n	| }d| }td| d| dd S d S )NP   z"does not have a compute capabilityzhas compute capability zQBfloat16 is only supported on GPUs with compute capability of at least 8.0. Your z GPU zg. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.)rC   rb   has_device_capabilityr   r   as_version_strr   )r   r   
capabilitygpu_namecompute_strversion_strr0   r0   r1   check_if_supports_dtypei  s"   


z$RocmPlatform.check_if_supports_dtypec                 C   r  r  r0   r   r0   r0   r1   support_hybrid_kv_cache~  r  z$RocmPlatform.support_hybrid_kv_cachec                 C   r  r  r0   r   r0   r0   r1   support_static_graph_mode  r  z&RocmPlatform.support_static_graph_mode)r4   Nr+   )r   )r   r   r4   N):__name__
__module____qualname__r   ROCM_enumrw   str__annotations__rx   rz   r|   r~   r   r   listrN   classmethodr   r   r   intrC   r   r   r   r   r   r   r   r   r3   boolr   r   r   r   r   r  r  typesDevicefloatr  r	  r  r  r  r  r  r  r  r  r  r'  r(  r)  __classcell__r0   r0   r   r1   ru      s   
 

u,	P	
ru   )NN)@rn   	functoolsr   r   r   typingr   r   rC   	vllm.envsrc   vllm.loggerr   vllm.utils.torch_utilsr	   #vllm.v1.attention.backends.registryr
   	interfacer   r   r   r   r   vllm.v1.attention.selectorr   r*  rq   amdsmir   r   r   r   r   r   rs   er   vllm._Cr   r   r   r1  r/  r0  r   dictr(   ro   valrp   cuda_valr3   r4  rG   rK   rN   rO   rP   r   r3  Tensorrg   rt   ru   r0   r0   r0   r1   <module>   s   
$


		
-