o
    
۾id                     @   s  U d dl Z d dlmZmZmZ d dlmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ ddlmZmZmZ erId d	lmZ d d
lmZ eeZzd dlmZmZmZmZmZm Z  W n e!yw Z" ze#de" W Y dZ"["ndZ"["ww zd dl$Z%W n e!y Z" ze#de" W Y dZ"["ndZ"["ww zd dl&Z%W n e!y Z" ze#de" W Y dZ"["ndZ"["ww g Z'e(e) e*d< i Z+e,e)e)f e*d< dddddddddd	Z-e,e)e)f e*d< de j.v re j.d Z/e j.0dd Z1re/e1ksJ ne/e j.d< dd Z2e2de)fddZ3ede)fd d!Z4ede5fd"d#Z6ede5fd$d%Z7ede5fd&d'Z8ede5fd(d)Z9ede5fd*d+Z:e		d;d,ej;d-e<d.e<d/e<d0e<d1e<d2e)d3ej=dB d4ej=dB de5fd5d6Z>ede5fd7d8Z?G d9d: d:eZ@dS )<    N)cache	lru_cachewraps)TYPE_CHECKING)init_loggercuda_device_count_stateless)AttentionBackendEnum   )DeviceCapabilityPlatformPlatformEnum)
VllmConfig)AttentionSelectorConfig)AmdSmiExceptionamdsmi_get_gpu_asic_infoamdsmi_get_processor_handlesamdsmi_initamdsmi_shut_downamdsmi_topo_get_link_typez$Failed to import from amdsmi with %rz%Failed to import from vllm._C with %rz*Failed to import from vllm._rocm_C with %r_ROCM_UNSUPPORTED_MODELS _ROCM_PARTIALLY_SUPPORTED_MODELSAMD_Instinct_MI300AAMD_Instinct_MI300XAMD_Instinct_MI308XAMD_Instinct_MI325XAMD_Instinct_MI300X_HFAMD_Radeon_RX7900XTX)	0x74a00x74a10x74b50x74a20x74a50x74b90x74a90x74bd0x744c_ROCM_DEVICE_ID_NAME_MAPHIP_VISIBLE_DEVICESCUDA_VISIBLE_DEVICESc                    s   t   fdd}|S )Nc                     s&   t   z | i |W t  S t  w N)r   r   )argskwargsfn G/home/ubuntu/.local/lib/python3.10/site-packages/vllm/platforms/rocm.pywrapperO   s   z$with_amdsmi_context.<locals>.wrapper)r   )r.   r1   r/   r-   r0   with_amdsmi_contextN   s   r2   returnc                  C   s2   t  } | rt| d }|dd}|r|S td)z4Query GCN arch from amdsmi. Raises if not available.r   target_graphics_version z$amdsmi did not return valid GCN arch)r   r   getRuntimeError)handles	asic_info
target_gfxr/   r/   r0   _query_gcn_arch_from_amdsmiZ   s   r;   c               
   C   sR   zt  W S  ty! }  ztd|  td W Y d} ~ nd} ~ ww tjdjS )z
    Get the GCN architecture name using amdsmi instead of torch.cuda.
    This avoids initializing CUDA, which is important for Ray workers
    that need to set CUDA_VISIBLE_DEVICES after importing vLLM.
    z%Failed to get GCN arch via amdsmi: %szFailed to get GCN arch via amdsmi, falling back to torch.cuda. This will initialize CUDA and may cause issues if CUDA_VISIBLE_DEVICES is not set yet.Ncuda)	r;   	Exceptionloggerdebugwarning_oncetorchr<   get_device_propertiesgcnArchName)er/   r/   r0   _get_gcn_arch_via_amdsmih   s   rE   c                         t   t fdddD S )Nc                 3       | ]}| v V  qd S r*   r/   .0archGPU_ARCHr/   r0   	<genexpr>       zon_gfx1x.<locals>.<genexpr>gfx11gfx12rE   anyr/   r/   rK   r0   on_gfx1x|      rT   c                      rF   )Nc                 3   rG   r*   r/   rH   rK   r/   r0   rM      rN   zon_mi3xx.<locals>.<genexpr>)gfx942gfx950rR   r/   r/   rK   r0   on_mi3xx   rU   rX   c                      rF   )Nc                 3   rG   r*   r/   rH   rK   r/   r0   rM      rN   zon_gfx9.<locals>.<genexpr>gfx90arV   rW   rR   r/   r/   rK   r0   on_gfx9   rU   r[   c                      rF   )Nc                 3   rG   r*   r/   rH   rK   r/   r0   rM      rN   zon_gfx942.<locals>.<genexpr>)rV   rR   r/   r/   rK   r0   	on_gfx942   rU   r\   c                      rF   )Nc                 3   rG   r*   r/   rH   rK   r/   r0   rM      rN   zon_gfx950.<locals>.<genexpr>)rW   rR   r/   r/   rK   r0   	on_gfx950   rU   r]   qtype	head_size
block_size	gqa_ratiomax_seq_lensliding_windowkv_cache_dtypealibi_slopessinksc	                    s  t   t fdddD }	t fdddD }
|	rP|dks#|dkoO| tjkp,| tjkoO|dkp4|d	koO|d
kp<|dkoO|dkoD|d
koO|dkoOtjoO|d u S |
o|dkpY|dko| tjkpc| tjko|d	ko|d
ko|dkos|d
ko|dko|d u o|dkotjo|d u S )Nc                 3   rG   r*   r/   rH   rK   r/   r0   rM      rN   z2use_rocm_custom_paged_attention.<locals>.<genexpr>rY   c                 3   rG   r*   r/   rH   rK   r/   r0   rM      rN   rO   r   )rg   @             r
   i      auto)rE   rS   rA   halfbfloat16envsVLLM_ROCM_CUSTOM_PAGED_ATTN)r^   r_   r`   ra   rb   rc   rd   re   rf   ON_GFX9ON_GFX11_GFX12r/   rK   r0   use_rocm_custom_paged_attention   sP   	
rt   c                  C   sv   t  sdS z+ddlm}  | dd u rW dS | dd u rW dS tjddkr.td W dS W d	S  ty:   Y dS w )
NFr   	find_spec
flash_attnz flash_attn.flash_attn_triton_amd!FLASH_ATTENTION_TRITON_AMD_ENABLETRUEz\Set FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE to enable Flash Attention Triton backend on RDNA.T)	rT   importlib.utilrv   osenvironr6   r>   	info_onceImportErrorru   r/   r/   r0   flash_attn_triton_available   s"   r   c                
       s  e Zd ZU ejZdZeed< dZ	eed< dZ
eed< dZeed< d	Zeed
< dZeed< g dZee ed< g dZee ed< e sHedg7 Zed\ fddZedddddefddZeded fddZe	d]dedejd d!ddfd"d#Zed$ejddfd%d&Zeed'd(d^d*ededB fd+d,Zeed-ee de fd.d/Z!eeed'd(d^d*edefd0d1Z"ed^d*edefd2d3Z#ed_d6d7Z$ed8eddfd9d:Z%ed;eddf fd<d=Z&edefd>d?Z'e	d]d$ej(j)dB de*fd@dAZ+edefdBdCZ,ede fdDdEZ-ede fdFdGZ.ede fdHdIZ/edejfdJdKZ0ede fdLdMZ1ede fdNdOZ2ede fdPdQZ3edefdRdSZ4edefdTdUZ5edejfdVdWZ6ede fdXdYZ7ede fdZd[Z8  Z9S )`RocmPlatformrocmdevice_namer<   device_typeCUDAdispatch_keyGPUray_device_keynccldist_backendr)   device_control_env_var)*RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES+RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES+RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICESray_noset_device_env_vars)awq
awq_marlingptqgptq_marlinfp8zcompressed-tensors
fbgemm_fp8ggufquarkptpc_fp8mxfp4petit_nvfp4torchaosupported_quantizationbitsandbytesr3   Nc                    sJ   t    ddl}|t ddl}W d   dS 1 sw   Y  dS )zImport ROCm-specific kernels.r   N)superimport_kernels
contextlibsuppressr~   vllm._rocm_C)clsr   vllm	__class__r/   r0   r      s
   

"zRocmPlatform.import_kernelsselected_backendr	   attn_selector_configr   c                 C   s  ddl m} |j}|j}|jr,|r|drtd|dks"J dtd t	j
 S |jr|d u rA| s;|dkr>t	jnt	j}|t	jkr`|dkrTtd t	j S td	|j d
| d|t	jkrotd t	j S |t	jkr~td t	j S td	|j d|t	jkrtd t	j S |t	jkrtd t	j S |t	jkrtd t	j S |t	jkrt rtd t	j S td|j d|t	jkrtd t	j S |d u rHtjrtjrtd t	j S tjr	tjr	t r	td t	j S ddlm} | }|d ur&|jj r&td t	j S tjr>t r>tjdur>td t	j S td t	j S t!d|j d)Nr   rocm_aiter_opsr   z=ROCMAiterMLASparseBackend doesn't support fp8 kv_cache_dtype.r
   z>Sparse MLA backend on ROCm only supports block size 1 for now.zUsing Sparse MLA backend.zUsing Triton MLA backend.z The selected backend, z,does not support block size .zUsing AITER MLA backend.zUsing AITER TRITON MLA backend.z1,is not MLA type while requested for MLA backend.zUsing FlexAttention backend.zUsing Triton Attention backend.zUsing Rocm Attention backend.z$Using Aiter Flash Attention backend.zThe selected backend, z*, is only supported on gfx9 architectures.z&Using Aiter Unified Attention backend.)get_current_vllm_config_or_noneFzAttention backend zM is not supported on ROCm. Note that V0 attention backends have been removed.)"vllm._aiter_opsr   r`   rd   
use_sparse
startswith
ValueErrorr>   r}   r	   ROCM_AITER_MLA_SPARSEget_pathuse_mlais_mla_enabledROCM_AITER_MLA
TRITON_MLAnameinfoROCM_AITER_TRITON_MLAFLEX_ATTENTIONTRITON_ATTN	ROCM_ATTNROCM_AITER_FAr[   ROCM_AITER_UNIFIED_ATTNrp   VLLM_ROCM_USE_AITER%VLLM_ROCM_USE_AITER_UNIFIED_ATTENTIONVLLM_ROCM_USE_AITER_MHAvllm.configr   attention_configuse_prefill_decode_attentionr7   )r   r   r   r   r`   rd   r   vllm_configr/   r/   r0   get_attn_backend_cls  s   







































z!RocmPlatform.get_attn_backend_clsc                 C   s   t jt jt jgS r*   )r	   
FLASH_ATTNr   
TORCH_SDPAr   r/   r/   r0   get_supported_vit_attn_backends  s   z,RocmPlatform.get_supported_vit_attn_backendsr_   dtypebackendzAttentionBackendEnum | Nonec                 C   s   |d ur!||   v sJ d| d|    td| d |S ddlm} ddlm} | r<t r<td t	j
S t rW|d	d urW|tjksO|tjkrWtd
 t	jS t rot ro|tjksg|tjkrotd t	jS td t	jS )NzBackend z= is not supported for vit attention. Supported backends are: zUsing backend z for vit attentionr   ru   r   z2Using AITER Flash Attention backend for ViT model.rw   z,Using Flash Attention backend for ViT model.z=Using Flash Attention (Triton backend) for ViT model on RDNA.z'Using Torch SDPA backend for ViT model.)r   r>   r}   rz   rv   r   r   
is_enabledr[   r	   r   rA   float16ro   r   rT   r   r   )r   r_   r   r   rv   r   r/   r/   r0   get_vit_attn_backend  s<   


z!RocmPlatform.get_vit_attn_backenddevicec                 C   s   t j| dS )z:
        Set the device for the current platform.
        N)rA   r<   
set_device)r   r   r/   r/   r0   r     s   zRocmPlatform.set_device   )maxsizer   	device_idc                 C   s   t j|\}}t||dS )N)majorminor)rA   r<   get_device_capabilityr   )r   r   r   r   r/   r/   r0   r     s   z"RocmPlatform.get_device_capabilityphysical_device_idsc           	      C   s   dd |D }t |D ]F\}}t |D ]=\}}||k rPzt||}|d dks-|d dkr2W   dS W q tyO } ztjd|d	 W Y d
}~  dS d
}~ww qqdS )zN
        Query if the set of gpus are fully connected by xgmi (1 hop)
        c                 S   s   g | ]}t  | qS r/   )r   )rI   ir/   r/   r0   
<listcomp>  s    z3RocmPlatform.is_fully_connected.<locals>.<listcomp>hopsr
   type   Fz AMD 1 hop XGMI detection failed.)exc_infoNT)	enumerater   r   r>   error)	r   r   r8   r   handlejpeer_handle	link_typer   r/   r/   r0   is_fully_connected  s"   


zRocmPlatform.is_fully_connectedc                 C   s<   |  |}t | }t|}|d }|tv rt| S |d S )Nr   market_name)device_id_to_physical_device_idr   r   r'   )r   r   physical_device_idr   r9   r   r/   r/   r0   get_device_name  s   

zRocmPlatform.get_device_namec                 C   s   t j|}|jS r*   )rA   r<   rB   total_memory)r   r   device_propsr/   r/   r0   get_device_total_memory  s   z$RocmPlatform.get_device_total_memoryr   r   c                 C   sj  ddl m} ddlm} |j}|j}|j}||jk}| }|	 }	|
 }
| }|j rL|jdkr>td |j|_n|jdkrLtd |j|_|re|jd u retjrbtjrbd|_td nd	|_|jd
krmd|_|	r||s|d|jvr||jd |
rd|jvr|jd |rd|jv rtd |jd |rd|jvrd|jvr|jd |jd d S )Nr   r   )CUDAGraphModer
   z~Decode context parallel (DCP) is enabled, which is incompatible with full CUDA graphs. Overriding cudagraph_mode to PIECEWISE.zPrefill context parallel (PCP) is enabled, which is incompatible with full CUDA graphs. Overriding cudagraph_mode to PIECEWISE.rh   z=[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64.rj   rm   z vllm.v1.worker.gpu_worker.Workerz	-rms_normz	+rms_normz
-quant_fp8z
+quant_fp8z-grouped_topkzVLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled, which requires the 'grouped_topk' custom op. Overriding the user-provided '-grouped_topk'.z+grouped_topkz+sparse_attn_indexer)r   r   vllm.config.compilationr   cache_configcompilation_configparallel_configNONEis_fused_moe_enabledis_rmsnorm_enabledis_linear_fp8_enabled$is_fusion_moe_shared_experts_enabledcudagraph_modehas_full_cudagraphsdecode_context_parallel_sizer>   r@   	PIECEWISEprefill_context_parallel_sizer`   rp   r   r   warning
worker_cls
custom_opsappendremove)r   r   r   r   r   r   r   is_eager_executionuse_aiter_fused_moeuse_aiter_rms_normuse_aiter_fp8_linearuse_aiter_fused_ser/   r/   r0   check_and_update_config  sh   




	



z$RocmPlatform.check_and_update_config
model_archc                 C   s>   |t v rtd| d|tv rt| }td|| d S d S )NzModel architecture 'z#' is not supported by ROCm for now.z:Model architecture '%s' is partially supported by ROCm: %s)r   r   r   r>   r   )r   r  msgr/   r/   r0   verify_model_arch9  s   
zRocmPlatform.verify_model_archquantc                    s2   t  | |dkrtjstd dtjd< d S )Nr   zcUsing AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ is not set, enabling VLLM_USE_TRITON_AWQ.1VLLM_USE_TRITON_AWQ)r   verify_quantizationrp   r  r>   r   r{   r|   )r   r	  r   r/   r0   r  H  s   z RocmPlatform.verify_quantizationc                 C      dS )Nz4vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPUr/   r   r/   r/   r0   get_punica_wrapperR     zRocmPlatform.get_punica_wrapperc                 C   s$   t j| t j|\}}|| S r*   )rA   r<   reset_peak_memory_statsmem_get_info)r   r   free_mem	total_memr/   r/   r0   get_current_memory_usageV  s   z%RocmPlatform.get_current_memory_usagec                 C   s   	 dS )NzHvllm.distributed.device_communicators.cuda_communicator.CudaCommunicatorr/   r   r/   r/   r0   get_device_communicator_cls^  s   z(RocmPlatform.get_device_communicator_clsc                    $   t jdj t fdddD S )Nr   c                 3   rG   r*   r/   rI   gfxgcn_archr/   r0   rM   g  rN   z+RocmPlatform.supports_mx.<locals>.<genexpr>)gfx95rA   r<   rB   rC   rS   r   r/   r  r0   supports_mxd     zRocmPlatform.supports_mxc                    r  )Nr   c                 3   rG   r*   r/   r  r  r/   r0   rM   l  rN   z,RocmPlatform.supports_fp8.<locals>.<genexpr>)gfx94r  rQ   r  r   r/   r  r0   supports_fp8i  r  zRocmPlatform.supports_fp8c                 C      dt jdjv S )Nr  r   rA   r<   rB   rC   r   r/   r/   r0   is_fp8_fnuzn  s   zRocmPlatform.is_fp8_fnuzc                 C   s   |   rtjS tjS r*   )r#  rA   float8_e4m3fnuzfloat8_e4m3fnr   r/   r/   r0   	fp8_dtypes  s   zRocmPlatform.fp8_dtypec                    s,   t jdj ddg}t fdd|D S )Nr   r  r  c                 3   rG   r*   r/   r  r  r/   r0   rM     rN   z4RocmPlatform.use_custom_allreduce.<locals>.<genexpr>r  )r   supported_archsr/   r  r0   use_custom_allreducez  s   z!RocmPlatform.use_custom_allreducec                 C   r  NTr/   r   r/   r/   r0   opaque_attention_op  r  z RocmPlatform.opaque_attention_opc                 C   r!  )Ngfx1r   r"  r   r/   r/   r0   is_navi  s   zRocmPlatform.is_navic                 C   r  )Nz,vllm.compilation.cuda_graph.CUDAGraphWrapperr/   r   r/   r/   r0   get_static_graph_wrapper_cls  r  z)RocmPlatform.get_static_graph_wrapper_clsc                 C   s   t  S r*   r   r   r/   r/   r0   device_count  s   zRocmPlatform.device_countc                 C   sb   |t jkr-| ds/|  }|  }|d u rd}n	| }d| }td| d| dd S d S )NP   z"does not have a compute capabilityzhas compute capability zQBfloat16 is only supported on GPUs with compute capability of at least 8.0. Your z GPU zg. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.)rA   ro   has_device_capabilityr   r   as_version_strr   )r   r   
capabilitygpu_namecompute_strversion_strr/   r/   r0   check_if_supports_dtype  s"   


z$RocmPlatform.check_if_supports_dtypec                 C   r  r)  r/   r   r/   r/   r0   support_hybrid_kv_cache  r  z$RocmPlatform.support_hybrid_kv_cachec                 C   r  r)  r/   r   r/   r/   r0   support_static_graph_mode  r  z&RocmPlatform.support_static_graph_mode)r3   Nr*   )r   )r   r   r3   N):__name__
__module____qualname__r   ROCM_enumr   str__annotations__r   r   r   r   r   r   listr   r[   classmethodr   r   r   intrA   r   r   r   r   r   r   r   r2   boolr   r   r   r  r  r  r  typesDevicefloatr  r  r  r   r#  r&  r(  r*  r,  r-  r.  r6  r7  r8  __classcell__r/   r/   r   r0   r      s   
 

u,	P	
r   )NN)Ar{   	functoolsr   r   r   typingr   rA   	vllm.envsrp   vllm.loggerr   vllm.utils.torch_utilsr   #vllm.v1.attention.backends.registryr	   	interfacer   r   r   r   r   vllm.v1.attention.selectorr   r9  r>   amdsmir   r   r   r   r   r   r~   rD   r   vllm._Cr   r   r   r@  r>  r?  r   dictr'   r|   valr6   cuda_valr2   r;   rE   rC  rT   rX   r[   r\   r]   r   rB  Tensorrt   r   r   r/   r/   r/   r0   <module>   s   
$


		
-