"""
This file is a platform abstraction for ROCm GPUs,
adjusted to match the structure and interface of `cuda.py`.
"""

from functools import lru_cache
from typing import Any

import torch

import sglang.multimodal_gen.envs as envs
from sglang.multimodal_gen.runtime.platforms.interface import (
    AttentionBackendEnum,
    DeviceCapability,
    Platform,
    PlatformEnum,
)
from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger

logger = init_logger(__name__)


class RocmPlatform(Platform):
    _enum = PlatformEnum.ROCM
    device_name: str = "rocm"
    device_type: str = "cuda"
    dispatch_key: str = "CUDA"
    device_control_env_var: str = "CUDA_VISIBLE_DEVICES"

    @classmethod
    def get_local_torch_device(cls) -> torch.device:
        return torch.device(f"cuda:{envs.LOCAL_RANK}")

    @classmethod
    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability:
        major, minor = torch.cuda.get_device_capability(device_id)
        return DeviceCapability(major=major, minor=minor)

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return str(torch.cuda.get_device_name(device_id))

    @classmethod
    @lru_cache(maxsize=8)  # assumption: the exact cache size is not recoverable from the compiled module
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        return torch.cuda.get_device_properties(device_id).total_memory

    @classmethod
    def is_async_output_supported(cls, enforce_eager: bool | None) -> bool:
        if enforce_eager:
            logger.warning(
                "To see benefits of async output processing, enable CUDA graph. "
                "Since enforce-eager is enabled, async output processor cannot be used"
            )
            return False
        return True

    @classmethod
    def log_warnings(cls) -> None:
        pass

    @classmethod
    def get_current_memory_usage(cls, device: torch.device | None = None) -> float:
        torch.cuda.reset_peak_memory_stats(device)
        return float(torch.cuda.max_memory_allocated(device))

    @classmethod
    def get_available_gpu_memory(
        cls,
        device_id: int = 0,
        distributed: bool = False,
        empty_cache: bool = True,
        cpu_group: Any = None,
    ) -> float:
        if empty_cache:
            torch.cuda.empty_cache()
        free_gpu_memory, _ = torch.cuda.mem_get_info(device_id)
        if distributed:
            import torch.distributed as dist

            tensor = torch.tensor(free_gpu_memory, dtype=torch.float32, device=device_id)
            dist.all_reduce(tensor, op=dist.ReduceOp.MIN, group=cpu_group)
            free_gpu_memory = tensor.item()
        # Report the free memory in GiB.
        return free_gpu_memory / (1 << 30)

    @classmethod
    def get_attn_backend_cls_str(
        cls,
        selected_backend: AttentionBackendEnum | None,
        head_size: int,
        dtype: torch.dtype,
    ) -> str:
        if selected_backend == AttentionBackendEnum.TORCH_SDPA:
            logger.info("Using Torch SDPA backend.")
            return "sglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackend"

        if selected_backend in (AttentionBackendEnum.FA, None):
            pass
        elif selected_backend == AttentionBackendEnum.AITER:
            if dtype not in (torch.float16, torch.bfloat16):
                logger.warning(
                    "AITer backend works best with fp16/bf16 inputs but got dtype=%s. "
                    "Proceeding with AITer anyway.",
                    dtype,
                )
            logger.info("Using AITer backend on ROCm.")
            return "sglang.multimodal_gen.runtime.layers.attention.backends.aiter.AITerBackend"
        elif selected_backend in (
            AttentionBackendEnum.SLIDING_TILE_ATTN,
            AttentionBackendEnum.SAGE_ATTN,
        ):
            raise ValueError(f"{selected_backend.name} is not supported on {cls.device_name}.")
        elif selected_backend:
            raise ValueError(
                f"Invalid attention backend for {cls.device_name}: {selected_backend}"
            )

        target_backend = AttentionBackendEnum.FA
        if dtype not in (torch.float16, torch.bfloat16):
            logger.info(
                "Cannot use FlashAttention backend for dtype other than "
                "torch.float16 or torch.bfloat16."
            )
            target_backend = AttentionBackendEnum.TORCH_SDPA

        if target_backend == AttentionBackendEnum.FA:
            try:
                import flash_attn  # noqa: F401

                from sglang.multimodal_gen.runtime.layers.attention.backends.flash_attn import (
                    FlashAttentionBackend,
                )

                supported_sizes = FlashAttentionBackend.get_supported_head_sizes()
                if head_size not in supported_sizes:
                    logger.info(
                        "Cannot use FlashAttention-2 backend for head size %d.", head_size
                    )
                    target_backend = AttentionBackendEnum.TORCH_SDPA
            except ImportError:
                logger.info(
                    "Cannot use FlashAttention backend because the flash_attn package "
                    "is not found. Make sure that flash_attn was built and installed "
                    "(on by default)."
                )
                target_backend = AttentionBackendEnum.TORCH_SDPA

        if target_backend == AttentionBackendEnum.TORCH_SDPA:
            logger.info("Using Torch SDPA backend.")
            return "sglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackend"

        logger.info("Using Flash Attention backend.")
        return (
            "sglang.multimodal_gen.runtime.layers.attention.backends."
            "flash_attn.FlashAttentionBackend"
        )

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        return (
            "sglang.multimodal_gen.runtime.distributed.device_communicators."
            "cuda_communicator.CudaCommunicator"
        )

    @classmethod
    def enable_dit_layerwise_offload_for_wan_by_default(cls) -> bool:
        """ROCm performs better without DIT layerwise offload on Wan."""
        return False
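

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): a minimal illustration
# of how the selection helpers above might be called. The head size, the dtype,
# and running this file directly are illustrative assumptions only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Let the platform choose between FlashAttention and Torch SDPA for an
    # example configuration; the return value is a dotted backend class path.
    backend_cls_path = RocmPlatform.get_attn_backend_cls_str(
        None,            # no explicit backend override
        128,             # example head size (assumption)
        torch.bfloat16,  # example dtype (assumption)
    )
    print("attention backend:", backend_cls_path)
    print("device communicator:", RocmPlatform.get_device_communicator_cls())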