o
    	۾i                  !   @   s
  U d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ e	 Zdefdd	Ze Zdefd
dZdedefddZ		 	 							ddejdejdejdejdejdejdB dedededejdB dejdB dejdB dejdB dejdB dejdB dejf dd Z		 	 							ddejdejdejdejdejdejdB dedededejdB dejdB dejdB dejdB dejdB dejdB dejf d!d"Z								 ddejdejdejd#ejdejd$ejdB d%ejdB d&ejdB d'ejdB d(ed)ejdB dejdB dedejfd*d+Z								 ddejdejdejd#ejdejd$ejdB d%ejdB d&ejdB d'ejdB d(ed)ejdB dejdB dedejfd,d-Zd#ejd.ejd/ejd0ejd1eddfd2d3Zd#ejd.ejd/ejd0ejd1eddfd4d5Zd#ejd.ejd0ejddfd6d7Zd#ejd.ejd0ejddfd8d9Z 	:dd0ejd;ejd#ejdejd<ed=ed>ed?e!ddfd@dAZ"	:dd0ejd;ejd#ejdejd<ed=ed>ed?e!ddfdBdCZ#	D	:dd0ejd#ejdejd<ed=ed>edEe$d?e!ddfdFdGZ%	D	:dd0ejd#ejdejd<ed=ed>edEe$d?e!ddfdHdIZ&da'edB e(dJ< defdKdLZ)				:	M		ddNejdOejdPejdQejdRedSejdB dTejdB dUejdB dVe!dWe!dXejdB dYejdB ddfdZd[Z*				:	M		ddNejdOejdPejdQejdRedSejdB dTejdB dUejdB dVe!dWe!dXejdB dYejdB ddfd\d]Z+dej,fd^ejd_ejd`ejdaejdbejdB dejdejfdcddZ-dej,fd^ejd_ejd`ejdaejdbejdB dejdejfdedfZ.ej,fd^ejd_ejd`ejdaejdejdejfdgdhZ/ej,fd^ejd_ejd`ejdaejdejdejfdidjZ0ej,fd^ejd_ejd`ejdaejdejdejfdkdlZ1ej,fd^ejd_ejd`ejdaejdejdejfdmdnZ2doejdpejdqe!dejfdrdsZ3doejdpejdqe!dejfdtduZ4doejdvejdpejdqe!de5ejejf f
dwdxZ6doejdvejdpejdqe!de5ejejf f
dydzZ7doejdvejdpejd{e!d|ejde5ejejejf fd}d~Z8doejdvejdpejd{e!d|ejde5ejejejf fddZ9doejdpejd{e!d|ejde5ejejf f
ddZ:doejdpejd{e!d|ejde5ejejf f
ddZ;	ddoejd|ejdejdB de5ejejf fddZ<	ddoejd|ejdejdB de5ejejf fddZ=	ddoejd|ejdejdB de5ejejf fddZ>	ddoejd|ejdejdB de5ejejf fddZ?doejdvejdpejdqe!dede5ejejejf fddZ@doejdvejdpejdqe!dede5ejejejf fddZAdoejdpejdqe!dede5ejejf f
ddZBdoejdpejdqe!dede5ejejf f
ddZCdoejdede5ejejf fddZDdoejdede5ejejf fddZEdoejdede5ejejf fddZFdoejdede5ejejf fddZGdoejdpejdqe!dvejdede5ejejf fddZHdoejdpejdqe!dvejdede5ejejf fddZIdaJG dd dZKeKL  dS )    N)Callable)
OpOverload)current_platform)direct_register_custom_op)rocm_aiter_sparse_attn_indexer#rocm_aiter_sparse_attn_indexer_fakereturnc                  C   s   ddl m}  | dd uS )Nr   	find_specaiter)importlib.utilr
   r	    r   C/home/ubuntu/.local/lib/python3.10/site-packages/vllm/_aiter_ops.pyis_aiter_found   s   r   c                  C   s"   t  rtrddlm}  |  S dS )a  Check if AITER library is available and platform supports it.

    Checks: platform (ROCm), device arch (gfx9), and library existence.
    Does NOT check environment variables - that's handled by rocm_aiter_ops.is_enabled().

    This function determines if aiter CAN be used, not if it SHOULD be used.

    Separation of concerns:
    - This function: Can aiter work on this system? (platform + library availability)
    - rocm_aiter_ops.is_enabled(): Should aiter be used by default? (adds env var check)
    - Backend selection: Can explicitly request aiter regardless of env var

    This allows explicit backend selection via attention_config to work even when
    VLLM_ROCM_USE_AITER=0, while preventing unwanted JIT warnings for auto-discovery.
    r   on_gfx9F)r   is_rocmIS_AITER_FOUNDvllm.platforms.rocmr   r   r   r   r   is_aiter_found_and_supported$   s   r   funcc                    s   t   fdd}|S )zpDecorator that only executes the function if
    ROCm AITER package is supported and enabled on gfx9 archs.
    c                     s   t  r
 | i |S d S N)r   )argskwargsr   r   r   wrapper@   s   z#if_aiter_supported.<locals>.wrapper)	functoolswraps)r   r   r   r   r   if_aiter_supported;   s   r   Fhidden_statesw1w2topk_weighttopk_idsexpert_maskactivation_methodquant_methoddoweight_stage1w1_scalew2_scalea1_scalea2_scalenum_local_tokensoutput_dtypec                 C   sR   ddl m}m} ddlm} ||}||}|| |||||||||	|
||||dS )Nr   )ActivationType	QuantType)	fused_moe)r,   dtype)r   r.   r/   aiter.fused_moer0   )r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   
activation
quant_typer   r   r   _rocm_aiter_fused_moe_implJ   s*   r5   c                 C   s    |d urt j| |dS t | S )Nr1   torch
empty_liker   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r   r   r   _rocm_aiter_fused_moe_faket   s   
r;   topk_weights	fc1_scale	fc2_scalefc1_smooth_scalefc2_smooth_scalea16per_tensor_quant_scalec                 C   sB   ddl m} ddlm} ||}|| |||||||||	|
||dS )Nr   )r.   )asm_moe_tkw1)r=   r>   r?   r@   rA   rB   r$   r3   )r   r.   aiter.fused_moe_bf16_asmrC   )r   r    r!   r<   r#   r=   r>   r?   r@   rA   rB   r$   r%   r.   rC   r3   r   r   r   _rocm_aiter_asm_moe_tkw1_impl   s$   rE   c                 C   
   t | S r   r7   r   r    r!   r<   r#   r=   r>   r?   r@   rA   rB   r$   r%   r   r   r   _rocm_aiter_asm_moe_tkw1_fake   s   
rH   topk_indicestoken_expert_indicesgating_outputrenormalizec                 C   s    ddl m} || |||| d S )Nr   )topk_softmax)r   rM   )r<   rI   rJ   rK   rL   rM   r   r   r   _rocm_aiter_topk_softmax_impl   s   
rN   c                 C      d S r   r   r<   rI   rJ   rK   rL   r   r   r   _rocm_aiter_topk_softmax_fake   s   rQ   c                 C   s   ddl m} || || d S )Nr   )topk_sigmoid)r   rR   )r<   rI   rK   rR   r   r   r   _rocm_aiter_topk_sigmoid_impl   s   rS   c                 C   rO   r   r   )r<   rI   rK   r   r   r   _rocm_aiter_topk_sigmoid_fake   s   rT         ?correction_biasnum_expert_group
topk_groupneed_renormrouted_scaling_factorc           	   	   C   s&   ddl m} || ||||||| d S )Nr   )biased_grouped_topk)r   r[   )	rK   rV   r<   r#   rW   rX   rY   rZ   r[   r   r   r   $_rocm_aiter_biased_grouped_topk_impl   s   
r\   c                 C   rO   r   r   rK   rV   r<   r#   rW   rX   rY   rZ   r   r   r   $_rocm_aiter_biased_grouped_topk_fake     
r^   softmaxscoring_funcc           
   	   C   s.   |dk}ddl m}	 |	| ||||||| d S )Nr`   r   )grouped_topk)r   rb   )
rK   r<   r#   rW   rX   rY   ra   rZ   
is_softmaxrb   r   r   r   _rocm_aiter_grouped_topk_impl  s   
rd   c                 C   rO   r   r   rK   r<   r#   rW   rX   rY   ra   rZ   r   r   r   _rocm_aiter_grouped_topk_fake)  r_   rf   _AITER_MLA_SUPPORTS_FP8c                  C   sh   t du r2zddl} ddlm} | |}d|jv od|jv a W t S  tttt	t
fy1   da Y t S w t S )zKCheck if aiter.mla.mla_decode_fwd supports q_scale and kv_scale parameters.Nr   mla_decode_fwdq_scalekv_scaleF)rg   inspect	aiter.mlari   	signature
parametersImportErrorModuleNotFoundErrorAttributeError
ValueError	TypeError)rl   ri   sigr   r   r   _check_aiter_mla_fp8_support:  s&   

rv           q	kv_buffero	qo_indptrmax_seqlen_qo	kv_indptr
kv_indiceskv_last_page_lenssm_scale	logit_caprj   rk   c              	   C   s`   ddl m} ||	d}t r|
|d< ||d< || |ddd| jd ||||||fi | d S )Nr   rh   )r   r   rj   rk      )rm   ri   rv   viewshape)rx   ry   rz   r{   r|   r}   r~   r   r   r   rj   rk   ri   r   r   r   r   _rocm_aiter_mla_decode_fwd_implV  s&   	
r   c                 C   rO   r   r   )rx   ry   rz   r{   r|   r}   r~   r   r   r   rj   rk   r   r   r   _rocm_aiter_mla_decode_fwd_fake}  s   r   ABAsBsbiasc                 C   s   ddl m} || |||||S )Nr   )gemm_a8w8_CK)r   r   )r   r   r   r   r   r-   r   r   r   r   _rocm_aiter_gemm_a8w8_impl  s   r   c           	      C   s,   | j d }|j d }tj|||| jd}|S Nr   r1   devicer   r8   emptyr   )	r   r   r   r   r   r-   mnYr   r   r   _rocm_aiter_gemm_a8w8_fake  s   

r   c                 C      ddl m} || ||||dS Nr   )gemm_a8w8_blockscaler6   )%aiter.ops.triton.gemm_a8w8_blockscaler   r   r   r   r   r-   r   r   r   r   ,_rocm_aiter_triton_gemm_a8w8_blockscale_impl     r   c                 C   ,   | j d }|j d }tj|||| jd}|S r   r   r   r   r   r   r-   r   r   r   r   r   r   ,_rocm_aiter_triton_gemm_a8w8_blockscale_fake     

r   c                 C   r   r   )r   r   r   r   r   r   %_rocm_aiter_gemm_a8w8_blockscale_impl  r   r   c                 C   r   r   r   r   r   r   r   %_rocm_aiter_gemm_a8w8_blockscale_fake  r   r   xweightvariance_epsilonc                 C   sP   ddl m} |  dkr"| j}| d|d } || ||} | |S || ||S )Nr   )rms_norm   r   )r   r   dimr   reshape)r   r   r   r   x_original_shaper   r   r   _rocm_aiter_rms_norm_impl  s   
r   c                 C   rF   r   r7   r   r   r   r   r   r   _rocm_aiter_rms_norm_fake  s   
r   residualc                 C   s:   ddl m} t|}t| }||| |||| ||fS )Nr   )rmsnorm2d_fwd_with_add)r   r   r8   r9   )r   r   r   r   r   residual_outoutr   r   r   '_rocm_aiter_rmsnorm2d_fwd_with_add_impl  s   

r   c                 C   s   t |}t | }||fS r   r7   )r   r   r   r   r   r   r   r   r   '_rocm_aiter_rmsnorm2d_fwd_with_add_fake  s   

r   epsilonquant_dtypec           	   
   C   sx   dd l }|tjtfv sJ tj| jd dtj| jd}tj| j|| jd}t| }|j	|| |||||dd |||fS Nr   r   r   )use_model_sensitive_rmsnorm)
r   r8   int8	FP8_DTYPEr   r   float32r   r9   #rmsnorm2d_fwd_with_add_dynamicquant)	r   r   r   r   r   
rocm_aitery_scaler   r   r   r   r   0_rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl  s    

r   c                 C   sD   t j| jd dt j| jd}t j| j|| jd}t | }|||fS Nr   r   r   )r8   r   r   r   r   r9   )r   r   r   r   r   r   r   r   r   r   r   0_rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake0  s   

r   c                 C   sh   dd l }|tjtfv sJ tj| jd dtj| jd}tj| j|| jd}|j|| |||dd ||fS r   )	r   r8   r   r   r   r   r   r   rmsnorm2d_fwd_with_dynamicquant)r   r   r   r   r   r   r   r   r   r   ,_rocm_aiter_rmsnorm_fused_dynamic_quant_impl>  s   r   c                 C   s8   t j| jd dt j| jd}t j| j|| jd}||fS r   )r8   r   r   r   r   )r   r   r   r   r   r   r   r   r   ,_rocm_aiter_rmsnorm_fused_dynamic_quant_fakeR  s   r   scalec                 C   s   ddl m} || ||S )Nr   )per_tensor_quant_hip)aiter.ops.quantr   )r   r   r   r   r   r   r   !_rocm_aiter_per_tensor_quant_impl^  s   r   c                 C   s"   t j| |dt jdt j| jdfS )Nr6   r   r   )r8   r9   r   r   r   r   r   r   r   r   r   !_rocm_aiter_per_tensor_quant_fakeh  s   
r   c              	   C   s   ddl m} |tjtfv sJ | j}tj| jt| jd}|d u r3tjg |d d dR tj| jd}||| |d dd dd ||fS )Nr   )dynamic_per_token_scaled_quantr   r   r   F)scale_ubshuffle_scalenum_rowsnum_rows_factor)	r   r   r8   r   r   r   r   r   r   )r   r   r   r   	out_shaper   r   r   r    _rocm_aiter_per_token_quant_implr  s    &	r   c                 C   s@   | j }tj| j t| jdtjg |d d dR tj| jdfS )Nr   r   r   r   r8   r   r   r   r   )r   r   r   r   r   r   r    _rocm_aiter_per_token_quant_fake  s   $r   
group_sizec           
      C   s<   ddl m} || ||d d d |t|d	\\}}}}}	||	|fS Nr   )fused_rms_fp8_group_quant)r   dtype_quantres1 aiter.ops.triton.fused_fp8_quantr   r   )
r   r   r   r   r   r   x_quantx_quant_scales_resr   r   r   1_rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl  s    r   c                 C   sR   | j \}}||| d | f}tj| t| jdtj||jdtj|tj| jdfS )Nr   r   )r   r   r8   r9   r   r   r   r   )r   r   r   r   r   MNscale_shaper   r   r   1_rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake  s   
r   c           	      C   s:   ddl m} || ||d d d |td d	\\}}}}}||fS r   r   )	r   r   r   r   r   r   r   r   r   r   r   r   (_rocm_aiter_rmsnorm_fp8_group_quant_impl  s   r   c                 C   sD   | j \}}||| d | f}tj| t| jdtj|tj| jdfS )Nr   r   r   )r   r   r   r   r   r   r   r   r   r   (_rocm_aiter_rmsnorm_fp8_group_quant_fake  s
   
r   c                 C   sD   | j d | dksJ dddlm}m} ||j}||  tdS )Nr   r   z+Input shape must be divisible by group size)r/   get_hip_quant)r   )r   r   r/   r   	per_1x128
contiguousr   )r   r   r/   r   aiter_per1x128_quantr   r   r    _rocm_aiter_group_fp8_quant_impl  s   
r   c                 C   sL   | j \}}tj||ft| jd}tj||| d | ftj| jd}||fS )Nr   r   r   )r   r   r   r   x_fp8out_bsr   r   r    _rocm_aiter_group_fp8_quant_fake  s   
r   c                 C   s   ddl m} || d|tdS )Nr   )act_mul_and_fp8_group_quantsilu)r3   r   r   )aiter.ops.triton.activationr   r   )r   r   r   r   r   r   ,_rocm_aiter_act_mul_and_fp8_group_quant_impl  s   r   c                 C   sd   | j \}}|d dksJ |d }tj||ft| jd}tj||| d | ftj| jd}||fS )Nr   r   r   r   r   )r   r   r   r   N_halfr   r   r   r   r   ,_rocm_aiter_act_mul_and_fp8_group_quant_fake  s   
r   x_pad_to_multiplec                 C   r   )Nr   )fused_add_rmsnorm_pad)r   )&aiter.ops.triton.fused_add_rmsnorm_padr   )r   r   r   r   r   r   r   r   r   '_rocm_aiter_triton_add_rmsnorm_pad_impl  s   r   c           
      C   sV   | j \}}|dkr|| d | | }n|}tj||f| j| jd}t|}	||	fS r   )r   r8   r   r1   r   r9   )
r   r   r   r   r   r   r   N_outr   r   r   r   r   '_rocm_aiter_triton_add_rmsnorm_pad_fake1  s   

r   c                "   @   sF	  e Zd ZdZejZejZej	Z
ejZejZejZejZejZejZejZejZejZejZejZ e!dd Z"e!e#de$fddZ%e!e#de$fddZ&e!e#de$fd	d
Z'e!e#de$fddZ(e!e#de$fddZ)e!e#de$fddZ*e!e#de$fddZ+e!e#de$fddZ,e!e#de$fddZ-e!e#de$fddZ.e!e#de$fddZ/e!e#de$fddZ0e!e#de$fddZ1e!e#de$fdd Z2e!e#de$fd!d"Z3e4e#dd$d%Z5e4de6fd&d'Z7e4de6fd(d)Z8e4de6fd*d+Z9e4de6fd,d-Z:e4de6fd.d/Z;e4de6fd0d1Z<e4de6fd2d3Z=e4de6fd4d5Z>e4de6fd6d7Z?e4de6fd8d9Z@e4d:eAjBd;eAjBd<eCdeAjBfd=d>ZDe4d:eAjBd?eAjBd;eAjBd<eCdeEeAjBeAjBf f
d@dAZFe4d#eAjGfdBeAjBdCeAjBdDeAjBdEeAjBdFeAjBd#B dGeAjHdeAjBfdHdIZIe4eAjGfdBeAjBdCeAjBdDeAjBdEeAjBdJeJeK dGeAjHdeAjBfdKdLZLe4eAjGfdBeAjBdCeAjBdDeAjBdEeAjBdJeJeK dGeAjHdeAjBfdMdNZMe4	#	O	O	P	#	#	#	#	#	#ddQeAjBdReAjBdSeAjBdTeAjBdUeAjBdVeAjBd#B dWeKdXeKdYe$dZeAjBd#B d[eAjBd#B d\eAjBd#B d]eAjBd#B d^eAjBd#B dGeAjHd#B deAjBf d_d`ZNe4	#	#	#	#	P	#	#	OddQeAjBdReAjBdSeAjBdaeAjBdUeAjBdbeAjBd#B dceAjBd#B ddeAjBd#B deeAjBd#B dfe$dgeAjBd#B dVeAjBd#B dWeKdeAjBfdhdiZOe4daeAjBdjeAjBdkeAjBdleAjBdme$deEeAjBdnf fdodpZPe4daeAjBdjeAjBdkeAjBdleAjBdme$deEeAjBdnf fdqdrZQe4	sddleAjBdteAjBdaeAjBdUeAjBdueKdveKdwe$dxeCdd#fdydzZRe4	{	sddleAjBdaeAjBdUeAjBdueKdveKdwe$d|eSdxeCdd#fd}d~ZTe4	#	#	#		#	#ddeAjBdeAjBdeAjBdeCdeAjBdeKdeAjBd#B deAjBd#B deAjBd#B deCdeAjBd#B deAjBd#B fddZUe4	#dd:eAjBdeAjHdeAjBd#B deEeAjBeAjBf fddZVe4	#dd:eAjBdeAjHdeAjBd#B deEeAjBeAjBf fddZWe4eAjXd#fd:eAjBd;eAjBdeAjBdeAjHd#B deAjBd#B deAjBfddZYe4deAjBdeAjBdeAjBdeAjBdeKdeKde$fddZZe4	P	P	#ddeAjBdeAjBdeAjBdeAjBde$d#B de$d#B deAjBd#B deAjBfddZ[e4dd#eAjXd#d#dPd#fdeAjBdeAjBdeAjBdeKdFeAjBd#B deAjHd#B deKd#B deAjBd#B de$d#B de\d#B deAjBfddZ]e4	ddeAjBdeKdeEeAjBeAjBf fddZ^e4deKdeKde$fddZ_e4deKdeKde$fddZ`e4	ddeAjBdeEeKeKf deAjBfddZae4dddeAjBdeEeKeKf deEeAjBdnf fddńZbe4	#		#	P	#	#	P	#ddeAjBdeAjBdeAjBdeAjBdeAjBdeKdeKdeKd#B deCdeCd#B de$deEeKeKf d#B deAjBd#B de$deAjBd#B fddԄZce4deAjBdeAjBdeAjBdeAjBdeAjBdeKdeAjBdeAjBdeAjBfdd߄Zdd#S )rocm_aiter_opsa  ROCm AITER operations wrapper for AMD GPU acceleration in vLLM.

    This class centralizes the import and registration of AITER ops,
    and provides a unified interface for checking if AITER is enabled.
    Operations are only available on supported gfx9
    architectures when aiter is installed.

    The class uses environment variables to control which features are enabled,
    allowing fine-grained control over which AITER optimizations are used.

    Environment Variables:
        VLLM_ROCM_USE_AITER: Main toggle for all AITER operations.
        VLLM_ROCM_USE_AITER_LINEAR: Controls GEMM and quantization ops.
        VLLM_ROCM_USE_AITER_RMSNORM: Controls RMSNorm operations.
        VLLM_ROCM_USE_AITER_MOE: Controls MoE (Mixture of Experts) ops.
        VLLM_ROCM_USE_AITER_MLA: Controls MLA (Multi-head Latent Attention) ops.
        VLLM_ROCM_USE_AITER_MHA: Controls MHA ops including flash_attn_varlen.
        VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: Controls Triton unified attention.
        VLLM_ROCM_USE_AITER_FP8BMM: Controls FP8 batched matrix multiply.
        VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: Controls FP4 assembly GEMM.
        VLLM_ROCM_USE_AITER_TRITON_ROPE: Controls Triton rotary embeddings.
        VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: Controls shared expert fusion.
        VLLM_ROCM_USE_AITER_TRITON_GEMM: Controls Triton unquantized GEMM.

    Note:
        The environment variables are assigned when the module is imported,
        so you can't change the environment variables after the module is imported.
        This is done out of performance consideration. Accessing environment variables
        is expensive as described in issue https://github.com/vllm-project/vllm/issues/17067
        so we don't want to do it repeatedly, especially in the hot path (the forward pass).
        You can call the refresh_env_variables() function to reload the env variables
        after monkey patching the env variables in the unit test.

    Check Functions:
        All check functions (is_*_enabled) are decorated with @if_aiter_supported,
        which verifies: (1) platform is ROCm, (2) device arch is gfx9, and
        (3) aiter library is installed. The check function then also verifies
        the corresponding environment variable is enabled.
        i.e.                                             ___
        is_enabled() == current_platform.is_rocm() and      |     checked by
                        current_platform.is_on_gfx9() and   | @if_aiter_supported
                        IS_AITER_FOUND and   _______________|
                        cls._AITER_ENABLED   -----> Check by the logic in `is_enabled()`

    Example:
        from vllm._aiter_ops import rocm_aiter_ops

        # Check if aiter is enabled before using operations
        if rocm_aiter_ops.is_enabled():
            result = rocm_aiter_ops.rms_norm(x, weight, epsilon)

    Operations:
        - RMS normalization: rms_norm, rms_norm2d_with_add
        - GEMM operations: gemm_a8w8, gemm_a8w8_blockscale
        - Fused MoE: fused_moe, asm_moe_tkw1
        - Routing: topk_softmax, biased_grouped_topk, grouped_topk
        - MLA decode: mla_decode_fwd
        - Quantization: per_tensor_quant, per_token_quant, group_fp8_quant
        - Triton ops: triton_rotary_embed, triton_fp8_bmm, triton_gemm_a8w8_blockscale
    c                 C   st   t j| _t j| _t j| _t j| _t j	| _
t j| _t j| _t j| _t j| _t j| _t j| _t j| _t j| _t j| _dS )aT  
        Since the environment variables are assigned when the module is imported,
        This is a helper function to reload all the env variables from
        the environment variables.
        for example, after monkey patching the env variables in the unit test,
        you can call this function to reload the env variables.
        N)envsVLLM_ROCM_USE_AITER_AITER_ENABLEDVLLM_ROCM_USE_AITER_LINEAR_LINEAR_ENABLEDVLLM_ROCM_USE_AITER_RMSNORM_RMSNORM_ENABLEDVLLM_ROCM_USE_AITER_MOE_FMOE_ENABLEDVLLM_ROCM_USE_AITER_MLA_MLA_ENABLEDVLLM_ROCM_USE_AITER_MHA_MHA_ENABLED!VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT_SHUFFLE_KV_CACHE_ENABLED%VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION_TRITON_UNIFIED_ATTN_ENABLEDVLLM_ROCM_USE_AITER_FP8BMM_FP8BMM_ENABLEDVLLM_ROCM_USE_AITER_FP4BMM_FP4BMM_ENABLED VLLM_ROCM_USE_AITER_FP4_ASM_GEMM_FP4_GEMM_DYNAMIC_QUANT_ASMVLLM_ROCM_USE_AITER_TRITON_ROPE_TRITON_ROTARY_EMBED)VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS_MOE_SHARED_EXPERTS_ENABLEDVLLM_ROCM_USE_AITER_TRITON_GEMM_TRITON_UNQUANT_GEMMclsr   r   r   refresh_env_variables  s   	z$rocm_aiter_ops.refresh_env_variablesr   c                 C      | j S r   )r   r  r   r   r   
is_enabled     zrocm_aiter_ops.is_enabledc                 C      | j o| jS r   )r   r  r  r   r   r   is_linear_enabled     z rocm_aiter_ops.is_linear_enabledc                 C   s   |   S r   )r!  r  r   r   r   is_linear_fp8_enabled  s   z$rocm_aiter_ops.is_linear_fp8_enabledc                 C   r   r   )r   r  r  r   r   r   is_rmsnorm_enabled  r"  z!rocm_aiter_ops.is_rmsnorm_enabledc                 C   r   r   )r   r  r  r   r   r   is_fused_moe_enabled  r"  z#rocm_aiter_ops.is_fused_moe_enabledc                 C   s   |   o| jS r   )r%  r  r  r   r   r   $is_fusion_moe_shared_experts_enabled  s   z3rocm_aiter_ops.is_fusion_moe_shared_experts_enabledc                 C   r   r   )r   r  r  r   r   r   is_mla_enabled  r"  zrocm_aiter_ops.is_mla_enabledc                 C   r   r   )r   r	  r  r   r   r   is_mha_enabled  r"  zrocm_aiter_ops.is_mha_enabledc                 C   r  r   )r  r  r   r   r   is_shuffle_kv_cache_enabled  r  z*rocm_aiter_ops.is_shuffle_kv_cache_enabledc                 C   r   r   )r   r  r  r   r   r   is_triton_unified_attn_enabled  r"  z-rocm_aiter_ops.is_triton_unified_attn_enabledc                 C   r   r   )r   r  r  r   r   r   is_fp8bmm_enabled  r"  z rocm_aiter_ops.is_fp8bmm_enabledc                 C   r   r   )r   r  r  r   r   r   is_fp4bmm_enabled  r"  z rocm_aiter_ops.is_fp4bmm_enabledc                 C   r   r   )r   r  r  r   r   r   %is_asm_fp4_gemm_dynamic_quant_enabled  r"  z4rocm_aiter_ops.is_asm_fp4_gemm_dynamic_quant_enabledc                 C   r   r   )r   r  r  r   r   r   is_triton_rotary_embed_enabled  r"  z-rocm_aiter_ops.is_triton_rotary_embed_enabledc                 C   r   r   )r   r  r  r   r   r   is_triton_gemm_enabled  r"  z%rocm_aiter_ops.is_triton_gemm_enabledNc                   C   s  t stdtg ttjd tdtg ttjd tdtg dt	tjd tdt
ddgttjd td	tdd
gttjd tdtdd
gttjd tdtdgtd tdtg ttjd tdttd tdttd tdttd tdtttjd tdtttjd tdtttjd tdt t!d tdt"t#d tdt$t%d tdt&t'tjd tdt(t)d tdt*g t+tjd tdt,t-tjd tdt.d gt/tjd d!a d S d S )"Nrocm_aiter_asm_moe_tkw1)op_nameop_funcmutates_args	fake_impldispatch_keyrocm_aiter_fused_moerocm_aiter_topk_softmax)r<   rI   rJ   rocm_aiter_topk_sigmoidr<   rI   rocm_aiter_biased_grouped_topkr#   rocm_aiter_grouped_topkrocm_aiter_mla_decode_fwdrz   )r1  r2  r3  r4  rocm_aiter_gemm_a8w8&rocm_aiter_triton_gemm_a8w8_blockscale)r1  r2  r4  rocm_aiter_gemm_a8w8_blockscalerocm_aiter_rms_norm!rocm_aiter_rmsnorm2d_fwd_with_add)r1  r2  r4  r5  &rocm_aiter_rmsnorm_fused_dynamic_quant*rocm_aiter_rmsnorm_fused_add_dynamic_quant"rocm_aiter_rmsnorm_fp8_group_quant+rocm_aiter_rmsnorm_with_add_fp8_group_quant&rocm_aiter_act_mul_and_fp8_group_quant!rocm_aiter_triton_add_rmsnorm_padrocm_aiter_group_fp8_quantrocm_aiter_per_tensor_quantrocm_aiter_per_token_quantr   topk_indices_bufferT)0_OPS_REGISTEREDr   rE   rH   r   r5  r5   r;   rN   rQ   rS   rT   r\   r^   rd   rf   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   register_ops_once  s   z rocm_aiter_ops.register_ops_oncec                   C      t jjjjS r   )r8   opsvllmr@  defaultr   r   r   r   get_rmsnorm_fused_add_op     z'rocm_aiter_ops.get_rmsnorm_fused_add_opc                   C   rM  r   )r8   rN  rO  r?  rP  r   r   r   r   get_rmsnorm_op  rR  zrocm_aiter_ops.get_rmsnorm_opc                   C   rM  r   )r8   rN  rO  rB  rP  r   r   r   r   &get_rmsnorm_fused_add_dynamic_quant_op  rR  z5rocm_aiter_ops.get_rmsnorm_fused_add_dynamic_quant_opc                   C   rM  r   )r8   rN  rO  rA  rP  r   r   r   r   "get_rmsnorm_fused_dynamic_quant_op  rR  z1rocm_aiter_ops.get_rmsnorm_fused_dynamic_quant_opc                   C   rM  r   )r8   rN  rO  rC  rP  r   r   r   r    get_rmsnorm_group_fused_quant_op  rR  z/rocm_aiter_ops.get_rmsnorm_group_fused_quant_opc                   C   rM  r   )r8   rN  rO  rD  rP  r   r   r   r   $get_rmsnorm_group_add_fused_quant_op  rR  z3rocm_aiter_ops.get_rmsnorm_group_add_fused_quant_opc                   C   rM  r   )r8   rN  rO  rI  rP  r   r   r   r   get_per_token_quant_op  rR  z%rocm_aiter_ops.get_per_token_quant_opc                   C   rM  r   )r8   rN  rO  rG  rP  r   r   r   r   get_group_quant_op  rR  z!rocm_aiter_ops.get_group_quant_opc                   C   rM  r   )r8   rN  rO  rE  rP  r   r   r   r   $get_act_mul_fused_fp8_group_quant_op  rR  z3rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_opc                   C   rM  r   )r8   rN  rO  rF  rP  r   r   r   r   get_triton_add_rmsnorm_pad_op  rR  z,rocm_aiter_ops.get_triton_add_rmsnorm_pad_opr   r   r   c                 C      t jj| ||S r   )r8   rN  rO  r?  r   r   r   r   r     s   zrocm_aiter_ops.rms_normr   c                 C   s   t jj| |||S r   )r8   rN  rO  r@  )r   r   r   r   r   r   r   rms_norm2d_with_add  s   z"rocm_aiter_ops.rms_norm2d_with_addr   r   r   r   r   r-   c                 C   s   t jj| |||||S r   )r8   rN  rO  r<  )r   r   r   r   r   r-   r   r   r   	gemm_a8w8  s   	zrocm_aiter_ops.gemm_a8w8
block_sizec                 C      t jj| ||||S r   )r8   rN  rO  r=  r   r   r   r   r_  r-   r   r   r   triton_gemm_a8w8_blockscale     	
z*rocm_aiter_ops.triton_gemm_a8w8_blockscalec                 C   r`  r   )r8   rN  rO  r>  ra  r   r   r   r     rc  z#rocm_aiter_ops.gemm_a8w8_blockscaler   Fr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   c                 C   s*   t jj| |||||||||	|
||||S r   )r8   rN  rO  r6  r:   r   r   r   r0     s"   zrocm_aiter_ops.fused_moer<   r=   r>   r?   r@   rA   rB   c                 C   s&   t jj| |||||||||	|
||S r   )r8   rN  rO  r0  rG   r   r   r   rC   !  s   zrocm_aiter_ops.asm_moe_tkw1rI   rJ   rK   rL   .c                 C   s   t jj| |||| | |fS r   )r8   rN  rO  r7  rP   r   r   r   rM   A  s   
zrocm_aiter_ops.topk_softmaxc                 C   s   t jj| || | |fS r   )r8   rN  rO  r8  rP   r   r   r   rR   N  s   zrocm_aiter_ops.topk_sigmoidrU   rV   rW   rX   rY   rZ   c              
   C       t jj| ||||||| d S r   )r8   rN  rO  r9  r]   r   r   r   r[   [     z"rocm_aiter_ops.biased_grouped_topkr`   ra   c              
   C   rd  r   )r8   rN  rO  r:  re   r   r   r   rb   q  re  zrocm_aiter_ops.grouped_topkrw   rx   ry   rz   r   r{   r|   r}   r~   r   r   rj   rk   c                 C   s<   t jjj| |ddd| jd ||||||||	|
|d d S )Nr   r   )r   r   rj   rk   )r8   rN  rO  r;  r   r   )rx   ry   rz   r   r{   r|   r}   r~   r   r   rj   rk   r   r   r   ri     s   
zrocm_aiter_ops.mla_decode_fwdr   r   c                 C   r\  r   )r8   rN  rO  rH  r   r   r   r   per_tensor_quant     zrocm_aiter_ops.per_tensor_quantc                 C   r\  r   )r8   rN  rO  rI  r   r   r   r   per_token_quant  rg  zrocm_aiter_ops.per_token_quantweight_scale	out_dtypex_scalesc           
      C   sn   ddl m} ddlm} |d u r|| \}}n| }|}tj|jd |jd |j|d}	|||||j||	 |	S )Nr   )gemm_afp4wfp4)dynamic_mxfp4_quant)r   r1   )	aiter.ops.triton.gemm_afp4wfp4rl  aiter.ops.triton.quantrm  r8   r   r   r   T)
r   r   ri  rj  rk  rl  rm  x_qx_syr   r   r   triton_fp4_gemm_dynamic_qaunt  s   z,rocm_aiter_ops.triton_fp4_gemm_dynamic_qaunt	positionsquerykeycos_sin_cache	head_size
rotary_dimis_neox_stylec              
   C   s   ddl m} |  }|jddd\}	}
|j}|j}|rdnd}||d|}||d|}|dd |f }|dd |f }| j|jd d  } ||||	|
| |dd	d
 ||}||}d S )Nr   )(rope_cached_thd_positions_2c_fwd_inplacer   r   )r   r   .TF)reuse_freqs_front_part
nope_first)aiter.ops.triton.roper|  numelchunkr   r   )ru  rv  rw  rx  ry  rz  r{  r|  
num_tokenscossinquery_shape	key_shaperotate_stylequery_key_r   r   r   triton_rotary_embed  s.   


z"rocm_aiter_ops.triton_rotary_embedXWw_scaler   transpose_bmprequantr   c              	   C   s"   ddl m} || ||||||dS )Nr   )batched_gemm_a16wfp4)rs  r  r  r   )%aiter.ops.triton.batched_gemm_a16wfp4r  )r  r  r  r   r  r  r   r  r   r   r   r    s   z#rocm_aiter_ops.batched_gemm_a16wfp4   WQr   r1   splitKYQconfigc
                 C   s(   ddl m}
 |
| |||||||||	d
S )Nr   )Gbatched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant)r   r   r1   r  r  r  r  )Xaiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quantr  )r  r  r  r   r   r1   r  r  r  r  aiter_triton_fp8_bmmr   r   r   triton_fp8_bmm  s   zrocm_aiter_ops.triton_fp8_bmminput_2dc                 C   s    |dksJ dt jj| |S )Nr  zGroup size must be 128)r8   rN  rO  rG  )r  r   r   r   r   group_fp8_quant'  s   zrocm_aiter_ops.group_fp8_quantr   kc                 C      | |fdv S )N))       )i@     )i   i   )   r  )   r  )i   r  )i   r  )r     )r     r  r  )r  r  r   r   r  r   r   r   is_triton_gemm_w8a8_tuned/  rR  z(rocm_aiter_ops.is_triton_gemm_w8a8_tunedc                 C   r  )N))r  r  )i   r  ) @  i   )i  r  )i   r  )r  r  )i 
  r  )i (  r  )r  r  )r   p  )r  r  )i H  r  r  )r  r  )i   r  )r  r  )r  r  ) 8  r  )r  r  )r  i   r   r  r   r   r   &is_triton_gemm_afp4wfp4_presh_ws_tuned?  rR  z5rocm_aiter_ops.is_triton_gemm_afp4wfp4_presh_ws_tuned   r  tensorlayoutc                 C   s   ddl m} |||dS )Nr   shuffle_weightr  )aiter.ops.shuffler  )selfr  r  r  r   r   r   r  X  s   zrocm_aiter_ops.shuffle_weightr  tensorsc                    s$   ddl m t fdd|D S )a  
        Applies shuffle_weight function from AITER to each
        input tensor and returns them.

        Rearranges (shuffles) the input tensor/s
        into a specified block layout for optimized computation.

        Args:
            *tensors: Variable number of torch.Tensor objects.
            layout: A pair of integers specifying the block sizes used to divide
                the tensors during shuffling. Default is (16, 16).

        Returns:
        A Tuple of shuffled tensors.
        r   r  c                 3   s    | ]	}| d V  qdS )r  Nr   ).0r  r  r  r   r   	<genexpr>u  s    z1rocm_aiter_ops.shuffle_weights.<locals>.<genexpr>)r  r  tuple)r  r  r   r  r   shuffle_weights`  s   zrocm_aiter_ops.shuffle_weightsvcu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kmin_seqlen_q	dropout_psoftmax_scalecausalwindow_sizealibi_slopes
return_lser   c                 C   s2   ddl m} || |||||||||	|
||||dS )aE  
        Flash attention with variable length sequences.

        This function is NOT wrapped with @is_aiter_supported decorator
        to allow explicit backend selection via attention_config to work
        even when VLLM_ROCM_USE_AITER=0.

        Note: This performs lazy import of aiter.flash_attn_varlen_func
        r   )flash_attn_varlen_func)rx   r  r  r  r  r  r  r  r  r  r  r  r  r  r   )r   r  )rx   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r   r   r   r  w  s$   z%rocm_aiter_ops.flash_attn_varlen_funcQKVblock_tablescontext_lensblock_tables_stride0K_QScaleV_QScaleout_c	           
      C   s&   ddl m}	 |	| ||||||||d	S )a=  
        Paged attention forward pass using assembly kernel.

        This function is NOT wrapped with @is_aiter_supported decorator
        to allow explicit backend selection via attention_config to work
        even when VLLM_ROCM_USE_AITER=0.

        Note: This performs lazy import of aiter.pa_fwd_asm
        r   )
pa_fwd_asm)	r  r  r  r  r  r  r  r  r  )r   r  )
r  r  r  r  r  r  r  r  r  r  r   r   r   r    s   zrocm_aiter_ops.pa_fwd_asm)r   N
Nr   r   FNNNNNNNNNNFNNr   rU   r`   rU   )NNNrw   NNr   )FFN)r  )r  )Nrw   NFNNFN)e__name__
__module____qualname____doc__r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  classmethodr  r   boolr  r!  r#  r$  r%  r&  r'  r(  r)  r*  r+  r,  r-  r.  r/  staticmethodrL  r   rQ  rS  rT  rU  rV  rW  rX  rY  rZ  r[  r8   Tensorfloatr   r  r]  float16r1   r^  listintrb  r   r0   rC   rM   rR   r[   strrb   ri   rf  rh  bfloat16rt  r  r  dictr  r  r  r  r  r  r  r  r   r   r   r   r   F  s>   >
 #

	
#	
				
#	


		
.	r   r  r  r  r  )NNNrU   rw   NNr   )Mr   collections.abcr   r8   
torch._opsr   	vllm.envsr   vllm.platformsr   vllm.utils.torch_utilsr   +vllm.v1.attention.ops.rocm_aiter_mla_sparser   r   	fp8_dtyper   r  r   r   r   r   r  r  r1   r5   r;   rE   rH   rN   rQ   rS   rT   r  r\   r^   r  rd   rf   rg   __annotations__rv   r   r   r  r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rK  r   rL  r   r   r   r   <module>   sv  

	

0	

	

+	







	
 	
	
 	
"	

-	






























       