o
    ,i                  !   @   sz
  U d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZmZ e	 Zdefdd	Ze Zdefd
dZdedefddZe r[d dlmZ ejZ		 	 							ddejdejdejdejdejdejdB dedededejdB dejdB dejdB dejdB dejdB dejdB dejf d d!Z		 	 							ddejdejdejdejdejdejdB dedededejdB dejdB dejdB dejdB dejdB dejdB dejf d"d#Z								 ddejdejdejd$ejdejd%ejdB d&ejdB d'ejdB d(ejdB d)ed*ejdB dejdB dedejfd+d,Z 								 ddejdejdejd$ejdejd%ejdB d&ejdB d'ejdB d(ejdB d)ed*ejdB dejdB dedejfd-d.Z!d$ejd/ejd0ejd1ejd2eddfd3d4Z"d$ejd/ejd0ejd1ejd2eddfd5d6Z#d$ejd/ejd1ejddfd7d8Z$d$ejd/ejd1ejddfd9d:Z%	;dd1ejd<ejd$ejdejd=ed>ed?ed@e&ddfdAdBZ'	;dd1ejd<ejd$ejdejd=ed>ed?ed@e&ddfdCdDZ(	E	;dd1ejd$ejdejd=ed>ed?edFe)d@e&ddfdGdHZ*	E	;dd1ejd$ejdejd=ed>ed?edFe)d@e&ddfdIdJZ+da,edB e-dK< defdLdMZ.				;	N		ddOejdPejdQejdRejdSedTejdB dUejdB dVejdB dWe&dXe&dYejdB dZejdB ddfd[d\Z/				;	N		ddOejdPejdQejdRejdSedTejdB dUejdB dVejdB dWe&dXe&dYejdB dZejdB ddfd]d^Z0dej1fd_ejd`ejdaejdbejdcejdB dejdejfdddeZ2dej1fd_ejd`ejdaejdbejdcejdB dejdejfdfdgZ3ej1fd_ejd`ejdaejdbejdejdejfdhdiZ4ej1fd_ejd`ejdaejdbejdejdejfdjdkZ5ej1fd_ejd`ejdaejdbejdejdejfdldmZ6ej1fd_ejd`ejdaejdbejdejdejfdndoZ7dpejdqejdre&dejfdsdtZ8dpejdqejdre&dejfdudvZ9dpejdwejdqejdre&de:ejejf f
dxdyZ;dpejdwejdqejdre&de:ejejf f
dzd{Z<dpejdwejdqejd|e&d}ejde:ejejejf fd~dZ=dpejdwejdqejd|e&d}ejde:ejejejf fddZ>dpejdqejd|e&d}ejde:ejejf f
ddZ?dpejdqejd|e&d}ejde:ejejf f
ddZ@	ddpejd}ejdejdB de:ejejf fddZA	ddpejd}ejdejdB de:ejejf fddZB	ddpejd}ejdejdB de:ejejf fddZC	ddpejd}ejdejdB de:ejejf fddZDdpejdwejdqejdre&dede:ejejejf fddZEdpejdwejdqejdre&dede:ejejejf fddZFdpejdqejdre&dede:ejejf f
ddZGdpejdqejdre&dede:ejejf f
ddZHdpejdede:ejejf fddZIdpejdede:ejejf fddZJdpejdede:ejejf fddZKdpejdede:ejejf fddZLdaMG dd dZNeNO  dS )    N)Callable)
OpOverload)current_platform)direct_register_custom_opis_torch_equal_or_newer)rocm_aiter_sparse_attn_indexer#rocm_aiter_sparse_attn_indexer_fakereturnc                  C   s   ddl m}  | dd uS )Nr   	find_specaiter)importlib.utilr   r
    r   L/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/_aiter_ops.pyis_aiter_found   s   r   c                  C   s"   t  rtrddlm}  |  S dS )Nr   on_gfx9F)r   is_rocmIS_AITER_FOUNDvllm.platforms.rocmr   r   r   r   r   is_aiter_found_and_supported!   s   r   funcc                    s   t   fdd}|S )zdDecorator that only executes the function if
    ROCm AITER package is supported on gfx9 archs.
    c                     s   t  r
 | i |S d S N)r   )argskwargsr   r   r   wrapper.   s   z#if_aiter_supported.<locals>.wrapper)	functoolswraps)r   r   r   r   r   if_aiter_supported)   s   r   )dtypesFhidden_statesw1w2topk_weighttopk_idsexpert_maskactivation_methodquant_methoddoweight_stage1w1_scalew2_scalea1_scalea2_scalenum_local_tokensoutput_dtypec                 C   sR   ddl m}m} ddlm} ||}||}|| |||||||||	|
||||dS )Nr   )ActivationType	QuantType)	fused_moe)r.   dtype)r   r0   r1   aiter.fused_moer2   )r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   
activation
quant_typer   r   r   _rocm_aiter_fused_moe_implD   s*   r7   c                 C   s    |d urt j| |dS t | S )Nr3   torch
empty_liker!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r   r   r   _rocm_aiter_fused_moe_faken   s   
r=   topk_weights	fc1_scale	fc2_scalefc1_smooth_scalefc2_smooth_scalea16per_tensor_quant_scalec                 C   sB   ddl m} ddlm} ||}|| |||||||||	|
||dS )Nr   )r0   )asm_moe_tkw1)r?   r@   rA   rB   rC   rD   r&   r5   )r   r0   aiter.fused_moe_bf16_asmrE   )r!   r"   r#   r>   r%   r?   r@   rA   rB   rC   rD   r&   r'   r0   rE   r5   r   r   r   _rocm_aiter_asm_moe_tkw1_impl   s$   rG   c                 C   
   t | S r   r9   r!   r"   r#   r>   r%   r?   r@   rA   rB   rC   rD   r&   r'   r   r   r   _rocm_aiter_asm_moe_tkw1_fake   s   
rJ   topk_indicestoken_expert_indicesgating_outputrenormalizec                 C   s    ddl m} || |||| d S )Nr   )topk_softmax)r   rO   )r>   rK   rL   rM   rN   rO   r   r   r   _rocm_aiter_topk_softmax_impl   s   
rP   c                 C      d S r   r   r>   rK   rL   rM   rN   r   r   r   _rocm_aiter_topk_softmax_fake   s   rS   c                 C   s   ddl m} || || d S )Nr   )topk_sigmoid)r   rT   )r>   rK   rM   rT   r   r   r   _rocm_aiter_topk_sigmoid_impl   s   rU   c                 C   rQ   r   r   )r>   rK   rM   r   r   r   _rocm_aiter_topk_sigmoid_fake   s   rV         ?correction_biasnum_expert_group
topk_groupneed_renormrouted_scaling_factorc           	   	   C   s&   ddl m} || ||||||| d S )Nr   )biased_grouped_topk)r   r]   )	rM   rX   r>   r%   rY   rZ   r[   r\   r]   r   r   r   $_rocm_aiter_biased_grouped_topk_impl   s   
r^   c                 C   rQ   r   r   rM   rX   r>   r%   rY   rZ   r[   r\   r   r   r   $_rocm_aiter_biased_grouped_topk_fake      
r`   softmaxscoring_funcc           
   	   C   s.   |dk}ddl m}	 |	| ||||||| d S )Nrb   r   )grouped_topk)r   rd   )
rM   r>   r%   rY   rZ   r[   rc   r\   
is_softmaxrd   r   r   r   _rocm_aiter_grouped_topk_impl
  s   
rf   c                 C   rQ   r   r   rM   r>   r%   rY   rZ   r[   rc   r\   r   r   r   _rocm_aiter_grouped_topk_fake#  ra   rh   _AITER_MLA_SUPPORTS_FP8c                  C   sh   t du r2zddl} ddlm} | |}d|jv od|jv a W t S  tttt	t
fy1   da Y t S w t S )zKCheck if aiter.mla.mla_decode_fwd supports q_scale and kv_scale parameters.Nr   mla_decode_fwdq_scalekv_scaleF)ri   inspect	aiter.mlark   	signature
parametersImportErrorModuleNotFoundErrorAttributeError
ValueError	TypeError)rn   rk   sigr   r   r   _check_aiter_mla_fp8_support4  s&   

rx           q	kv_buffero	qo_indptrmax_seqlen_qo	kv_indptr
kv_indiceskv_last_page_lenssm_scale	logit_caprl   rm   c              	   C   s`   ddl m} ||	d}t r|
|d< ||d< || |ddd| jd ||||||fi | d S )Nr   rj   )r   r   rl   rm      )ro   rk   rx   viewshape)rz   r{   r|   r}   r~   r   r   r   r   r   rl   rm   rk   r   r   r   r   _rocm_aiter_mla_decode_fwd_implP  s&   	
r   c                 C   rQ   r   r   )rz   r{   r|   r}   r~   r   r   r   r   r   rl   rm   r   r   r   _rocm_aiter_mla_decode_fwd_fakew  s   r   ABAsBsbiasc                 C   s   ddl m} || |||||S )Nr   )gemm_a8w8_CK)r   r   )r   r   r   r   r   r/   r   r   r   r   _rocm_aiter_gemm_a8w8_impl  s   r   c           	      C   s,   | j d }|j d }tj|||| jd}|S Nr   r3   devicer   r:   emptyr   )	r   r   r   r   r   r/   mnYr   r   r   _rocm_aiter_gemm_a8w8_fake  s   

r   c                 C      ddl m} || ||||dS Nr   )gemm_a8w8_blockscaler8   )%aiter.ops.triton.gemm_a8w8_blockscaler   r   r   r   r   r/   r   r   r   r   ,_rocm_aiter_triton_gemm_a8w8_blockscale_impl     r   c                 C   ,   | j d }|j d }tj|||| jd}|S r   r   r   r   r   r   r/   r   r   r   r   r   r   ,_rocm_aiter_triton_gemm_a8w8_blockscale_fake     

r   c                 C   r   r   )r   r   r   r   r   r   %_rocm_aiter_gemm_a8w8_blockscale_impl  r   r   c                 C   r   r   r   r   r   r   r   %_rocm_aiter_gemm_a8w8_blockscale_fake  r   r   xweightvariance_epsilonc                 C   sP   ddl m} |  dkr"| j}| d|d } || ||} | |S || ||S )Nr   )rms_norm   r   )r   r   dimr   reshape)r   r   r   r   x_original_shaper   r   r   _rocm_aiter_rms_norm_impl  s   
r   c                 C   rH   r   r9   r   r   r   r   r   r   _rocm_aiter_rms_norm_fake  s   
r   residualc                 C   s:   ddl m} t|}t| }||| |||| ||fS )Nr   )rmsnorm2d_fwd_with_add)r   r   r:   r;   )r   r   r   r   r   residual_outoutr   r   r   '_rocm_aiter_rmsnorm2d_fwd_with_add_impl  s   

r   c                 C   s   t |}t | }||fS r   r9   )r   r   r   r   r   r   r   r   r   '_rocm_aiter_rmsnorm2d_fwd_with_add_fake  s   

r   epsilonquant_dtypec           	   
   C   sx   dd l }|tjtfv sJ tj| jd dtj| jd}tj| j|| jd}t| }|j	|| |||||dd |||fS Nr   r   r   )use_model_sensitive_rmsnorm)
r   r:   int8
_FP8_DTYPEr   r   float32r   r;   #rmsnorm2d_fwd_with_add_dynamicquant)	r   r   r   r   r   
rocm_aitery_scaler   r   r   r   r   0_rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl  s    

r   c                 C   sD   t j| jd dt j| jd}t j| j|| jd}t | }|||fS Nr   r   r   )r:   r   r   r   r   r;   )r   r   r   r   r   r   r   r   r   r   r   0_rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake*  s   

r   c                 C   sh   dd l }|tjtfv sJ tj| jd dtj| jd}tj| j|| jd}|j|| |||dd ||fS r   )	r   r:   r   r   r   r   r   r   rmsnorm2d_fwd_with_dynamicquant)r   r   r   r   r   r   r   r   r   r   ,_rocm_aiter_rmsnorm_fused_dynamic_quant_impl8  s   r   c                 C   s8   t j| jd dt j| jd}t j| j|| jd}||fS r   )r:   r   r   r   r   )r   r   r   r   r   r   r   r   r   ,_rocm_aiter_rmsnorm_fused_dynamic_quant_fakeL  s   r   scalec                 C   s   ddl m} || ||S )Nr   )per_tensor_quant_hip)aiter.ops.quantr   )r   r   r   r   r   r   r   !_rocm_aiter_per_tensor_quant_implX  s   r   c                 C   s"   t j| |dt jdt j| jdfS )Nr8   r   r   )r:   r;   r   r   r   r   r   r   r   r   r   !_rocm_aiter_per_tensor_quant_fakeb  s   
r   c              	   C   s   ddl m} |tjtfv sJ | j}tj| jt| jd}|d u r3tjg |d d dR tj| jd}||| |d dd dd ||fS )Nr   )dynamic_per_token_scaled_quantr   r   r   F)scale_ubshuffle_scalenum_rowsnum_rows_factor)	r   r   r:   r   r   r   r   r   r   )r   r   r   r   	out_shaper   r   r   r    _rocm_aiter_per_token_quant_impll  s    &	r   c                 C   s@   | j }tj| j t| jdtjg |d d dR tj| jdfS )Nr   r   r   )r   r:   r   r   r   r   )r   r   r   r   r   r   r    _rocm_aiter_per_token_quant_fake  s   $r   
group_sizec           
      C   s<   ddl m} || ||d d d |t|d	\\}}}}}	||	|fS Nr   )fused_rms_fp8_group_quant)r   dtype_quantres1 aiter.ops.triton.fused_fp8_quantr   AITER_FP8_DTYPE)
r   r   r   r   r   r   x_quantx_quant_scales_resr   r   r   1_rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl  s    r   c                 C   sR   | j \}}||| d | f}tj| t| jdtj||jdtj|tj| jdfS )Nr   r   )r   r   r:   r;   r   r   r   r   )r   r   r   r   r   MNscale_shaper   r   r   1_rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake  s   
r   c           	      C   s:   ddl m} || ||d d d |td d	\\}}}}}||fS r   r   )	r   r   r   r   r   r   r   r   r   r   r   r   (_rocm_aiter_rmsnorm_fp8_group_quant_impl  s   r   c                 C   sD   | j \}}||| d | f}tj| t| jdtj|tj| jdfS )Nr   r   r   )r   r   r   r   r   r   r   r   r   r   (_rocm_aiter_rmsnorm_fp8_group_quant_fake  s
   
r   c                 C   sD   | j d | dksJ dddlm}m} ||j}||  tdS )Nr   r   z+Input shape must be divisible by group size)r1   get_hip_quant)r   )r   r   r1   r   	per_1x128
contiguousr   )r   r   r1   r   aiter_per1x128_quantr   r   r    _rocm_aiter_group_fp8_quant_impl  s   
r   c                 C   sL   | j \}}tj||ft| jd}tj||| d | ftj| jd}||fS )Nr   r   r   r:   r   r   r   r   )r   r   r   r   x_fp8out_bsr   r   r    _rocm_aiter_group_fp8_quant_fake  s   
r   c                 C   s   ddl m} || d|tdS )Nr   )act_mul_and_fp8_group_quantsilu)r5   r   r   )aiter.ops.triton.activationr   r   )r   r   r   r   r   r   ,_rocm_aiter_act_mul_and_fp8_group_quant_impl  s   r   c                 C   sd   | j \}}|d dksJ |d }tj||ft| jd}tj||| d | ftj| jd}||fS )Nr   r   r   r   r   )r   r   r   r   N_halfr   r   r   r   r   ,_rocm_aiter_act_mul_and_fp8_group_quant_fake  s   
r   c                "   @   sj  e Zd ZdZejZejZej	Z
ejZejZejZejZejZejZejZejZejZejZejZ e!dd Z"e!e#de$fddZ%e!e#de$fddZ&e!e#de$fd	d
Z'e!e#de$fddZ(e!e#de$fddZ)e!e#de$fddZ*e!e#de$fddZ+e!e#de$fddZ,e!e#de$fddZ-e!e#de$fddZ.e!e#de$fddZ/e!e#de$fddZ0e!e#de$fddZ1e!e#de$fdd Z2e!e#de$fd!d"Z3e4e#dd$d%Z5e4de6fd&d'Z7e4de6fd(d)Z8e4de6fd*d+Z9e4de6fd,d-Z:e4de6fd.d/Z;e4de6fd0d1Z<e4de6fd2d3Z=e4de6fd4d5Z>e4de6fd6d7Z?e4d8e@jAd9e@jAd:eBde@jAfd;d<ZCe4d8e@jAd=e@jAd9e@jAd:eBdeDe@jAe@jAf f
d>d?ZEe4d#e@jFfd@e@jAdAe@jAdBe@jAdCe@jAdDe@jAd#B dEe@jGde@jAfdFdGZHe4e@jFfd@e@jAdAe@jAdBe@jAdCe@jAdHeIeJ dEe@jGde@jAfdIdJZKe4e@jFfd@e@jAdAe@jAdBe@jAdCe@jAdHeIeJ dEe@jGde@jAfdKdLZLe4	#	M	M	N	#	#	#	#	#	#ddOe@jAdPe@jAdQe@jAdRe@jAdSe@jAdTe@jAd#B dUeJdVeJdWe$dXe@jAd#B dYe@jAd#B dZe@jAd#B d[e@jAd#B d\e@jAd#B dEe@jGd#B de@jAf d]d^ZMe4	#	#	#	#	N	#	#	MddOe@jAdPe@jAdQe@jAd_e@jAdSe@jAd`e@jAd#B dae@jAd#B dbe@jAd#B dce@jAd#B dde$dee@jAd#B dTe@jAd#B dUeJde@jAfdfdgZNe4d_e@jAdhe@jAdie@jAdje@jAdke$deDe@jAdlf fdmdnZOe4d_e@jAdhe@jAdie@jAdje@jAdke$deDe@jAdlf fdodpZPe4	qddje@jAdre@jAd_e@jAdSe@jAdseJdteJdue$dveBdd#fdwdxZQe4	y	qddje@jAd_e@jAdSe@jAdseJdteJdue$dzeRdveBdd#fd{d|ZSe4	#	#	#	}	#	#dd~e@jAde@jAde@jAdeBde@jAdeJde@jAd#B de@jAd#B de@jAd#B deBde@jAd#B de@jAd#B fddZTe4	#dd8e@jAde@jGde@jAd#B deDe@jAe@jAf fddZUe4	#dd8e@jAde@jGde@jAd#B deDe@jAe@jAf fddZVe4e@jWd#fd8e@jAd9e@jAde@jAde@jGd#B de@jAd#B de@jAfddZXe4de@jAde@jAde@jAde@jAdeJdeJde$fddZYe4	N	N	#dde@jAde@jAde@jAde@jAde$d#B de$d#B de@jAd#B de@jAfddZZe4dd#e@jWd#d#dNd#fde@jAde@jAde@jAdeJdDe@jAd#B de@jGd#B deJd#B de@jAd#B de$d#B de[d#B de@jAfddZ\e4	dde@jAdeJdeDe@jAdlf fddZ]e4deJdeJde$fddZ^e4deJdeJde$fddZ_e4	dde@jAdeDeJeJf de@jAfddZ`e4ddde@jAdeDeJeJf deDe@jAdlf fddÄZad#S )rocm_aiter_opsa  ROCm AITER operations wrapper for AMD GPU acceleration in vLLM.

    This class centralizes the import and registration of AITER ops,
    and provides a unified interface for checking if AITER is enabled.
    Operations are only available on supported gfx9
    architectures when aiter is installed.

    The class uses environment variables to control which features are enabled,
    allowing fine-grained control over which AITER optimizations are used.

    Environment Variables:
        VLLM_ROCM_USE_AITER: Main toggle for all AITER operations.
        VLLM_ROCM_USE_AITER_LINEAR: Controls GEMM and quantization ops.
        VLLM_ROCM_USE_AITER_RMSNORM: Controls RMSNorm operations.
        VLLM_ROCM_USE_AITER_MOE: Controls MoE (Mixture of Experts) ops.
        VLLM_ROCM_USE_AITER_MLA: Controls MLA (Multi-head Latent Attention) ops.
        VLLM_ROCM_USE_AITER_MHA: Controls MHA ops including flash_attn_varlen.
        VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: Controls Triton unified attention.
        VLLM_ROCM_USE_AITER_FP8BMM: Controls FP8 batched matrix multiply.
        VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: Controls FP4 assembly GEMM.
        VLLM_ROCM_USE_AITER_TRITON_ROPE: Controls Triton rotary embeddings.
        VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: Controls shared expert fusion.
        VLLM_ROCM_USE_AITER_TRITON_GEMM: Controls Triton unquantized GEMM.

    Note:
        The environment variables are assigned when the module is imported,
        so you can't change the environment variables after the module is imported.
        This is done out of performance consideration. Accessing environment variables
        is expensive as described in issue https://github.com/vllm-project/vllm/issues/17067
        so we don't want to do it repeatedly, especially in the hot path (the forward pass).
        You can call the refresh_env_variables() function to reload the env variables
        after monkey patching the env variables in the unit test.

    Check Functions:
        All check functions (is_*_enabled) are decorated with @if_aiter_supported,
        which verifies: (1) platform is ROCm, (2) device arch is gfx9, and
        (3) aiter library is installed. The check function then also verifies
        the corresponding environment variable is enabled.
        i.e.                                             ___
        is_enabled() == current_platform.is_rocm() and      |     checked by
                        current_platform.is_on_gfx9() and   | @if_aiter_supported
                        IS_AITER_FOUND and   _______________|
                        cls._AITER_ENABLED   -----> Check by the logic in `is_enabled()`

    Example:
        from vllm._aiter_ops import rocm_aiter_ops

        # Check if aiter is enabled before using operations
        if rocm_aiter_ops.is_enabled():
            result = rocm_aiter_ops.rms_norm(x, weight, epsilon)

    Operations:
        - RMS normalization: rms_norm, rms_norm2d_with_add
        - GEMM operations: gemm_a8w8, gemm_a8w8_blockscale
        - Fused MoE: fused_moe, asm_moe_tkw1
        - Routing: topk_softmax, biased_grouped_topk, grouped_topk
        - MLA decode: mla_decode_fwd
        - Quantization: per_tensor_quant, per_token_quant, group_fp8_quant
        - Triton ops: triton_rotary_embed, triton_fp8_bmm, triton_gemm_a8w8_blockscale
    c                 C   st   t j| _t j| _t j| _t j| _t j	| _
t j| _t j| _t j| _t j| _t j| _t j| _t j| _t j| _t j| _dS )aT  
        Since the environment variables are assigned when the module is imported,
        This is a helper function to reload all the env variables from
        the environment variables.
        for example, after monkey patching the env variables in the unit test,
        you can call this function to reload the env variables.
        N)envsVLLM_ROCM_USE_AITER_AITER_ENABLEDVLLM_ROCM_USE_AITER_LINEAR_LINEAR_ENABLEDVLLM_ROCM_USE_AITER_RMSNORM_RMSNORM_ENABLEDVLLM_ROCM_USE_AITER_MOE_FMOE_ENABLEDVLLM_ROCM_USE_AITER_MLA_MLA_ENABLEDVLLM_ROCM_USE_AITER_MHA_MHA_ENABLED!VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT_SHUFFLE_KV_CACHE_ENABLED%VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION_TRITON_UNIFIED_ATTN_ENABLEDVLLM_ROCM_USE_AITER_FP8BMM_FP8BMM_ENABLEDVLLM_ROCM_USE_AITER_FP4BMM_FP4BMM_ENABLED VLLM_ROCM_USE_AITER_FP4_ASM_GEMM_FP4_GEMM_DYNAMIC_QUANT_ASMVLLM_ROCM_USE_AITER_TRITON_ROPE_TRITON_ROTARY_EMBED)VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS_MOE_SHARED_EXPERTS_ENABLEDVLLM_ROCM_USE_AITER_TRITON_GEMM_TRITON_UNQUANT_GEMMclsr   r   r   refresh_env_variableso  s   	z$rocm_aiter_ops.refresh_env_variablesr	   c                 C   s   | j S r   )r   r  r   r   r   
is_enabled  s   zrocm_aiter_ops.is_enabledc                 C      | j o| jS r   )r   r   r  r   r   r   is_linear_enabled     z rocm_aiter_ops.is_linear_enabledc                 C   s   |   S r   )r  r  r   r   r   is_linear_fp8_enabled  s   z$rocm_aiter_ops.is_linear_fp8_enabledc                 C   r  r   )r   r   r  r   r   r   is_rmsnorm_enabled  r  z!rocm_aiter_ops.is_rmsnorm_enabledc                 C   r  r   )r   r  r  r   r   r   is_fused_moe_enabled  r  z#rocm_aiter_ops.is_fused_moe_enabledc                 C   s   |   o| jS r   )r   r  r  r   r   r   $is_fusion_moe_shared_experts_enabled  s   z3rocm_aiter_ops.is_fusion_moe_shared_experts_enabledc                 C   r  r   )r   r  r  r   r   r   is_mla_enabled  r  zrocm_aiter_ops.is_mla_enabledc                 C   r  r   )r   r  r  r   r   r   is_mha_enabled  r  zrocm_aiter_ops.is_mha_enabledc                 C   r  r   )r   r  r  r   r   r   is_shuffle_kv_cache_enabled  r  z*rocm_aiter_ops.is_shuffle_kv_cache_enabledc                 C   r  r   )r   r
  r  r   r   r   is_triton_unified_attn_enabled  r  z-rocm_aiter_ops.is_triton_unified_attn_enabledc                 C   r  r   )r   r  r  r   r   r   is_fp8bmm_enabled  r  z rocm_aiter_ops.is_fp8bmm_enabledc                 C   r  r   )r   r  r  r   r   r   is_fp4bmm_enabled  r  z rocm_aiter_ops.is_fp4bmm_enabledc                 C   r  r   )r   r  r  r   r   r   %is_asm_fp4_gemm_dynamic_quant_enabled  r  z4rocm_aiter_ops.is_asm_fp4_gemm_dynamic_quant_enabledc                 C   r  r   )r   r  r  r   r   r   is_triton_rotary_embed_enabled  r  z-rocm_aiter_ops.is_triton_rotary_embed_enabledc                 C   r  r   )r   r  r  r   r   r   is_triton_gemm_enabled  r  z%rocm_aiter_ops.is_triton_gemm_enabledNc                  C   s  t stdr	t ntjjf} tdtg tt	j
d tdtg tt	j
d tdtg dtt	j
d tdtdd	gtt	j
d td
tddgtt	j
d tdtddgtt	j
d tdtdgt| d tdtg tt	j
d tdttd tdttd tdttd tdtt t	j
d tdt!t"t	j
d tdt#t$t	j
d tdt%t&d tdt't(d tdt)t*d tdt+t,d tdt-g t.t	j
d tdt/t0t	j
d tdt1d gt2t	j
d d!a d S d S )"Nz2.7.0rocm_aiter_asm_moe_tkw1)op_nameop_funcmutates_args	fake_impldispatch_keyrocm_aiter_fused_moerocm_aiter_topk_softmax)r>   rK   rL   rocm_aiter_topk_sigmoidr>   rK   rocm_aiter_biased_grouped_topkr%   rocm_aiter_grouped_topkrocm_aiter_mla_decode_fwdr|   )r,  r-  r.  r/  tagsrocm_aiter_gemm_a8w8&rocm_aiter_triton_gemm_a8w8_blockscale)r,  r-  r/  rocm_aiter_gemm_a8w8_blockscalerocm_aiter_rms_norm!rocm_aiter_rmsnorm2d_fwd_with_add)r,  r-  r/  r0  &rocm_aiter_rmsnorm_fused_dynamic_quant*rocm_aiter_rmsnorm_fused_add_dynamic_quant"rocm_aiter_rmsnorm_fp8_group_quant+rocm_aiter_rmsnorm_with_add_fp8_group_quant&rocm_aiter_act_mul_and_fp8_group_quantrocm_aiter_group_fp8_quantrocm_aiter_per_tensor_quantrocm_aiter_per_token_quantr   topk_indices_bufferT)3_OPS_REGISTEREDr   tupler:   Tagneeds_fixed_stride_orderr   rG   rJ   r   r0  r7   r=   rP   rS   rU   rV   r^   r`   rf   rh   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r7  r   r   r   register_ops_once  s   z rocm_aiter_ops.register_ops_oncec                   C      t jjjjS r   )r:   opsvllmr<  defaultr   r   r   r   get_rmsnorm_fused_add_opv     z'rocm_aiter_ops.get_rmsnorm_fused_add_opc                   C   rK  r   )r:   rL  rM  r;  rN  r   r   r   r   get_rmsnorm_opz  rP  zrocm_aiter_ops.get_rmsnorm_opc                   C   rK  r   )r:   rL  rM  r>  rN  r   r   r   r   &get_rmsnorm_fused_add_dynamic_quant_op~  rP  z5rocm_aiter_ops.get_rmsnorm_fused_add_dynamic_quant_opc                   C   rK  r   )r:   rL  rM  r=  rN  r   r   r   r   "get_rmsnorm_fused_dynamic_quant_op  rP  z1rocm_aiter_ops.get_rmsnorm_fused_dynamic_quant_opc                   C   rK  r   )r:   rL  rM  r?  rN  r   r   r   r    get_rmsnorm_group_fused_quant_op  rP  z/rocm_aiter_ops.get_rmsnorm_group_fused_quant_opc                   C   rK  r   )r:   rL  rM  r@  rN  r   r   r   r   $get_rmsnorm_group_add_fused_quant_op  rP  z3rocm_aiter_ops.get_rmsnorm_group_add_fused_quant_opc                   C   rK  r   )r:   rL  rM  rD  rN  r   r   r   r   get_per_token_quant_op  rP  z%rocm_aiter_ops.get_per_token_quant_opc                   C   rK  r   )r:   rL  rM  rB  rN  r   r   r   r   get_group_quant_op  rP  z!rocm_aiter_ops.get_group_quant_opc                   C   rK  r   )r:   rL  rM  rA  rN  r   r   r   r   $get_act_mul_fused_fp8_group_quant_op  rP  z3rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_opr   r   r   c                 C      t jj| ||S r   )r:   rL  rM  r;  r   r   r   r   r     s   zrocm_aiter_ops.rms_normr   c                 C   s   t jj| |||S r   )r:   rL  rM  r<  )r   r   r   r   r   r   r   rms_norm2d_with_add  s   z"rocm_aiter_ops.rms_norm2d_with_addr   r   r   r   r   r/   c                 C   s   t jj| |||||S r   )r:   rL  rM  r8  )r   r   r   r   r   r/   r   r   r   	gemm_a8w8  s   	zrocm_aiter_ops.gemm_a8w8
block_sizec                 C      t jj| ||||S r   )r:   rL  rM  r9  r   r   r   r   r\  r/   r   r   r   triton_gemm_a8w8_blockscale     	
z*rocm_aiter_ops.triton_gemm_a8w8_blockscalec                 C   r]  r   )r:   rL  rM  r:  r^  r   r   r   r     r`  z#rocm_aiter_ops.gemm_a8w8_blockscaler   Fr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   c                 C   s*   t jj| |||||||||	|
||||S r   )r:   rL  rM  r1  r<   r   r   r   r2     s"   zrocm_aiter_ops.fused_moer>   r?   r@   rA   rB   rC   rD   c                 C   s&   t jj| |||||||||	|
||S r   )r:   rL  rM  r+  rI   r   r   r   rE     s   zrocm_aiter_ops.asm_moe_tkw1rK   rL   rM   rN   .c                 C   s   t jj| |||| | |fS r   )r:   rL  rM  r2  rR   r   r   r   rO     s   
zrocm_aiter_ops.topk_softmaxc                 C   s   t jj| || | |fS r   )r:   rL  rM  r3  rR   r   r   r   rT   !  s   zrocm_aiter_ops.topk_sigmoidrW   rX   rY   rZ   r[   r\   c              
   C       t jj| ||||||| d S r   )r:   rL  rM  r4  r_   r   r   r   r]   .     z"rocm_aiter_ops.biased_grouped_topkrb   rc   c              
   C   ra  r   )r:   rL  rM  r5  rg   r   r   r   rd   D  rb  zrocm_aiter_ops.grouped_topkry   rz   r{   r|   r   r}   r~   r   r   r   r   rl   rm   c                 C   s<   t jjj| |ddd| jd ||||||||	|
|d d S )Nr   r   )r   r   rl   rm   )r:   rL  rM  r6  r   r   )rz   r{   r|   r   r}   r~   r   r   r   r   rl   rm   r   r   r   rk   Z  s   
zrocm_aiter_ops.mla_decode_fwdr   r   c                 C   rY  r   )r:   rL  rM  rC  r   r   r   r   per_tensor_quantx     zrocm_aiter_ops.per_tensor_quantc                 C   rY  r   )r:   rL  rM  rD  r   r   r   r   per_token_quant  rd  zrocm_aiter_ops.per_token_quantweight_scale	out_dtypex_scalesc           
      C   sn   ddl m} ddlm} |d u r|| \}}n| }|}tj|jd |jd |j|d}	|||||j||	 |	S )Nr   )gemm_afp4wfp4)dynamic_mxfp4_quant)r   r3   )	aiter.ops.triton.gemm_afp4wfp4ri  aiter.ops.triton.quantrj  r:   r   r   r   T)
r   r   rf  rg  rh  ri  rj  x_qx_syr   r   r   triton_fp4_gemm_dynamic_qaunt  s   z,rocm_aiter_ops.triton_fp4_gemm_dynamic_qaunt	positionsquerykeycos_sin_cache	head_size
rotary_dimis_neox_stylec              
   C   s   ddl m} |  }|jddd\}	}
|j}|j}|rdnd}||d|}||d|}|dd |f }|dd |f }| j|jd d  } ||||	|
| |dd	d
 ||}||}d S )Nr   )(rope_cached_thd_positions_2c_fwd_inplacer   r   )r   r   .TF)reuse_freqs_front_part
nope_first)aiter.ops.triton.ropery  numelchunkr   r   )rr  rs  rt  ru  rv  rw  rx  ry  
num_tokenscossinquery_shape	key_shaperotate_stylequery_key_r   r   r   triton_rotary_embed  s.   


z"rocm_aiter_ops.triton_rotary_embedXWw_scaler   transpose_bmprequantr   c              	   C   s"   ddl m} || ||||||dS )Nr   )batched_gemm_a16wfp4)rp  r  r  r   )%aiter.ops.triton.batched_gemm_a16wfp4r  )r  r  r  r   r  r  r   r  r   r   r   r    s   z#rocm_aiter_ops.batched_gemm_a16wfp4   WQr   r3   splitKYQconfigc
                 C   s(   ddl m}
 |
| |||||||||	d
S )Nr   )Gbatched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant)r   r   r3   r  r  r  r  )Xaiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quantr  )r  r  r  r   r   r3   r  r  r  r  aiter_triton_fp8_bmmr   r   r   triton_fp8_bmm  s   zrocm_aiter_ops.triton_fp8_bmminput_2dc                 C   s    |dksJ dt jj| |S )Nr  zGroup size must be 128)r:   rL  rM  rB  )r  r   r   r   r   group_fp8_quant  s   zrocm_aiter_ops.group_fp8_quantr   kc                 C      | |fdv S )N))       )i@     )i   i   )   r  )   r  )i   r  )i   r  )r     )r     r  r  )r  r  r   r   r  r   r   r   is_triton_gemm_w8a8_tuned  rP  z(rocm_aiter_ops.is_triton_gemm_w8a8_tunedc                 C   r  )N))r  r  )i   r  ) @  i   )i  r  )i   r  )r  r  )i 
  r  )i (  r  )r  r  )r   p  )r  r  )i H  r  r  )r  r  )i   r  )r  r  )r  r  ) 8  r  )r  r  )r  i   r   r  r   r   r   &is_triton_gemm_afp4wfp4_presh_ws_tuned  rP  z5rocm_aiter_ops.is_triton_gemm_afp4wfp4_presh_ws_tuned   r  tensorlayoutc                 C   s   ddl m} |||dS )Nr   shuffle_weightr  )aiter.ops.shuffler  )selfr  r  r  r   r   r   r  +  s   zrocm_aiter_ops.shuffle_weightr  tensorsc                    s$   ddl m t fdd|D S )a  
        Applies shuffle_weight function from AITER to each
        input tensor and returns them.

        Rearranges (shuffles) the input tensor/s
        into a specified block layout for optimized computation.

        Args:
            *tensors: Variable number of torch.Tensor objects.
            layout: A pair of integers specifying the block sizes used to divide
                the tensors during shuffling. Default is (16, 16).

        Returns:
        A Tuple of shuffled tensors.
        r   r  c                 3   s    | ]	}| d V  qdS )r  Nr   ).0r  r  r  r   r   	<genexpr>H  s    z1rocm_aiter_ops.shuffle_weights.<locals>.<genexpr>)r  r  rG  )r  r  r   r  r   shuffle_weights3  s   zrocm_aiter_ops.shuffle_weights)r	   N
Nr   r   FNNNNNNNNNNFNNr   rW   rb   rW   )NNNry   NNr   )FFN)r  )r  )b__name__
__module____qualname____doc__r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  classmethodr  r   boolr  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  staticmethodrJ  r   rO  rQ  rR  rS  rT  rU  rV  rW  rX  r:   Tensorfloatr   rG  rZ  float16r3   r[  listintr_  r   r2   rE   rO   rT   r]   strrd   rk   rc  re  bfloat16rq  r  r  dictr  r  r  r  r  r  r   r   r   r   r     s   >
 #

	
#	
				
#	


r   r  r  r  r  )NNNrW   ry   NNr   )Pr   collections.abcr   r:   
torch._opsr   	vllm.envsr   vllm.platformsr   vllm.utils.torch_utilsr   r   +vllm.v1.attention.ops.rocm_aiter_mla_sparser   r   	fp8_dtyper   r  r   r   r   r   r   r    fp8r   r  r  r3   r7   r=   rG   rJ   rP   rS   rU   rV   r  r^   r`   r  rf   rh   ri   __annotations__rx   r   r   r  r   r   r   r   r   r   r   r   rG  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rF  r   rJ  r   r   r   r   <module>   sF  

		

0	

	

+	







	
 	
	
 	
"	

-	




























      4