o
    iՌ                     @   sz  d Z ddlZddlmZmZ ddlZddlmZ ddlZddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( G d	d
 d
Z)ej*de+de+de,de+de-de,defddZ.e							d dej/dej/dej/dB dej/dB dej/dB de0de+de-dB de,deej/ej/f fddZ1g dZ2dS )!a  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Fused RMSNorm + FP4 Quantization using CuTe-DSL
================================================

High-performance fused kernel for RMS normalization followed by FP4 quantization.
This is an alternative backend to cuDNN, using CuTe-DSL for maximum flexibility
and performance on SM100+ architectures.

Supports both NVFP4 and MXFP4 quantization formats.
    N)CallableTuple)Float32Int32Uint8   )flashinfer_api   )FLOAT4_E2M1_MAXFLOAT8_E4M3_MAX	COPY_BITSget_sm_versionst_global_u64get_ptr_as_int64rcp_approx_ftzfmin_f32hmax2hmax_to_f32bfloat2_hmax2bfloat2_hmax_to_f32cvt_f32_to_e4m3fp8_e4m3_to_f32_and_rcpcvt_f32_to_ue8m0ue8m0_to_output_scaleload_8_half2half2_mul_8bfloat2_mul_8half2_max_abs_8bfloat2_max_abs_8half2_to_float16bfloat2_to_float16quantize_and_pack_16
row_reducepredicate_kc                   @   sX  e Zd ZdZ		d-dejdedededededB d	edB fd
dZ	e
dedejdedefddZe
dedefddZe
dedefddZe
dedededefddZe
dededededef
ddZdefddZejd ejd!ejd"ejd#ejd$ejd%ed&efd'd(Zejd ejd!ejd"ejd#ejd$ejd%ed&ed)ejd*ejfd+d,ZdS ).RMSNormFP4QuantKernela  
    Fused RMSNorm + FP4 Quantization Kernel.

    Key optimizations:
    1. Half2/BFloat2 SIMD for max-abs computation
    2. Branchless scale clamping via fmin_f32
    3. Cluster synchronization for large H dimensions
    4. Direct 128-bit vectorized global loads
    NdtypeH
block_sizeoutput_swizzledis_fp16
sm_versionscale_formatc           
      C   sX  || _ || _|| _|| _|| _|d ur|nt | _|d u r'|dkr#dnd| _n|| _|dv s5J d| | jdv s>J d| ||| j| _	|| j	 | _
| | j
| _| | j
| _| j| j | _t| jd d| _|jd	 }td	 | | _td| j
| j | j d | j | _| j| j | j | _|| | _|r|| }	|	d
 d | _d| _d S d S )N    ue8m0e4m3   r,   z!block_size must be 16 or 32, got )r.   r-   z&scale_format must be 'e4m3' or 'ue8m0'r	               )r%   r&   r'   r(   r)   r   r*   r+   _compute_cluster_n	cluster_n	H_per_cta_compute_threads_per_rowthreads_per_row_compute_num_threadsnum_threadsrows_per_blockmaxwarps_per_rowwidthr   vec_sizenum_vec_blockscols_per_tilenum_sf_blocks_per_rownum_k_tilesk_tile_stride)
selfr%   r&   r'   r(   r)   r*   r+   
elem_bytesnum_col_vecs rI   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/flashinfer/cute_dsl/rmsnorm_fp4quant.py__init__W   sD   



zRMSNormFP4QuantKernel.__init__returnc                 C   sh   |dk rdS t jt j }|j}|jd }dD ]}| | dkr"qt| ||}||kr1|  S qdS )a'  Compute optimal cluster size based on H and device shared memory.

        Dynamically determines the minimum cluster_n that fits within the
        device's shared memory limit, making it compatible with different
        GPU architectures (e.g., SM100 with 228KB vs SM120 with 128KB).
        Z   r	   r1   )r	   r   r3   r1   r0   r   r0   )torchcudaget_device_propertiescurrent_deviceshared_memory_per_block_optinr?   r$   _estimate_smem_bytes)r&   r%   r*   propsmax_smem_bytes	elem_sizer6   smem_neededrI   rI   rJ   r5      s   
z(RMSNormFP4QuantKernel._compute_cluster_nr7   c                 C   s@   | dkrdS | dkrdS | dkrdS | dkrdS | dkrdS d	S )
z3Compute optimal threads per row based on H per CTA.@   r1      r0   i   r,   i    @     rI   r7   rI   rI   rJ   r8      s   z.RMSNormFP4QuantKernel._compute_threads_per_rowc                 C   s   | dkrdS dS )z3Compute total threads per block based on H per CTA.rZ   rY   r[   rI   r\   rI   rI   rJ   r:      s   z*RMSNormFP4QuantKernel._compute_num_threadsr6   rV   c                 C   s   | | }t |}t |}|| }t|d d}td | }td|| | d | }	||	 | }
||
 | }|dkrFd| || d  S ||| | d  d S )zEstimate shared memory bytes needed for given configuration.

        This is used to dynamically determine cluster_n based on device
        shared memory limits.
        r,   r	   r1   r   r3   )r$   r8   r:   r=   r   )r&   r6   rV   r7   r9   r;   r<   r>   r@   rA   rB   
tile_bytesrI   rI   rJ   rS      s   

z*RMSNormFP4QuantKernel._estimate_smem_bytesr9   r<   r@   rA   c                 C   s4   | |f||ff}|| df||| |  ff}||fS )zBCreate Thread-Value layout for coalesced vectorized memory access.r	   rI   )r9   r<   r@   rA   shapestriderI   rI   rJ   _make_tv_layout   s   	
z%RMSNormFP4QuantKernel._make_tv_layoutc                 C   sh   | j | j | jjd  }| jdkr| j | j d }n| j | j | j d }| jdkr,dnd}|| | S )z$Calculate shared memory requirement.r1   r	   r3   r   )r<   rB   r%   r?   r6   r>   )rF   r]   reduction_bytes
mbar_bytesrI   rI   rJ   _smem_size_in_bytes   s   
z)RMSNormFP4QuantKernel._smem_size_in_bytesmXmWmYmSmGlobalScaleMepsc	                 C   s   |  | j| j| j| j\}	}
tj|	|
d}| j| jf}| |||||||||	j	t
|| j| jdg| jddgt| jdkrEd| jdgnd|  |d dS )a~  Host function to launch the kernel.

        Takes tensors directly via TVM-FFI.
        - mX: Input tensor, shape (M, H), row-major
        - mW: Weight tensor, shape (H,)
        - mY: Output FP4 tensor, shape (M, H // 2), row-major (packed)
        - mS: Scale factor tensor, shape depends on swizzle mode
        - mGlobalScale: Global scale tensor, shape (1,), float32
        r_   r	   N)gridblockclustersmemstream)r`   r9   r<   r@   rA   cutemake_layoutrB   kernellaunchceil_divr6   r;   cutlass
const_exprrc   )rF   rd   re   rf   rg   rh   ri   rj   rp   tv_shape	tv_stride	tv_layouttiler_mnrI   rI   rJ   __call__  s"   

zRMSNormFP4QuantKernel.__call__rz   r{   c
           Z   	   C   sl  t j \}
}}t j \}}}| j}| j}| j}| j}| j}t	
|dkr.t j d }nt	
d}|jd d }t|d d}|	d }|
| }|
| }ttt}t	j }|j|jt j|	dddd}t	
|dkr}|jtt ||fdd}d	}n|jtt |||ffdd}|jt	jdd
}t	
|dkr|
dkrt j|d t j  t j  t j  t |j}t ||	||f}t ||	||f}t |jt j|	d fdd} t  |j!| }!t |!|	d|f}t j"t j#j$% |jt&d}"t '|"||	}#|#(|
}$|$)|}%|$*|}&|$)|}'t +|%}(t,|'|d})|'d }*|*d |k }+|+r5t j-|"|%|&|)d t j.  t j/d t 0|&|( |(1 2t},|,|, }-t3|-t j4j5||||td}.|.| }/t j6j7|/| dd}0|d }1t	
|dkrt j  t j  nt j8  || | }2|2|k r|| d | }3t9|3D ]}4||4|  }5|5|k r|5| }6t	
|dkrt:|||2|6|\}7}8t	
|rt;|7|8}9t<|9}:t=|:};t>|9|0}<nt?|7|8}9t@|9}:tA|:};tB|9|0}<|;|0 }=|1|= | }>tC|>ttD}>tE|>}?tF|?t	Gd@ }@tH|?|1 }At	
| jIr^|5tJd }B|2tJd tJd }C|2tJd }D|5tJd }E|2tJd }F| jK| jL }G|F|G |E| jL  |DtJd  |CtJd  |B }H|@||H< n|@||2|5f< tM|<|A}I|6d }JtN||2|d  |J }KtO|K|I qt:|||2|6|\}L}Mt:|||2|6tJd |\}N}Ot	
|rt;|L|M}Pt;|N|O}Qt<|P}Rt<|Q}StP|R|S}:t=|:};t>|P|0}Tt>|Q|0}Un%t?|L|M}Pt?|N|O}Qt@|P}Rt@|Q}StQ|R|S}:tA|:};tB|P|0}TtB|Q|0}U|;|0 }=t	
| jRdkr|=| }>tS|>}VtF|Vt	Gd@ }WtT|V}An |1|= | }>tC|>ttD}>tE|>}?tF|?t	Gd@ }WtH|?|1 }At	
| jIrv|5tJd }B|2tJd tJd }C|2tJd }D|5tJd }E|2tJd }F| jK| jL }G|F|G |E| jL  |DtJd  |CtJd  |B }H|W||H< n|W||2|5f< tM|T|A}X|6d }JtN||2|d  |J }KtO|K|X tM|U|A}Y|6d d }JtN||2|d  |J }KtO|K|Y qd	S d	S )a  Device kernel with cluster synchronization for large H.

        mGlobalScale contains the global scale value. The kernel reads it and
        computes 1/global_scale, which is multiplied with rstd to apply:
        y = x * rstd * w / global_scale = rmsnorm(x, w) / global_scale
        r	   r   r,   r	   r   )orderr0   )byte_alignmentr3   N)	num_elems)r   rk   )num_bits_per_copy)limit))r   r   r   r   )predg        T)fastmath   rY   r   r-   )Urq   arch
thread_idx	block_idxr&   r'   rC   r)   r6   rv   rw   r^   r=   r   r   r
   utilsSmemAllocatorallocate_tensorelement_typemake_ordered_layoutrr   allocate_arrayInt64mbarrier_initmbarrier_init_fencecluster_arrive_relaxedcluster_waitmake_identity_tensor
local_tileprependlayoutmake_tensoriteratormake_copy_atomnvgpucpasync	CopyG2SOpr   make_tiled_copy	get_slicepartition_Spartition_Dmake_fragment_liker#   copycp_async_commit_groupcp_async_wait_groupautovec_copyloadtor"   ReductionOpADDmathrsqrtbarrierranger   r   r   r   r   r   r   r   r    r   r   r   r   Uint32r   r(   r   rD   rE   r!   r   r   r   r   r+   r   r   )ZrF   rd   re   rf   rg   rh   ri   rj   rz   r{   tidx_bidxr&   r'   rC   r)   r6   	cluster_yr9   r>   r<   lane_in_rowrow_in_blockfp4_max_rcpro   sXreduction_buffermbar_ptridXgXcXmW_expanded_layoutmW_2dcopy_atom_load_asynctiled_copy_load
thr_copy_XtXgXtXsXtXcXtXrXtXpX	row_coordrow_in_boundsxx_sqsum_sqmean_sqrstdglobal_scale_valactual_row_idxnum_sf_per_threadsf_itersf_idxblock_startx_h2w_h2xw_h2	max_xw_h2max_xwy_f32max_absscale_floatscale_fp8_u32	scale_fp8	inv_scaleinner_k_idxinner_m_idxouter_m_idx
k_tile_idx
m_tile_idxm_tile_strideswizzled_offsetpacked64
out_offsetout_ptrx_h2_c0w_h2_c0x_h2_c1w_h2_c1xw_h2_c0xw_h2_c1	max_c0_h2	max_c1_h2y_f32_c0y_f32_c1scale_ue8m0scale_u8packed64_c0packed64_c1rI   rI   rJ   rs   -  s  












































 zRMSNormFP4QuantKernel.kernel)NN)__name__
__module____qualname____doc__rv   NumericintboolstrrK   staticmethodr5   r8   r:   rS   tupler`   rc   rq   jitTensorr   r   r|   rs   LayoutShaperI   rI   rI   rJ   r$   L   s    
<*	
r$   hidden_sizer'   r)   r*   r+   is_sf_swizzled_layoutrL   c                    s@  |rt jnt j}t|| ||||d}t }tjj||| fddd}	tjj|| fdd}
tjjt j|| d fddd}rNt }tjjt j|fdd}ntjjt j|| | fddd}tjj	dd}tjjt j
d	d
d}tj||	|
|||tdt
d|dd
 dtjdtjdtjdtjdtjdtdtddf fdd}|S )z
    Get a compiled kernel closure that takes torch.Tensor directly.

    Uses TVM-FFI for efficient tensor passing without manual pointer construction.
    )r%   r&   r'   r(   r)   r*   r+   r}   rY   )stride_orderassumed_align)r  r   T)use_tvm_ffi_env_stream)r	   r3   r	   ư>z--enable-tvm-ffi)optionsr   wysglobal_scaleri   rj   rL   Nc           	   	      s@   r|  n| }|tj} | ||||t|t| dS )z;Runtime API that passes torch tensors directly via TVM-FFI.N)flatten
contiguousviewrN   uint8r   r   )	r   r  r  r  r  ri   rj   s_tensory_uint8compiled_kernelr  rI   rJ   
tensor_api  s   z(_get_compiled_kernel.<locals>.tensor_api)rv   Float16BFloat16r$   rq   sym_intruntimemake_fake_compact_tensorr   make_fake_streamr   compiler   rN   r  r   float)r  r'   r)   r*   r+   r  cutlass_dtype
kernel_objsym_mx_fakew_fakey_fakesym_swizzled_sizes_fakestream_fakeglobal_scale_faker  rI   r  rJ   _get_compiled_kernel  s   

r+  r
  r0   Finputweighty_fp4block_scaler  rj   c	                 C   s  |   dk}	|	r| j\}
}}| |
| | }n| }|j\}}| j}|| dks-J d|dks5J d|dv s=J d|tjk}|rF|n|dkrLd	nd
}t| j}|du rw|	ritj	|
||d ftj
| jd}ntj	||d ftj
| jd}|du r|d	krtjntj}|| }|r|d d }|d d }d}|| | }tj	|f|| jd}n|	rtj	|
||f|| jd}ntj	||f|| jd}|	r||
| d}|s||
| dn|}n|}|}|du rtjdtj| jd}t||||||}|| | ||tj| || ||fS )a  
    Fused RMS normalization with FP4 quantization using CuTe-DSL.

    Computes: ``y = RMSNorm(input) * weight``, optionally applies global scaling
    (``y = y / global_scale``), then quantizes ``y`` to FP4.

    Parameters
    ----------
    input : torch.Tensor
        Input tensor, shape ``(batch_size, hidden_size)`` or ``(batch_size, seq_len, hidden_size)``.
        Must be ``torch.float16`` or ``torch.bfloat16``.
    weight : torch.Tensor
        Weight tensor for RMSNorm, shape ``(hidden_size,)``.
        Must have the same dtype as input.
    y_fp4 : torch.Tensor, optional
        Output tensor for quantized values in FP4_E2M1 format with dtype
        ``torch.float4_e2m1fn_x2``.
        Shape must be ``(batch_size, hidden_size // 2)`` or matching 3D input.
        If ``None``, will be allocated automatically.
    block_scale : torch.Tensor, optional
        Output tensor for per-block scale factors.

        - If ``is_sf_swizzled_layout=False`` (default): row-major layout with shape
          ``(batch_size, hidden_size // block_size)`` or matching 3D input.
        - If ``is_sf_swizzled_layout=True``: swizzled layout for efficient tensor core
          access, with shape ``(batch_size * hidden_size // block_size,)`` flattened.
          The swizzle pattern uses 128x4 tiles where scales are arranged as:
          ``[m_tile][k_tile][outer_m (32)][inner_m (4)][inner_k (4)]``.

        Dtype should be ``torch.float8_e4m3fn`` for E4M3 format or ``torch.uint8``
        for UE8M0 format. If ``None``, will be allocated automatically.
    global_scale : torch.Tensor, optional
        Global scale factor tensor of shape ``(1,)`` with dtype ``torch.float32``.
        If provided, the RMSNorm output is divided by this value before quantization:
        ``y = rmsnorm(x, w) / global_scale``. This is used for NVFP4 format where
        a pre-computed global scale lifts per-block scales into optimal dynamic range.
        If ``None``, no global scaling is applied (equivalent to global_scale=1.0).
    eps : float
        Epsilon for numerical stability in RMSNorm. Default is ``1e-6``.
    block_size : int
        Number of elements per quantization block. Default is ``16``.

        - ``16``: NVFP4 format with E4M3 scale factors
        - ``32``: MXFP4 format with UE8M0 scale factors
    scale_format : str, optional
        Scale factor format: ``"e4m3"`` or ``"ue8m0"``.
        If ``None``, auto-selects based on ``block_size``:
        ``"e4m3"`` for block_size=16, ``"ue8m0"`` for block_size=32.
    is_sf_swizzled_layout : bool
        If ``True``, output scale factors in swizzled layout optimized for
        tensor core GEMM operations. The swizzle uses 128x4 tiles with the pattern:
        ``[m_tile_idx * k_tiles * 512 + k_tile_idx * 512 + outer_m * 16 + inner_m * 4 + inner_k]``
        where ``outer_m = row % 32``, ``inner_m = (row % 128) // 32``, etc.
        Default is ``False`` (row-major layout).

    Returns
    -------
    Tuple[torch.Tensor, torch.Tensor]
        A tuple of ``(y_fp4, block_scale)``:

        - ``y_fp4``: Quantized FP4 values packed as uint8.
        - ``block_scale``: Per-block scale factors.

    Notes
    -----
    - Requires SM100+ (Blackwell) for FP4 quantization PTX intrinsics.
    - For block_size=16 (NVFP4): uses E4M3 scale factors (max value 448.0).
    - For block_size=32 (MXFP4): uses UE8M0 scale factors (power-of-2 scales).
    - FP4 E2M1 format has a max representable value of 6.0.
    r2   r   z+hidden_size must be divisible by block_sizerX   zhidden_size must be >= 64r/   zblock_size must be 16 or 32r,   r-   r.   Nr   )r%   device   rY   r3   r4   r	   )dimr^   r  r  r%   rN   float16r   r0  emptyfloat4_e2m1fn_x2r  float8_e4m3fnonesfloat32r+  )r,  r-  r.  r/  r  rj   r'   r+   r  is_3dBSr&   input_2d
batch_sizer  r%   r)   actual_scale_formatr*   scale_dtyperC   num_m_tilesrD   rE   swizzled_sizey_fp4_2dblock_scale_2dr  rI   rI   rJ   rmsnorm_fp4quant  s   S






rE  )r$   r   rE  )NNNr
  r0   NF)3r   	functoolstypingr   r   rv   cutlass.cuterq   rN   r   r   r   api_loggingr   
fp4_commonr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   cacher   r   r   r+  r  r   rE  __all__rI   rI   rI   rJ   <module>   sz    p)    Bm	
 9