o
    U۷i                  	   @   s  d Z ddlmZmZ ddlmZmZmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlZd	ddd
deeef deeef dedefddZ				d,ddddededefddZd	ddddededefddZddddedefddZ d-deeef dedeeef fd d!Z!d-deeef dedeeef fd"d#Z"G d$d% d%Z#G d&d' d'Z$G d(d) d)Z%G d*d+ d+e%Z&dS ).z
Shared utilities for grouped GEMM kernels.

This module contains the tile scheduler classes and helper functions used by both
the forward (grouped_gemm_swiglu) and backward (grouped_gemm_dswiglu) kernels.
    )TupleUnion)BooleanIntegerInt32minextract_mlir_valuesnew_from_mlir_valuesdsl_user_op
const_expr)ir)scfllvmnvvm)T)Float32r   NT)nanlocipabr   returnc             
   C   sR   |rd}nd}t tjt t | j||dt |j||dg|dddtjjdS )zCompute the minimum of two float32 values with NaN handling.

    :param a: First operand
    :param b: Second operand
    :param nan: If True, propagate NaN values
    :return: Minimum value
    zmin.NaN.f32 $0, $1, $2;zmin.f32 $0, $1, $2;r   r   z=f,f,fTFhas_side_effectsis_align_stackasm_dialect)r   r   
inline_asmr   f32ir_value
AsmDialectAD_ATT)r   r   r   r   r   	ptx_instr r#   N/home/ubuntu/vllm_env/lib/python3.10/site-packages/cudnn/grouped_gemm/utils.pyfmin<   s   "r%       Fr   mask_and_clampabsc             
   C   sT   t | }| j||d}t|j||d}	d}
|tjt ||	g|
dddtjjdS )al  Perform a warp-level reduction synchronization for max with abs and NaN.

    :param value: Value to reduce
    :param kind: Reduction kind (unused, kept for API compatibility)
    :param mask_and_clamp: Warp mask and clamp value
    :param abs: Whether to use absolute value
    :param nan: Whether to handle NaN values
    :return: Reduced value across warp
    r   z&redux.sync.max.abs.NaN.f32 $0, $1, $2;z=f,f,iTFr   )	typer   r   r   r   r   r   r    r!   )valuekindr'   r(   r   r   r   
value_typevalue_irmask_irr"   r#   r#   r$   warp_redux_sync\   s   r/   )positive_onlyr   r   r*   r0   c                C   s^   t jt |j||d||d}tjt tjj	jj
j| |||d}tt jt |||dS )a\  Perform atomic max operation on a float32 value in global memory.

    This implementation works correctly for non-negative values (>= 0) using direct bitcast.

    :param ptr: Pointer to the memory location
    :param value: The float32 value to compare and potentially store (should be >= 0)
    :return: The old value at the memory location
    r   resopptrr   r   r   )r   bitcastr   i32r   r   	atomicrmwcutlass_mlirdialectsAtomicOpKindMAXr   r   )r4   r*   r0   r   r   	value_intold_value_intr#   r#   r$   atomic_max_float32   s    	r?   c                C   s6   t jt tjjj jj| |j	||d||d}t
|S )zPerform atomic add operation on a float32 value in global memory.

    :param ptr: Pointer to the memory location
    :param value: The float32 value to add
    :return: The old value at the memory location
    r   r1   )r   r7   r   r   r8   r9   r:   r;   FADDr   r   )r4   r*   r   r   	old_valuer#   r#   r$   atomic_add_float32   s   	rB   fastmathc                 C   s   t jdt jj|  |d S )zCompute the sigmoid function: 1 / (1 + exp(-a)).

    :param a: Input value
    :param fastmath: Whether to use fast math approximations
    :return: Sigmoid of input
    g      ?rC   )cutearch
rcp_approxmathexpr   rC   r#   r#   r$   sigmoid_f32   s   rK   c                 C   s   | t | |d S )zCompute the SiLU (Swish) activation: a * sigmoid(a).

    :param a: Input value
    :param fastmath: Whether to use fast math approximations
    :return: SiLU of input
    rD   )rK   rJ   r#   r#   r$   silu_f32   s   rL   c                   @   sv   e Zd ZdZdejdefddZdee	j
 fddZd	ee	j
 dd fd
dZedefddZedejfddZdS )WorkTileInfozA class to represent information about a work tile.

    :ivar tile_idx: The index of the tile.
    :type tile_idx: cute.Coord
    :ivar is_valid_tile: Whether the tile is valid.
    :type is_valid_tile: Boolean
    tile_idxis_valid_tilec                 C   s   || _ t|| _d S N)	_tile_idxr   _is_valid_tile)selfrN   rO   r#   r#   r$   __init__   s   zWorkTileInfo.__init__r   c                 C   s   t | j}|t | j |S rP   )r   rN   extendrO   rS   valuesr#   r#   r$   __extract_mlir_values__   s   
z$WorkTileInfo.__extract_mlir_values__rW   c                 C   s@   t |dksJ t| j|d d }t| j|d g}t||S )N   )lenr	   rQ   rR   rM   )rS   rW   new_tile_idxnew_is_valid_tiler#   r#   r$   __new_from_mlir_values__   s   
z%WorkTileInfo.__new_from_mlir_values__c                 C      | j S )zCheck latest tile returned by the scheduler is valid or not.

        Any scheduling requests after all tasks completed will return an invalid tile.

        :return: The validity of the tile.
        :rtype: Boolean
        )rR   rS   r#   r#   r$   rO      s   	zWorkTileInfo.is_valid_tilec                 C   r_   )zgGet the index of the tile.

        :return: The index of the tile.
        :rtype: cute.Coord
        )rQ   r`   r#   r#   r$   rN      s   zWorkTileInfo.tile_idxN)__name__
__module____qualname____doc__rE   Coordr   rT   listr   ValuerX   r^   propertyrO   rN   r#   r#   r#   r$   rM      s    
rM   c                   @   sx   e Zd ZdZe		dddddejdejded	efd
dZ	dd Z
dd Zeddddedeeeef fddZdS )PersistentTileSchedulerParamsa  A class to represent parameters for a persistent tile scheduler.

    This class is designed to manage and compute the layout of clusters and tiles
    in a batched gemm problem.

    :ivar cluster_shape_mn: Shape of the cluster in (m, n) dimensions (K dimension cta count must be 1).
    :type cluster_shape_mn: tuple
    :ivar problem_layout_ncluster_mnl: Layout of the problem in terms of
        number of clusters in (m, n, l) dimensions.
    :type problem_layout_ncluster_mnl: cute.Layout
    T   Nr   problem_shape_ntile_mnlcluster_shape_mnkraster_along_mswizzle_sizec                C   s  |d dkrt d|d  |dk rt d| || _|| _|dd | _|| _|| _|| _tjtj	| j|dd ||d||d| _
|dkrt| j
j|rUd|dfn|ddf}|rtj|d ||d | f|d f|d||d  f|d |d  f||d| _
n&tj||d | f|d |d fd||d  f||d |d  f||d| _
|dkrtj| j
||d}| j
jd }	| j
jd }
tj|||d| _tj|	||d| _tj|
||d| _dS d| _d| _d| _dS )	a  Initializes the PersistentTileSchedulerParams with the given parameters.

        :param problem_shape_ntile_mnl: The shape of the problem in terms of
            number of CTA (Cooperative Thread Array) in (m, n, l) dimensions.
        :type problem_shape_ntile_mnl: cute.Shape
        :param cluster_shape_mnk: The shape of the cluster in (m, n) dimensions.
        :type cluster_shape_mnk: cute.Shape
        :param swizzle_size: Swizzling size in the unit of cluster. 1 means no swizzle
        :type swizzle_size: int
        :param raster_along_m: Rasterization order of clusters. Only used when swizzle_size > 1.
            True means along M, false means along N.
        :type raster_along_m: bool

        :raises ValueError: If cluster_shape_k is not 1.
           rj   zunsupported cluster_shape_k z"expect swizzle_size >= 1, but get Nr   r   )strider   r   )
ValueErrorrk   _cluster_shape_mnkcluster_shape_mnrn   _raster_along_m_locrE   make_layoutceil_divproblem_layout_ncluster_mnlround_upshapesizefast_divmod_create_divisor	batch_fddcluster_shape_m_fddcluster_shape_n_fdd)rS   rk   rl   rm   rn   r   r   problem_shape_ncluster_mnlproblem_layout_sizecluster_count_mcluster_count_nr#   r#   r$   rT     sn   

z&PersistentTileSchedulerParams.__init__c           
      C   s   g g }| _ | j| j| j| jfD ]}t|}||7 }| j t| qg }g }td| j	fd| j
fd| jfgD ]\}\}}|d urOt|}	||	 || q7||7 }| j t| || _|S )Nr}   r~   r   )_values_posrk   rr   rt   rn   r   appendr[   	enumerater}   r~   r   rU   _fastdivmod_indices)
rS   rW   obj
obj_valuesfastdivmod_valuesfastdivmod_indicesifdd_namefdd_obj
fdd_valuesr#   r#   r$   rX   u  s4   

z5PersistentTileSchedulerParams.__extract_mlir_values__c                 C   s   g }t |}t| j| j| j| jg| jd d D ]\}}|t||d |  ||d  }qt	t
|d| ji}g d}t| drtt| jdkrtt| jD ]%\}}	||	 }
t| |
}|d urs|t|k rst||| g}t||
| qN|S )NrZ   r   )r}   r~   r   r   r   )rf   ziprk   rr   rt   rn   r   r   r	   ri   tupleru   hasattrr[   r   r   getattrsetattr)rS   rW   obj_listvalues_copyr   n_items
new_params	fdd_namesjoriginal_indexr   original_fddreconstructed_fddr#   r#   r$   r^     s.   	
z6PersistentTileSchedulerParams.__new_from_mlir_values__max_active_clustersr   c          
      C   sx   t dd t| jj| jD | jjd f }tj|||d}tj| j||d}|| }t||}|| }	g | j|	R S )a  Computes the grid shape based on the maximum active clusters allowed.

        :param max_active_clusters: The maximum number of active clusters that
            can run in one wave.
        :type max_active_clusters: Int32

        :return: A tuple containing the grid shape in (m, n, persistent_clusters).
            - m: self.cluster_shape_m.
            - n: self.cluster_shape_n.
            - persistent_clusters: Number of persistent clusters that can run.
        c                 s   s"    | ]\}}t || V  qd S rP   )rE   r{   ).0xyr#   r#   r$   	<genexpr>  s     z?PersistentTileSchedulerParams.get_grid_shape.<locals>.<genexpr>ro   r   )r   r   rx   rz   rs   rE   r{   r   )
rS   r   r   r   num_ctas_mnlnum_ctas_in_problemnum_ctas_per_clusternum_ctas_per_wavenum_persistent_ctasnum_persistent_clustersr#   r#   r$   get_grid_shape  s   

z,PersistentTileSchedulerParams.get_grid_shape)Trj   )ra   rb   rc   rd   r
   rE   ShapeboolintrT   rX   r^   r   r   r   r   r#   r#   r#   r$   ri      s*    i$%*ri   c                   @   sb  e Zd ZdZdedededejdef
ddZd	e	e
j fd
dZde	e
j d	d fddZeeddddedeeeef deeeef fddZeddddeded	eeeef fddZdddded	efddZdddded	eeeef fddZedddd	efddZedddd	efddZed ddd!d"efd#d$Zed	efd%d&ZdS )'StaticPersistentTileSchedulera  A scheduler for static persistent tile execution in CUTLASS/CuTe kernels.

    :ivar params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl
    :type params: PersistentTileSchedulerParams
    :ivar num_persistent_clusters: Number of persistent clusters that can be launched
    :type num_persistent_clusters: Int32
    :ivar cta_id_in_cluster: ID of the CTA within its cluster
    :type cta_id_in_cluster: cute.Coord
    :ivar _num_tiles_executed: Counter for executed tiles
    :type _num_tiles_executed: Int32
    :ivar _current_work_linear_idx: Current cluster index
    :type _current_work_linear_idx: Int32
    paramsr   current_work_linear_idxcta_id_in_clusternum_tiles_executedc                 C   s"   || _ || _|| _|| _|| _dS )a  Initializes the StaticPersistentTileScheduler with the given parameters.

        :param params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl.
        :type params: PersistentTileSchedulerParams
        :param num_persistent_clusters: Number of persistent clusters that can be launched.
        :type num_persistent_clusters: Int32
        :param current_work_linear_idx: Current cluster index.
        :type current_work_linear_idx: Int32
        :param cta_id_in_cluster: ID of the CTA within its cluster.
        :type cta_id_in_cluster: cute.Coord
        :param num_tiles_executed: Counter for executed tiles.
        :type num_tiles_executed: Int32
        N)r   r   _current_work_linear_idxr   _num_tiles_executed)rS   r   r   r   r   r   r#   r#   r$   rT     s
   
z&StaticPersistentTileScheduler.__init__r   c                 C   sN   t | j}|t | j |t | j |t | j |t | j |S rP   )r   r   rU   r   r   r   r   rV   r#   r#   r$   rX     s   
z5StaticPersistentTileScheduler.__extract_mlir_values__rW   c                 C   s   t |dksJ t| j|d g}t| j|d g}t| j|dd }t| j|d g}|dd  }t| j|}t|||||S N   r   rj   ro      )r[   r	   r   r   r   r   r   r   rS   rW   new_num_persistent_clustersnew_current_work_linear_idxnew_cta_id_in_clusternew_num_tiles_executedparams_valuesr   r#   r#   r$   r^     s   z6StaticPersistentTileScheduler.__new_from_mlir_values__Nr   	block_idxgrid_dimc                C   sv   t j|||dt j| j||d }|\}}}t|}	t|| jd  t|| jd  tdf}
td}t| ||	|
|S )a   Initialize the static persistent tile scheduler.

        :param params: Parameters for the persistent tile scheduler.
        :type params: PersistentTileSchedulerParams
        :param block_idx: The 3d block index in the format (bidx, bidy, bidz).
        :type block_idx: Tuple[Integer, Integer, Integer]
        :param grid_dim: The 3d grid dimensions for kernel launch.
        :type grid_dim: Tuple[Integer, Integer, Integer]

        :return: A StaticPersistentTileScheduler object.
        :rtype: StaticPersistentTileScheduler
        r   r   rj   )rE   r{   rs   r   r   )r   r   r   r   r   r   bidxbidybidzr   r   r   r#   r#   r$   create%  s   "
z$StaticPersistentTileScheduler.creater   c                C   s   | j |||dS )a  Calculates the grid shape to be launched on GPU using problem shape,
        threadblock shape, and active cluster size.

        :param params: Parameters for grid shape calculation.
        :type params: PersistentTileSchedulerParams
        :param max_active_clusters: Maximum active clusters allowed.
        :type max_active_clusters: Int32

        :return: The calculated 3d grid shape.
        :rtype: Tuple[Integer, Integer, Integer]
        r   )r   )r   r   r   r   r#   r#   r$   r   V  s   z,StaticPersistentTileScheduler.get_grid_shapec                C   s   |t j| jj||dk }| jjdkr| j|||d}n
| jjj|||d}tdd t|| j	g | jj
tdR D }t||S )h  Compute current tile coord given current_work_linear_idx and cta_id_in_cluster.

        :param current_work_linear_idx: The linear index of the current work.
        :type current_work_linear_idx: Int32

        :return: An object containing information about the current tile coordinates
            and validity status.
        :rtype: WorkTileInfo
        r   rj   c                 s   s.    | ]\}}}t |t | t | V  qd S rP   )r   )r   r   r   zr#   r#   r$   r     s
    
zQStaticPersistentTileScheduler._get_current_work_for_linear_idx.<locals>.<genexpr>)rE   r{   r   rx   rn   %_get_cluster_work_idx_with_fastdivmodget_flat_coordr   r   r   rs   r   rM   )rS   r   r   r   is_validcur_cluster_coordcur_tile_coordr#   r#   r$    _get_current_work_for_linear_idxm  s   
	z>StaticPersistentTileScheduler._get_current_work_for_linear_idxc          
      C   s@   t || jj\}}t || jj\}}t || jj\}}	||	|fS )a  FastDivmod optimized CLUSTER coordinate calculation.

        CRITICAL: This should mimic problem_layout_ncluster_mnl.get_hier_coord()
        which returns CLUSTER coordinates, not tile coordinates!

        :param current_work_linear_idx: Linear index in the work space
        :type current_work_linear_idx: Int32
        :return: Cluster coordinates (m, n, l) or None if FastDivmod not available
        :rtype: Tuple[Int32, Int32, Int32] or None
        )divmodr   r}   r~   r   )
rS   r   r   r   work_iterationwork_unit_idcluster_n_batch	cluster_mbatch_l	cluster_nr#   r#   r$   r     s   
zCStaticPersistentTileScheduler._get_cluster_work_idx_with_fastdivmodc                C   s   | j | j||dS Nr   )r   r   rS   r   r   r#   r#   r$   get_current_work  s   z.StaticPersistentTileScheduler.get_current_workc                C   s   | j ||dS r   )r   r   r#   r#   r$   initial_work_tile_info  s   z4StaticPersistentTileScheduler.initial_work_tile_inforj   )advance_countr   r   r   c                C   s2   |  j t|t| j 7  _ |  jtd7  _d S )Nrj   )r   r   r   r   )rS   r   r   r   r#   r#   r$   advance_to_next_work  s   z2StaticPersistentTileScheduler.advance_to_next_workc                 C   r_   rP   )r   r`   r#   r#   r$   r     s   z0StaticPersistentTileScheduler.num_tiles_executed)ra   rb   rc   rd   ri   r   rE   re   rT   rf   r   rg   rX   r^   staticmethodr
   r   r   r   r   rM   r   r   r   r   r   r   rh   r   r#   r#   r#   r$   r     s\    
/$!r   c                       s   e Zd ZdZ	ddedededejdedef fd	d
Z	de
ej dd fddZee	dddddedeeeef deeeef defddZddddedefddZ  ZS )$StaticPersistentRuntimeTileSchedulera  A scheduler for static persistent runtime tile execution in CUTLASS/CuTe kernels.

    This scheduler will always launch all the SMs and the scheduler will generate
    the real tile info for each SM.

    :ivar params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl
    :type params: PersistentTileSchedulerParams
    :ivar num_persistent_clusters: Number of persistent clusters that can be launched
    :type num_persistent_clusters: Int32
    :ivar cta_id_in_cluster: ID of the CTA within its cluster
    :type cta_id_in_cluster: cute.Coord
    :ivar _num_tiles_executed: Counter for executed tiles
    :type _num_tiles_executed: Int32
    :ivar _current_work_linear_idx: Current cluster index
    :type _current_work_linear_idx: Int32
    rj   r   r   r   r   r   
inner_modec                    s4   t  ||||| |dvrtd| || _dS )aF  Initializes the StaticPersistentRuntimeTileScheduler with the given parameters.

        :param params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl.
        :type params: PersistentTileSchedulerParams
        :param num_persistent_clusters: Number of persistent clusters that can be launched.
        :type num_persistent_clusters: Int32
        :param current_work_linear_idx: Current cluster index.
        :type current_work_linear_idx: Int32
        :param cta_id_in_cluster: ID of the CTA within its cluster.
        :type cta_id_in_cluster: cute.Coord
        :param num_tiles_executed: Counter for executed tiles.
        :type num_tiles_executed: Int32
        :param inner_mode: The inner mode along which the linear index will be decomposed first.
        :type inner_mode: int
        )r   rj   z;inner_mode must be 0(for M mode) or 1(for N mode), but got N)superrT   rq   r   )rS   r   r   r   r   r   r   	__class__r#   r$   rT     s   
z-StaticPersistentRuntimeTileScheduler.__init__rW   r   c                 C   s   t |dksJ t| j|d g}t| j|d g}t| j|dd }t| j|d g}|dd  }t| j|}t|||||| jS r   )	r[   r	   r   r   r   r   r   r   r   r   r#   r#   r$   r^     s   z=StaticPersistentRuntimeTileScheduler.__new_from_mlir_values__Nr   r   r   c                C   sx   t j|||dt j| j||d }|\}}}	t|	}
t|| jd  t|| jd  tdf}td}t| ||
|||S )a  Initialize the static persistent tile scheduler.

        :param params: Parameters for the persistent tile scheduler.
        :type params: PersistentTileSchedulerParams
        :param block_idx: The 3d block index in the format (bidx, bidy, bidz).
        :type block_idx: Tuple[Integer, Integer, Integer]
        :param grid_dim: The 3d grid dimensions for kernel launch.
        :type grid_dim: Tuple[Integer, Integer, Integer]
        :param inner_mode: The inner mode along which the linear index will be decomposed first.
        :type inner_mode: int

        :return: A StaticPersistentRuntimeTileScheduler object.
        :rtype: StaticPersistentRuntimeTileScheduler
        r   r   rj   )rE   r{   rs   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r#   r$   r     s    "
z+StaticPersistentRuntimeTileScheduler.createc          
      C   s   | j jj}d}t| jdkrtj||d f|d dfd}ntj|d |fd|d fd}||}|d |d tdf}d}	t	||	S )r   irj   )rp   r   T)
r   rx   rz   r   r   rE   rv   get_hier_coordr   rM   )
rS   r   r   r   ntile_shapeint_maxntile_layoutcluster_tile_coord_mnr   r   r#   r#   r$   r   9  s   

 

zEStaticPersistentRuntimeTileScheduler._get_current_work_for_linear_idx)rj   )ra   rb   rc   rd   ri   r   rE   re   r   rT   rf   r   rg   r^   r   r
   r   r   r   rM   r   __classcell__r#   r#   r   r$   r     sB    #"3r   )Nr&   FN)F)'rd   typingr   r   cutlass.cutlass_dslr   r   r   r   r   r	   r
   r   cutlass._mlirr   cutlass._mlir.dialectsr   r   r   r   cutlass.cute.typingr   	CuteInt32cutlass.cuterE   r8   floatr   r%   r   r/   r?   rB   rK   rL   rM   ri   r   r   r#   r#   r#   r$   <module>   sx   (



"
)
"
((
- ` _