# Reconstructed Python source for flashinfer/cute_dsl/blockscaled_gemm.py (the
# file here was a compiled CPython 3.10 artifact; bodies that could not be
# recovered from it are stubbed with `...` and summarized in comments).

from typing import Callable, List, Optional, Tuple, Type, Union

import cuda.bindings.driver as cuda
import cutlass
import cutlass.cute as cute
import cutlass.pipeline as pipeline
import cutlass.torch as cutlass_torch
import cutlass.utils as utils
import cutlass.utils.blackwell_helpers as sm100_utils
import cutlass.utils.blockscaled_layout as blockscaled_utils
import functools
import torch
from cutlass._mlir import ir
from cutlass._mlir.dialects import llvm
from cutlass.cute.nvgpu import cpasync, tcgen05
from cutlass.cute.runtime import from_dlpack
from cutlass.cutlass_dsl import (
    Int32,
    Int64,
    Uint8,
    Uint64,
    T,
    Integer,
    dsl_user_op,
    extract_mlir_values,
    new_from_mlir_values,
)
from cutlass.utils.static_persistent_tile_scheduler import WorkTileInfo

from flashinfer.api_logging import flashinfer_api
from flashinfer.utils import get_compute_capability

from .utils import (
    get_cutlass_dtype,
    cutlass_to_torch_dtype,
    get_num_sm,
    make_ptr,
)


@dsl_user_op
def with_byte(obj: Int32, *, index: Integer, value: Integer, loc=None, ip=None) -> Int32:
    """Return ``obj`` with the byte at position ``index`` replaced by ``value``."""
    obj &= ~(0xFF << (index * 8))
    obj |= value << (index * 8)
    assert isinstance(obj, Int32), f"obj={obj}"
    return obj


@dsl_user_op
def read_byte(obj: Int64, *, index: Integer, loc=None, ip=None) -> Int32:
    """Return the byte of ``obj`` at position ``index`` as a 32-bit integer."""
    return ((obj >> (index * 8)) & 0xFF).to(Int32)


@dsl_user_op
def atomic_add_release_global(addr: Int64, value: Int32, *, loc=None, ip=None):
    """Atomic 32-bit add to global memory with release semantics (PTX red/atom)."""
    return llvm.inline_asm(
        T.i32(),
        [addr.ir_value(loc=loc, ip=ip), value.ir_value(loc=loc, ip=ip)],
        "atom.add.release.gpu.global.s32 $0, [$1], $2;",
        "=r,l,r",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
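
# Illustrative example (added; a plain-Python model of the two byte helpers
# above, which the scheduler uses to pack one per-batch counter into each byte
# of a packed integer -- the real helpers operate on CuTe DSL Int32/Int64):
#
#     packed = 0
#     packed = (packed & ~(0xFF << 8)) | (0x2A << 8)   # with_byte(packed, index=1, value=0x2A)
#     assert (packed >> 8) & 0xFF == 0x2A              # read_byte(packed, index=1) == 0x2A
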
dd	Z
d
d Zdd Zeddddedeeeef fddZdS )MaskedSchedulerParamsNr   masked_mdst_signalscc_tilercluster_shape_mnkc          
      C   s   |d dkrt d|d  tj||d}|d j}	|| _|| _|| _|| _|	| _|| _	|d d | _
|| _tjtj| j|d d ||d||d| _d S )N   r   zunsupported cluster_shape_k )tiler)r   NNNr   )
ValueErrorcutezipped_divideshaper9   r:   r;   r<   problem_shape_ntile_mnl_cluster_shape_mnkcluster_shape_mn_locmake_layoutceil_divproblem_layout_ncluster_mnl)
selfr9   r:   r;   r<   r=   r    r!   gcrE   r)   r)   r*   __init__c   s&   
zMaskedSchedulerParams.__init__c                 C   sN   g g }| _ | j| j| j| j| jfD ]}t|}||7 }| j t| q|S N)	_values_posr9   r:   r;   r<   rF   r   appendlen)rL   valuesr"   
obj_valuesr)   r)   r*   __extract_mlir_values__   s   z-MaskedSchedulerParams.__extract_mlir_values__c                 C   sj   g }t | j| j| j| j| jg| jddD ]\}}|t||d |  ||d  }qt	t
|d| jiS )NTstrictr    )zipr9   r:   r;   r<   rF   rP   rQ   r   r8   tuplerH   )rL   rS   obj_listr"   n_itemsr)   r)   r*   __new_from_mlir_values__   s   z.MaskedSchedulerParams.__new_from_mlir_values__max_active_clustersr%   c                C   s   |}g | j |R S rO   )rG   )rL   r]   r    r!   num_persistent_clustersr)   r)   r*   get_grid_shape   s   z$MaskedSchedulerParams.get_grid_shape)__name__
__module____qualname__rB   Tensorr   Pointerr   intShaperN   rU   r\   r   r
   r   r_   r)   r)   r)   r*   r8   b   s.    	

"r8   c                   @   s  e Zd Zdedededededejdefdd	Zd
ee	j
 fddZdee	j
 d
d fddZeeddddedeeeef deeeef fddZeddddeded
eeeef fddZej	d)dedee dee dee d
eeee f f
ddZe			d*ddddee dee dee d
eeee f fddZedddd
efd d!Zed"ddd#d$efd%d&Zed
efd'd(ZdS )+MaskedSchedulerparamsr^   current_work_linear_idxcurrent_batch_idxaccum_tile_mcta_id_in_clusternum_tiles_executedc                 C   s.   || _ || _|| _|| _|| _|| _|| _d S rO   )rh   r^   _current_work_linear_idx_current_batch_idx_accum_tile_mrl   _num_tiles_executed)rL   rh   r^   ri   rj   rk   rl   rm   r)   r)   r*   rN      s   

zMaskedScheduler.__init__r%   c                 C   s^   t | j}|t | j |t | j |t | j |t | j |t | j |S rO   )r   r^   extendrn   ro   rp   rl   rq   )rL   rS   r)   r)   r*   rU      s   
z'MaskedScheduler.__extract_mlir_values__rS   c                 C   s   t |dksJ t| j|d g}t| j|d g}t| j|d g}t| j|d g}t| j|dd }t| j|d g}t| j	||||||S )Nr'   r   r   r>      r      )
rR   r   r^   rn   ro   rp   rl   rq   rg   rh   )rL   rS   new_num_persistent_clustersnew_current_work_linear_idxnew_current_batch_idxnew_accum_tile_mnew_cta_id_in_clusternew_num_tiles_executedr)   r)   r*   r\      s4   z(MaskedScheduler.__new_from_mlir_values__Nr   	block_idxgrid_dimc                C   s   | } t j|||dt j| j||d }|\}}}t|}	td}
td}t|| jd  t|| jd  tdf}td}t| ||	|
|||S )Nr   r   r   )rB   sizerG   r
   rg   )rh   r{   r|   r    r!   r^   bidxbidybidzri   rj   rk   rl   rm   r)   r)   r*   create   s,   

zMaskedScheduler.creater]   c                C   s   | j |||dS Nr   )r_   )rh   r]   r    r!   r)   r)   r*   r_     s   zMaskedScheduler.get_grid_shapedsm_pending_packeddsm_counternum_c_stagec                 C   s  | j jd }| j}| j}|t| j j| | j jd  | |krt|| j jjd k rtt	
|d uo4| j jd urAt||||d  d}|t| j j| | j jd 7 }|td7 }|t| j j| | j jd  | |krt|| j jjd k s)|| _|| _| j| j jjd k }|r| jt| j j| j | j jd  | |k}|| | j || | jf}	tdd t|	| jg | j jtdR ddD }
t|
||fS )Nr   r   )r#   r$   c                 s   s.    | ]\}}}t |t | t | V  qd S rO   )r
   ).0xyzr)   r)   r*   	<genexpr>[  s
    
zCMaskedScheduler._get_current_work_for_linear_idx.<locals>.<genexpr>TrV   )rh   rE   rp   ro   rB   rJ   r9   r<   rD   cutlass
const_exprr:   r+   r
   rY   rX   rl   rG   r   )rL   ri   r   r   r   num_tiles_nrk   	batch_idxis_validcur_cluster_coordcur_tile_coordr)   r)   r*    _get_current_work_for_linear_idx  sv   



z0MaskedScheduler._get_current_work_for_linear_idxc                C   s   | j | j|||dS )Nr   r   r   )r   rn   )rL   r   r   r   r    r!   r)   r)   r*   get_current_workg  s   
z MaskedScheduler.get_current_workc                C   s   | j ||d\}}|S r   )r   )rL   r    r!   	tile_info_r)   r)   r*   initial_work_tile_infox  s   z&MaskedScheduler.initial_work_tile_infor   )advance_countr    r!   r   c                C   s2   |  j t|t| j 7  _ |  jtd7  _d S Nr   )rn   r
   r^   rq   )rL   r   r    r!   r)   r)   r*   advance_to_next_work}  s   
z$MaskedScheduler.advance_to_next_workc                 C   s   | j S rO   )rq   rL   r)   r)   r*   rm     s   z"MaskedScheduler.num_tiles_executedrO   r@   )r`   ra   rb   r8   r
   rB   CoordrN   listr   ValuerU   r\   r   staticmethodr   r   r   r_   jitr   r   r   re   r   r   r   r   r   propertyrm   r)   r)   r)   r*   rg      s    
	*
Irg   c                -   @   s6  e Zd ZdZdedeeef deeef defddZdd	 Ze	j
d
e	jde	jde	jde	jde	jde	jdee	j dee	j dejdejfddZe	jde	jde	jde	jde	jde	jde	jde	jde	jde	jde	jd ee	j d!e	jd"ee	j d#e	jd$e	jd%e	jd&e	jd'e	jd(e	jd)ee	je	jd*f d+e	jd,ef,d-d.Zd/e	jd0e	jd1ee	je	je	jf fd2d3Zd4ejd5e	jd6e	jd+e	jd7eejef d1ee	je	je	jf fd8d9Z d:e	jd;e	jd4ejd<e	jd1ee	je	je	jf f
d=d>Z!d4ejd?ee	je	jf d6e	jd+e	jd<e	jd1ee	je	je	jf fd@dAZ"e#de	jdBeeeef dCe$ej% dDe&j'dEe$ej% dFe&j'd+e	jdGe$ej% dHe(j)dIe$ej% dedJedKed1eeeef fdLdMZ*e#de	jdee	j dNe	jdOeeeef deeef dejd1eeeeeef f fdPdQZ+e#dRe$ej% dIe$ej% dedGe$ej% d1ef
dSdTZ,e#dRe$ej% dGe$ej% dUedVedWed1efdXdYZ-e#deeef deeef d1efdZd[Z.e#d\ed]ed^ed_edRe$ej% dGe$ej% dUedVedWed1efd`daZ/e#dRe$ej% dIe$ej% dedGe$ej% deeef deeef d\ed]ed^ed_edUedVedWed1efdbdcZ0d*S )d)Sm100BlockScaledPersistentDenseGemmKernelak  This class implements batched matrix multiplication (C = A x SFA x B x SFB) with support for various data types
    and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

    :param sf_vec_size: Scalefactor vector size.
    :type sf_vec_size: int
    :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
    :type mma_tiler_mn: Tuple[int, int]
    :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
    :type cluster_shape_mn: Tuple[int, int]

    :note: In current version, A and B tensor must have the same data type
        - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported

    :note: Supported combinations of A/B data types, SF data types and SF vector size:
        - MXF8: A/B: Float8E5M2/Float8E4M3FN + SF: Float8E8M0FNU + sf_vec_size: 32
        - MXF4: A/B: Float4E2M1FN + SF: Float8E8M0FNU + sf_vec_size: 32
        - NVF4: A/B: Float4E2M1FN + SF: Float8E8M0FNU/Float8E4M3FN + sf_vec_size: 16

    :note: Supported accumulator data types:
        - Float32

    :note: Supported C data types:
        - Float32
        - Float16/BFloat16
        - Float8E4M3FN/Float8E5M2
    :note: Constraints:
        - MMA tiler M must be 128 or 256 (use_2cta_instrs)
        - MMA tiler N must be 128/256
        - Cluster shape M must be multiple of 2 if Mma tiler M is 256
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 16
        - Also, Cluster shape M/N must be <= 4 for scale factor multicasts due to limited size of scale factors

    Example:
        >>> gemm = Sm100BlockScaledPersistentDenseGemmKernel(
        ...     sf_vec_size=16,
        ...     mma_tiler_mn=(256, 128),
        ...     cluster_shape_mn=(2, 1)
        ... )
        >>> gemm(a_tensor, b_tensor, sfa_tensor, sfb_tensor, c_tensor, max_active_clusters, stream)
    """

    def __init__(
        self,
        sf_vec_size: int,
        mma_tiler_mn: Tuple[int, int],
        cluster_shape_mn: Tuple[int, int],
        sm_version: str,
    ):
        """Initializes the configuration for a Blackwell dense GEMM kernel.

        This configuration includes several key aspects:

        1.  MMA Instruction Settings (tcgen05):
            - acc_dtype: Data types for MMA accumulator, always set to Float32
            - sf_vec_size: Scalefactor A/B vector size.
            - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler.

        2.  Cluster Shape:
            - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.

        :param sf_vec_size: Scalefactor vector size.
        :type sf_vec_size: int
        :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
        :type cluster_shape_mn: Tuple[int, int]
        """
        supported_sm_versions = ["sm_100", "sm_103"]
        assert (
            sm_version in supported_sm_versions
        ), f"{supported_sm_versions} are the only supported SM versions for cute-dsl backend, but encountered {sm_version}"
        self.acc_dtype = cutlass.Float32
        self.sf_vec_size = sf_vec_size
        self.use_2cta_instrs = mma_tiler_mn[0] == 256
        self.cluster_shape_mn = cluster_shape_mn
        # The K mode of the MMA tiler is fixed; M/N come from the caller.
        self.mma_tiler = (*mma_tiler_mn, 128)
        self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
        self.occupancy = 1
        # Warp specialization: four epilogue warps, one MMA warp, one TMA warp.
        self.epilog_warp_id = (0, 1, 2, 3)
        self.mma_warp_id = 4
        self.tma_warp_id = 5
        self.threads_per_cta = 32 * len((self.mma_warp_id, self.tma_warp_id, *self.epilog_warp_id))
        self.cta_sync_bar_id = 0
        self.epilog_sync_bar_id = 1
        self.tmem_ptr_sync_bar_id = 2
        self.buffer_align_bytes = 1024
        self.smem_capacity = utils.get_smem_capacity_in_bytes(sm_version)
        SM100_TMEM_CAPACITY_COLUMNS = 512
        self.num_tmem_alloc_cols = SM100_TMEM_CAPACITY_COLUMNS

    def _setup_attributes(self):
        """Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B/SFA/SFB
        - Computing epilogue subtile
        - Setting up A/B/SFA/SFB/C stage counts in shared memory
        - Computing A/B/SFA/SFB/C shared memory layout
        - Computing tensor memory allocation columns
        """
        # Derives, in order: the blockscaled tiled MMA (via
        # sm100_utils.make_blockscaled_trivial_tiled_mma) and its SFB-broadcast
        # variant, the CTA tile shape and cluster layouts, the A/B/SFB multicast
        # flags, the epilogue subtile, the ACC/AB/C stage counts (via
        # self._compute_stages) and the staged smem layouts for A, B, SFA, SFB
        # and C.  The full derivation is not reproduced here.
        ...

    @cute.jit
    def __call__(
        self,
        a_tensor: cute.Tensor,
        b_tensor: cute.Tensor,
        sfa_tensor: cute.Tensor,
        sfb_tensor: cute.Tensor,
        c_tensor: cute.Tensor,
        masked_m_tensor: cute.Tensor,
        dst_signals: Optional[cute.Pointer],
        alpha_tensor: Optional[cute.Tensor],
        max_active_clusters: cutlass.Constexpr,
        stream: cuda.CUstream,
    ):
        """Execute the GEMM operation in steps:
        - Setup static attributes before smem/grid/tma computation
        - Setup TMA load/store atoms and tensors
        - Compute grid size with regard to hardware constraints
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param a_tensor: Input tensor A
        :type a_tensor: cute.Tensor
        :param b_tensor: Input tensor B
        :type b_tensor: cute.Tensor
        :param sfa_tensor: Scale factor tensor A
        :type sfa_tensor: cute.Tensor
        :param sfb_tensor: Scale factor tensor B
        :type sfb_tensor: cute.Tensor
        :param c_tensor: Output tensor C
        :type c_tensor: cute.Tensor
        :param masked_m_tensor: Masked layout tensor M
        :type masked_m_tensor: cute.Tensor
        :param max_active_clusters: Maximum number of active clusters
        :type max_active_clusters: cutlass.Constexpr
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :param alpha_tensor: Optional 1D tensor of shape (l,) containing per-batch scaling factors.
        :type alpha_tensor: cute.Tensor
        :raises TypeError: If input data types are incompatible with the MMA instruction.
        """
        zType must match: z != Nr>   NNNr   )internal_typeNNr      c                       sd  e Zd ZU ejjej jf e	d< ejjej jf e	d< ejjej j
f e	d< ejjej j
f e	d< eje	d< eje	d< ejjejj je jjf  jf e	d< ejjejj je jjf  jf e	d< ejjejj je jjf  jf e	d	< ejjejj je jf  jf e	d
< ejjejj je jf  jf e	d< dS )zISm100BlockScaledPersistentDenseGemmKernel.__call__.<locals>.SharedStorageab_full_mbar_ptrab_empty_mbar_ptracc_full_mbar_ptracc_empty_mbar_ptrtmem_dealloc_mbar_ptrtmem_holding_bufsCsAsBsSFAsSFBN)r`   ra   rb   rB   structMemRanger   r   r   __annotations__r   r
   Alignr   cosizer   outerbuffer_align_bytesr   r   r   r   r   r   r   r)   r   r)   r*   SharedStorage  sX   
 

r  r   )gridblockclustersmemr   )Belement_typer   r   r   r   r   
LayoutEnumfrom_tensormma_major_moder   r   r   r   r   	TypeErrorr   r   tile_atom_to_shape_SFrD   r   rB   make_tensoriteratorr   r   r   r   r   r   r   r   r   r}   r   cluster_shape_to_tma_atom_ArG   slice_r   make_tiled_tma_atom_Ar   r   cluster_shape_to_tma_atom_Br   make_tiled_tma_atom_Br   Int16cluster_shape_to_tma_atom_SFBr   r   r   size_in_bytesnum_tma_load_bytesr   r   make_tiled_tma_atomCopyBulkTensorTileS2GOpr   _compute_gridr   tile_sched_paramsr  r  shared_storagekernellaunchr   ))rL   r   r   r   r   r   r   r:   r   r]   r   
sfa_layout
sfb_layoutr   r   atom_thr_sizea_opa_smem_layout
tma_atom_atma_tensor_ab_opb_smem_layout
tma_atom_btma_tensor_bsfa_opsfa_smem_layouttma_atom_sfatma_tensor_sfasfb_opsfb_smem_layouttma_atom_sfbtma_tensor_sfba_copy_sizeb_copy_sizesfa_copy_sizesfb_copy_sizeepi_smem_layout
tma_atom_ctma_tensor_cr  r  r)   r   r*   __call__  s*  *

	





	,
z2Sm100BlockScaledPersistentDenseGemmKernel.__call__r   r   r1  mA_mklr5  mB_nklr9  mSFA_mklr=  mSFB_nklrD  mC_mnlalphar   r   r   r   r   r   r   Nr   r(  c                 C   sb  t j }t j|}|| jkr)t| t| t| t|	 t| t |jj	dk}t j
 \}}}|t |jj	 }|dk}t jt j }||}||} t j \}!}"}"t }#|#| j}$|$j}%|$j}&ttjj}'| j| j d }(ttjj|(})tjj|$j | j|'|)| j|d}*ttjj}+t | j!|rdnd },ttjj|,}-tj"j|$j# | j$|+|-|d}.|r|| jkrd}/t j%  t j&|%|/ W d   n1 sw   Y  t j'  t | j(dkrt j)  |$j*j+|j,|j-d}0|$j.j+|j,|j-d}1|$j/j+|j,|j-d}2|$j0+|}3|$j1+|}4d}5d}6d}7d}8t23| j4p7| j5p7|rZtj6||dd	}5tj6||dd	}6tj6||dd	}7tj6|| dd	}8t 7|t 8| j9d
d}9t 7|t 8| j9dd}:t 7|t 8| j9d
d};t 7|
t 8| j9dd}<t 7|t 8| j9dd}=t j|9dgd}>|:|}?|:|}@|?;|9}A|?<|:}B|?;|;}C|@<|<}D|?=|=}Et >t 8|dj	}Ft?||d |Ft @|1ddt @|Add\}G}Ht >t 8|dj	}It?||d |It @|2ddt @|Bdd\}J}K|F}Lt jAj?||d |Lt @|3ddt @|Cdd\}M}Nt B|M}Mt B|N}Nt >t 8|dj	}Ot jAj?|	| d |Ot @|4ddt @|Ddd\}P}Qt B|P}Pt B|Q}Q|C|1}R|D|2}S|E| j9dd }T|Ft G|T| j$}Ut | j(dkrt jH  n
t jjI| jJ| jKd || jkrtL|t j
 t jM }V|VN }WtOtjPjQ| j}X|WjRr|WjS}Y|Yd t |jj	 |Yd |Yd f}Z|Hd|Zd d|Zd f }[|Kd|Zd d|Zd f }\|Nd|Zd d|Zd f }]|Qd|Zd d|Zd f }^|XT  t2Ud}_|XjV|>k r	|*W|X}_t2jXd|>dddD ]y}`|*Y|X|_ t jZ||[d|XjVf |Gd|Xj[f |*\|X|5d t jZ||\d|XjVf |Jd|Xj[f |*\|X|6d t jZ||]d|XjVf |Md|Xj[f |*\|X|7d t jZ|	|^d|XjVf |Pd|Xj[f |*\|X|8d |X]  t2Ud}_|XjV|>k r|*W|X}_q|V^  |V_ \}W}"|WjRs|*`|X || jakrEdt | jag| j!R  }at jjI| jb|ad t jjc| jdd|&d}bt e|b|Ujf}ct jg|bthi|c | jjd}dtkl|| j9| jmt 8|d}et e|d|e}ft jg|bthi|c thi|f | jjd}gtkn|| j9| jmt 8|d}ht e|g|h}i| o|3|f\}j}k}l| o|4|i\}m}n}otL|t j
 t jM }V|VN }WtOtjPjp| j}ptOtjPjQ| j$}q|WjRr@|WjS}Y|Yd t |jj	 |Yd |Yd f}Z|cddd|qj[f }r|pT  t2Ud}s|pjV|>k r|r|*q|p}s|r|.Y|q |rthjsjtd t2u|>D ]}`|r|*v|p|s dddd|pj[f}t|k|t }u|n|t }vt Z|j|u|l t Z|m|v|o t j|Rdgd}wt2jX|wddD ]:}xdd|x|pj[f}ydd|xf}z|rthjsjw|f|z jx |rthjsjy|i|z jx t z||r|R|y |S|y |r |rthjsjtd q|*{|p |p]  t2Ud}s|pjV|>k r$|r$|*q|p}sq|r.|.||q |q]  |V^  |V_ \}W}"|WjRsK|.`|q || jak r/|| j!d kr]t jj}| j~|&|d dt | jag| j!R  }at jjI| jb|ad t jjc| jdd|&d}bt e|b|Ujf}c|!}{| |{|c|E||\}|}}}~t |~j	| j}| ||||{|0\}}}| |{||E||0\}}}tL|t j
 t jM }V|VN }WtOtjPjp| j$}ttjjdt | j! dt | j! }tjj| j|d}t23|jdur| jdk sJ d|jj	d }|d ksJ d!td}td}td}|WjRr|WjS}Y|Yd t |jj	 |Yd |Yd f}Z|dddg|ZR  }|}ddddd|j[f }|.v| t @|dt |}t @|dt |}t j|j	dgd}|Vj| }t2X|D ]}|ddd|f }t Z||||~ ||~ }t23|dur|||WjSd   }|| j}|| || | j }t Z|||ddd|f  t jjt jjjt jjjd" dt | j! }t jjI| j|d || j!d krt Z||d|f |d|f  ||  t23|jdur|d t}t|||k}|rt jj| jd dd# n	|Y  n|Y  t jjI| j|d t23|jdurl|!d }|| j!d krl|dkrl||k rlt|||krlt|j t|  dd$ |d7 }||k rlt|||ksNqnt j%  |.{| W d   n	1 sw   Y  |]  |V^  |Vj_||| jd%\}W}|WjRs|| j!d krt jj|d dt | j! }t jjI| j|d || j!d kr|rt j|%|dA  t j|%d t jj|b| j~|d t23|jdur)t jjddd# |!d }|| j!d kr#|dkr%||k r't|j t|  dd$ |d7 }||k sdS dS dS dS |`  dS dS )&zW
        GPU device kernel performing the Persistent batched GEMM computation.
        r>   r   r   )barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk)rM  rN  rO  rP  rR  r   N)swizzle)
mcast_mode)Nr   Nr@   )r   NNr   rs   )mode)r   r   Nr   )r   Nr   r   )
barrier_idnumber_of_threads)unroll)tma_bar_ptr
mcast_mask   )	alignmentptr_to_buffer_holding_addr)dtyper   FT)unroll_full)
is_two_cta)rN  rO  r   zmust be representable in 1 byter'   zneed to be packable into a u64)space)read)r$   r   )rB   archwarp_idxmake_warp_uniformr   r   prefetch_descriptorr}   r   rD   r{   block_idx_in_clusterget_flat_coord
thread_idxr   SmemAllocatorallocater)  r  r  pipelineCooperativeGroupAgentThreadr   r   PipelineTmaUmmar   r   data_ptrr   r$  rR   r   PipelineUmmaAsyncr   r   	elect_onembarrier_initmbarrier_init_fencerG   cluster_arrive_relaxedr  
get_tensorr  innerr  r  r  r  r   r   r   r   create_tma_multicast_mask
local_tiler  r   	get_slicepartition_Apartition_Bpartition_CrI   tma_partitiongroup_modesr   filter_zerosmake_fragment_Amake_fragment_Bpartition_shape_Cmake_fragment_CrQ   cluster_waitbarrierr   r   rg   r|   r   make_pipeline_statePipelineUserTypeProduceris_valid_tiletile_idxreset_countBooleancountproducer_try_acquirerangeproducer_acquirecopyr#   producer_get_barrieradvancer   r   producer_tailr   r   retrieve_tmem_ptrr   r  layout
recast_ptrr   find_tmem_tensor_col_offsetr   r   make_tmem_layout_sfar   make_tmem_layout_sfbmainloop_s2t_copy_and_partitionConsumerconsumer_try_waitsetField
ACCUMULATErange_constexprconsumer_waitSFAr  SFBgemmconsumer_releaseproducer_commit
alloc_tmemr   epilog_tmem_copy_and_partitionmake_fragmentr   epilog_smem_copy_and_partitionepilog_gmem_copy_and_partitionPipelineTmaStorer   r:   r9   r   r
   r   rankrm   retileloadr,   storefence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctar   r-   cp_async_bulk_wait_groupr7   toint
sizeof_i32relinquish_tmem_alloc_permitmbarrier_arrivembarrier_waitdealloc_tmem)rL   r   r   r1  rG  r5  rH  r9  rI  r=  rJ  rD  rK  rL  r   r   r   r   r   r   r   r   r(  rd  r   r~   r   r   mma_tile_coord_vis_leader_ctacta_rank_in_clusterblock_in_cluster_coord_vmnkblock_in_cluster_coord_sfb_vmnktidxr   r  storager  r  ab_pipeline_producer_groupnum_tma_producerab_pipeline_consumer_groupab_pipelineacc_pipeline_producer_groupnum_acc_consumer_threadsacc_pipeline_consumer_groupacc_pipelinenum_tmem_dealloc_threadsr  r  r  r  r  a_full_mcast_maskb_full_mcast_masksfa_full_mcast_masksfb_full_mcast_maskgA_mklgB_nklgSFA_mklgSFB_nklgC_mnlk_block_cntthr_mmathr_mma_sfbtCgAtCgBtCgSFAtCgSFBtCgCa_cta_layouttAsAtAgAb_cta_layouttBsBtBgBsfa_cta_layouttAsSFAtAgSFAsfb_cta_layouttBsSFBtBgSFBtCrAtCrB	acc_shapetCtAcc_fake
tile_sched	work_tileab_producer_stater   mma_tile_coord_mnl
tAgA_slice
tBgB_slicetAgSFA_slicetBgSFB_slicepeek_ab_empty_statusk_blocktmem_ptr_read_threadsacc_tmem_ptrtCtAcc_basesfa_tmem_ptrtCtSFA_layouttCtSFAsfb_tmem_ptrtCtSFB_layouttCtSFBtiled_copy_s2t_sfatCsSFA_compact_s2ttCtSFA_compact_s2ttiled_copy_s2t_sfbtCsSFB_compact_s2ttCtSFB_compact_s2tab_consumer_stateacc_producer_statetCtAccpeek_ab_full_statuss2t_stage_coordtCsSFA_compact_s2t_stagedtCsSFB_compact_s2t_stagednum_kphases
kphase_idxkphase_coordsf_kphase_coordepi_tidxtiled_copy_t2rtTR_tAcc_basetTR_rAcctTR_rCtiled_copy_r2stRS_rCtRS_sCbSG_sCbSG_gC_partitionedacc_consumer_statec_producer_group
c_pipelinenum_expertsr   dsm_pending_idxr   bSG_gCtTR_tAccsubtile_cntnum_prev_subtilessubtile_idxtTR_tAcc_mnacc_vecc_bufferepilog_threadswill_write_signalslane_idr)   r)   r*   r*    sp  








	









	








	

]	








	



{	


	










  
	  z0Sm100BlockScaledPersistentDenseGemmKernel.kernelsSFtSFr%   c                 C   sj   t |}t |}t t| j| j}t||}|d}|	|}t
||}	||}
||	|
fS )a  
        Make tiledCopy for smem to tmem load for scale factor tensor, then use it to partition smem memory (source) and tensor memory (destination).

        :param sSF: The scale factor tensor in smem
        :type sSF: cute.Tensor
        :param tSF: The scale factor tensor in tmem
        :type tSF: cute.Tensor

        :return: A tuple containing (tiled_copy_s2t, tCsSF_compact_s2t, tCtSF_compact_s2t) where:
            - tiled_copy_s2t: The tiled copy operation for smem to tmem load for scale factor tensor(s2t)
            - tCsSF_compact_s2t: The partitioned scale factor tensor in smem
            - tCtSF_compact_s2t: The partitioned scale factor tensor in tmem
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        """
        tCsSF_compact = cute.filter_zeros(sSF)
        tCtSF_compact = cute.filter_zeros(tSF)
        copy_atom_s2t = cute.make_copy_atom(tcgen05.Cp4x32x128bOp(self.cta_group), self.sf_dtype)
        tiled_copy_s2t = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSF_compact)
        thr_copy_s2t = tiled_copy_s2t.get_slice(0)
        tCsSF_compact_s2t_ = thr_copy_s2t.partition_S(tCsSF_compact)
        tCsSF_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(tiled_copy_s2t, tCsSF_compact_s2t_)
        tCtSF_compact_s2t = thr_copy_s2t.partition_D(tCtSF_compact)
        return tiled_copy_s2t, tCsSF_compact_s2t, tCtSF_compact_s2t

    def epilog_tmem_copy_and_partition(
        self,
        tidx: cutlass.Int32,
        tAcc: cute.Tensor,
        gC_mnl: cute.Tensor,
        epi_tile: cute.Tile,
        use_2cta_instrs: Union[cutlass.Boolean, bool],
    ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
        """
        Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param tAcc: The accumulator tensor to be copied and partitioned
        :type tAcc: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param use_2cta_instrs: Whether use_2cta_instrs is enabled
        :type use_2cta_instrs: bool

        :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where:
            - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
            - tTR_tAcc: The partitioned accumulator tensor
            - tTR_rAcc: The accumulated tensor in register used to hold t2r results
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        """
        copy_atom_t2r = sm100_utils.get_tmem_load_op(
            self.cta_tile_shape_mnk,
            self.c_layout,
            self.c_dtype,
            self.acc_dtype,
            epi_tile,
            use_2cta_instrs,
        )
        tAcc_epi = cute.flat_divide(tAcc[((None, None), 0, 0, None)], epi_tile)
        tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, None)])
        thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
        tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
        gC_mnl_epi = cute.flat_divide(gC_mnl[((None, None), 0, 0, None)], epi_tile)
        tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
        tTR_rAcc = cute.make_fragment(tTR_gC[(None, 0, 0, None, None)].shape, self.acc_dtype)
        return tiled_copy_t2r, tTR_tAcc, tTR_rAcc
    def epilog_smem_copy_and_partition(
        self,
        tiled_copy_t2r: cute.TiledCopy,
        tTR_rC: cute.Tensor,
        tidx: cutlass.Int32,
        sC: cute.Tensor,
    ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
        """
        Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).

        :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
        :type tiled_copy_t2r: cute.TiledCopy
        :param tTR_rC: The partitioned accumulator tensor
        :type tTR_rC: cute.Tensor
        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor

        :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where:
            - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s)
            - tRS_rC: The partitioned tensor C (register source)
            - tRS_sC: The partitioned tensor C (smem destination)
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        """
        copy_atom_r2s = sm100_utils.get_smem_store_op(
            self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r
        )
        tiled_copy_r2s = cute.make_tiled_copy_D(copy_atom_r2s, tiled_copy_t2r)
        thr_copy_r2s = tiled_copy_r2s.get_slice(tidx)
        tRS_sC = thr_copy_r2s.partition_D(sC)
        tRS_rC = tiled_copy_r2s.retile(tTR_rC)
        return tiled_copy_r2s, tRS_rC, tRS_sC

    def epilog_gmem_copy_and_partition(
        self,
        tidx: cutlass.Int32,
        atom: Union[cute.CopyAtom, cute.TiledCopy],
        gC_mnl: cute.Tensor,
        epi_tile: cute.Tile,
        sC: cute.Tensor,
    ) -> Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]:
        """Make tiledCopy for global memory store, then use it to:
        partition shared memory (source) and global memory (destination) for TMA store version.

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param atom: The copy_atom_c to be used for the TMA store version, or tiled_copy_t2r for the non-TMA store version
        :type atom: cute.CopyAtom or cute.TiledCopy
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor

        :return: A tuple containing (tma_atom_c, bSG_sC, bSG_gC) where:
            - tma_atom_c: The TMA copy atom
            - bSG_sC: The partitioned shared memory tensor C
            - bSG_gC: The partitioned global tensor C
        :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]
        """
        gC_epi = cute.flat_divide(gC_mnl[((None, None), 0, 0, None)], epi_tile)
        tma_atom_c = atom
        sC_for_tma_partition = cute.group_modes(sC, 0, 2)
        gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2)
        bSG_sC, bSG_gC = cpasync.tma_partition(
            tma_atom_c,
            0,
            cute.make_layout(1),
            sC_for_tma_partition,
            gC_for_tma_partition,
        )
        return tma_atom_c, bSG_sC, bSG_gC

    @staticmethod
    def _compute_stages(
        tiled_mma: cute.TiledMma,
        mma_tiler_mnk: tuple[int, int, int],
        a_dtype: type[cutlass.Numeric],
        a_major_mode: tcgen05.OperandMajorMode,
        b_dtype: type[cutlass.Numeric],
        b_major_mode: tcgen05.OperandMajorMode,
        epi_tile: cute.Tile,
        c_dtype: type[cutlass.Numeric],
        c_layout: utils.LayoutEnum,
        sf_dtype: type[cutlass.Numeric],
        sf_vec_size: int,
        smem_capacity: int,
        occupancy: int,
    ) -> tuple[int, int, int]:
        """Computes the number of stages for A/B/C operands based on heuristics.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
        :type mma_tiler_mnk: tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param a_major_mode: Major mode of operand A.
        :type a_major_mode: tcgen05.OperandMajorMode
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param b_major_mode: Major mode of operand B.
        :type b_major_mode: tcgen05.OperandMajorMode
        :param epi_tile: The epilogue tile shape.
        :type epi_tile: cute.Tile
        :param c_dtype: Data type of operand C (output).
        :type c_dtype: type[cutlass.Numeric]
        :param c_layout: Layout enum of operand C.
        :type c_layout: utils.LayoutEnum
        :param sf_dtype: Data type of Scale factor.
        :type sf_dtype: type[cutlass.Numeric]
        :param sf_vec_size: Scale factor vector size.
        :type sf_vec_size: int
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
                 (ACC stages, A/B operand stages, C stages)
        :rtype: tuple[int, int, int]
        """
        num_acc_stage = 1 if mma_tiler_mnk[1] == 256 else 2
        num_c_stage = 2
        # Size one stage of each operand in smem, then fit as many A/B stages
        # as the remaining capacity allows.
        a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(tiled_mma, mma_tiler_mnk, a_dtype, 1)
        b_smem_layout_staged_one = sm100_utils.make_smem_layout_b(tiled_mma, mma_tiler_mnk, b_dtype, 1)
        sfa_smem_layout_staged_one = blockscaled_utils.make_smem_layout_sfa(
            tiled_mma, mma_tiler_mnk, sf_vec_size, 1
        )
        sfb_smem_layout_staged_one = blockscaled_utils.make_smem_layout_sfb(
            tiled_mma, mma_tiler_mnk, sf_vec_size, 1
        )
        c_smem_layout_staged_one = sm100_utils.make_smem_layout_epi(c_dtype, c_layout, epi_tile, 1)
        ab_bytes_per_stage = (
            cute.size_in_bytes(a_dtype, a_smem_layout_stage_one)
            + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one)
            + cute.size_in_bytes(sf_dtype, sfa_smem_layout_staged_one)
            + cute.size_in_bytes(sf_dtype, sfb_smem_layout_staged_one)
        )
        mbar_helpers_bytes = 1024
        c_bytes_per_stage = cute.size_in_bytes(c_dtype, c_smem_layout_staged_one)
        c_bytes = c_bytes_per_stage * num_c_stage
        num_ab_stage = (smem_capacity - occupancy * (mbar_helpers_bytes + c_bytes)) // (
            ab_bytes_per_stage * occupancy
        )
        # Spend any leftover shared memory on extra C stages.
        num_c_stage += (
            smem_capacity
            - occupancy * ab_bytes_per_stage * num_ab_stage
            - occupancy * (mbar_helpers_bytes + c_bytes)
        ) // (c_bytes_per_stage * occupancy)
        return num_acc_stage, num_ab_stage, num_c_stage

    @staticmethod
    def _compute_grid(
        c: cute.Tensor,
        masked_m: cute.Tensor,
        dst_signals: Optional[cute.Pointer],
        cta_tile_shape_mnk: tuple[int, int, int],
        cluster_shape_mn: tuple[int, int],
        max_active_clusters: cutlass.Constexpr,
    ) -> Tuple[MaskedSchedulerParams, tuple[int, int, int]]:
        """Use persistent tile scheduler to compute the grid size for the output tensor C.

        :param c: The output tensor C
        :type c: cute.Tensor
        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: tuple[int, int, int]
        :param cluster_shape_mn: Shape of each cluster in M, N dimensions.
        :type cluster_shape_mn: tuple[int, int]
        :param max_active_clusters: Maximum number of active clusters.
        :type max_active_clusters: cutlass.Constexpr

        :return: A tuple containing:
            - tile_sched_params: Parameters for the persistent tile scheduler.
            - grid: Grid shape for kernel launch.
        :rtype: Tuple[MaskedSchedulerParams, tuple[int, int, int]]
        """
        tile_mn = cute.select(cta_tile_shape_mnk, mode=[0, 1])
        cluster_shape_mnl = (*cluster_shape_mn, 1)
        tile_sched_params = MaskedSchedulerParams(masked_m, dst_signals, c, tile_mn, cluster_shape_mnl)
        grid = MaskedScheduler.get_grid_shape(tile_sched_params, max_active_clusters)
        return tile_sched_params, grid

    @staticmethod
    def is_valid_dtypes_and_scale_factor_vec_size(
        ab_dtype: Type[cutlass.Numeric],
        sf_dtype: Type[cutlass.Numeric],
        sf_vec_size: int,
        c_dtype: Type[cutlass.Numeric],
    ) -> bool:
        """
        Check if the dtypes and sf_vec_size are valid combinations

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param sf_dtype: The data type of the scale factor
        :type sf_dtype: Type[cutlass.Numeric]
        :param sf_vec_size: The vector size of the scale factor
        :type sf_vec_size: int
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]

        :return: True if the dtypes and sf_vec_size are valid, False otherwise
        :rtype: bool
        """
        is_valid = True
        if ab_dtype not in {cutlass.Float4E2M1FN, cutlass.Float8E5M2, cutlass.Float8E4M3FN}:
            is_valid = False
        if sf_vec_size not in {16, 32}:
            is_valid = False
        if sf_dtype not in {cutlass.Float8E8M0FNU, cutlass.Float8E4M3FN}:
            is_valid = False
        # Float8E4M3FN scale factors are only defined for the NVF4 recipe (vec size 16).
        if sf_dtype == cutlass.Float8E4M3FN and sf_vec_size == 32:
            is_valid = False
        # MXF8 (8-bit A/B) requires a scale-factor vector size of 32.
        if ab_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} and sf_vec_size == 16:
            is_valid = False
        if c_dtype not in {
            cutlass.Float32,
            cutlass.Float16,
            cutlass.BFloat16,
            cutlass.Float8E4M3FN,
            cutlass.Float8E5M2,
        }:
            is_valid = False
        return is_valid

    @staticmethod
    def is_valid_layouts(
        ab_dtype: Type[cutlass.Numeric],
        c_dtype: Type[cutlass.Numeric],
        a_major: str,
        b_major: str,
        c_major: str,
    ) -> bool:
        """
        Check if the tensor layouts are valid combinations

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param a_major: The major dimension of the A tensor
        :type a_major: str
        :param b_major: The major dimension of the B tensor
        :type b_major: str
        :param c_major: The major dimension of the C tensor
        :type c_major: str

        :return: True if the layouts are valid, False otherwise
        :rtype: bool
        """
        is_valid = True
        # Float4 operands must be K-major for both A and B.
        if ab_dtype is cutlass.Float4E2M1FN and not (a_major == "k" and b_major == "k"):
            is_valid = False
        return is_valid

    @staticmethod
    def is_valid_mma_tiler_and_cluster_shape(
        mma_tiler_mn: Tuple[int, int],
        cluster_shape_mn: Tuple[int, int],
    ) -> bool:
        """
        Check if the mma tiler and cluster shape are valid

        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]

        :return: True if the mma tiler and cluster shape are valid, False otherwise
        :rtype: bool
        """
        is_valid = True
        if mma_tiler_mn[0] not in (128, 256):
            is_valid = False
        if mma_tiler_mn[1] not in (128, 256):
            is_valid = False
        # Cluster shape M must be a multiple of 2 when using 2-CTA instructions.
        if cluster_shape_mn[0] % (2 if mma_tiler_mn[0] == 256 else 1) != 0:
            is_valid = False
        is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0
        if (
            cluster_shape_mn[0] * cluster_shape_mn[1] > 16
            or cluster_shape_mn[0] <= 0
            or cluster_shape_mn[1] <= 0
            # Cluster shape M/N must be <= 4 for scale factor multicasts.
            or cluster_shape_mn[0] > 4
            or cluster_shape_mn[1] > 4
            or not is_power_of_2(cluster_shape_mn[0])
            or not is_power_of_2(cluster_shape_mn[1])
        ):
            is_valid = False
        return is_valid

    @staticmethod
    def is_valid_tensor_alignment(
        m: int,
        n: int,
        k: int,
        l: int,
        ab_dtype: Type[cutlass.Numeric],
        c_dtype: Type[cutlass.Numeric],
        a_major: str,
        b_major: str,
        c_major: str,
    ) -> bool:
        """
        Check if the tensor alignment is valid

        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the problem shape is valid, False otherwise
        :rtype: bool
        """
        is_valid = True

        def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape):
            major_mode_idx = 0 if is_mode0_major else 1
            num_major_elements = tensor_shape[major_mode_idx]
            num_contiguous_elements = 16 * 8 // dtype.width
            return num_major_elements % num_contiguous_elements == 0

        if not (
            check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l))
            and check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l))
            and check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l))
        ):
            is_valid = False
        return is_valid

    @staticmethod
    def can_implement(
        ab_dtype: Type[cutlass.Numeric],
        sf_dtype: Type[cutlass.Numeric],
        sf_vec_size: int,
        c_dtype: Type[cutlass.Numeric],
        mma_tiler_mn: Tuple[int, int],
        cluster_shape_mn: Tuple[int, int],
        m: int,
        n: int,
        k: int,
        l: int,
        a_major: str,
        b_major: str,
        c_major: str,
    ) -> bool:
        """
        Check if the gemm can be implemented

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param sf_dtype: The data type of the scale factor tensor
        :type sf_dtype: Type[cutlass.Numeric]
        :param sf_vec_size: The vector size
        :type sf_vec_size: int
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]
        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the gemm can be implemented, False otherwise
        :rtype: bool
        """
        can_implement = True
        if not Sm100BlockScaledPersistentDenseGemmKernel.is_valid_dtypes_and_scale_factor_vec_size(
            ab_dtype, sf_dtype, sf_vec_size, c_dtype
        ):
            can_implement = False
        if not Sm100BlockScaledPersistentDenseGemmKernel.is_valid_layouts(
            ab_dtype, c_dtype, a_major, b_major, c_major
        ):
            can_implement = False
        if not Sm100BlockScaledPersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape(
            mma_tiler_mn, cluster_shape_mn
        ):
            can_implement = False
        if not Sm100BlockScaledPersistentDenseGemmKernel.is_valid_tensor_alignment(
            m, n, k, l, ab_dtype, c_dtype, a_major, b_major, c_major
        ):
            can_implement = False
        return can_implement


@cute.jit
def cvt_sf_MKL_to_M32x4xrm_K4xrk_L(sf_ref_tensor: cute.Tensor, sf_mma_tensor: cute.Tensor):
    """Convert scale factor tensor from MKL layout to mma specification M(32x4xrest_m)xK(4xrest_k)xL layout"""
    sf_mma_tensor = cute.group_modes(sf_mma_tensor, 0, 3)
    sf_mma_tensor = cute.group_modes(sf_mma_tensor, 1, 3)
    for i in range(cute.size(sf_ref_tensor)):
        # Map the linear index through the mma-spec layout back to (m, k, l).
        mkl_coord = sf_mma_tensor.layout.get_hier_coord(i)
        sf_mma_tensor[i] = sf_ref_tensor[mkl_coord]


@cute.jit
def cvt_sf_MKL_to_M32x4xrm_K4xrk_L_mma_spec(sf_mma_tensor: cute.Tensor):
    """Group an already-laid-out scale factor tensor into the mma specification M(32x4xrest_m)xK(4xrest_k)xL view"""
    sf_mma_tensor = cute.group_modes(sf_mma_tensor, 0, 3)
    sf_mma_tensor = cute.group_modes(sf_mma_tensor, 1, 3)


def create_scale_factor_tensor(l, mn, k, sf_vec_size, dtype, device):
    def ceil_div(a, b):
        return (a + b - 1) // b

    sf_k = ceil_div(k, sf_vec_size)
    ref_shape = (l, mn, sf_k)
    atom_m = (32, 4)
    atom_k = 4
    mma_shape = (
        l,
        ceil_div(mn, atom_m[0] * atom_m[1]),
        ceil_div(sf_k, atom_k),
        atom_m[0],
        atom_m[1],
        atom_k,
    )
    ref_permute_order = (1, 2, 0)
    mma_permute_order = (3, 4, 1, 5, 2, 0)
    ref_f32_torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor(
        ref_shape,
        torch.float32,
        permute_order=ref_permute_order,
        init_type=cutlass_torch.TensorInitType.RANDOM,
        init_config=cutlass_torch.RandomInitConfig(min_val=1, max_val=3),
    )
    cute_f32_torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor(
        mma_shape,
        torch.float32,
        permute_order=mma_permute_order,
        init_type=cutlass_torch.TensorInitType.RANDOM,
        init_config=cutlass_torch.RandomInitConfig(min_val=1, max_val=3),
    )
    # Scatter the (mn, sf_k, l) reference values into the mma-specification view.
    cvt_sf_MKL_to_M32x4xrm_K4xrk_L(
        from_dlpack(ref_f32_torch_tensor_cpu), from_dlpack(cute_f32_torch_tensor_cpu)
    )
    cute_f32_torch_tensor = cute_f32_torch_tensor_cpu.to(device, non_blocking=True)
    # Broadcast each scale factor across its sf_vec_size span of K for the
    # float32 reference and truncate padding back to the original k extent.
    ref_f32_torch_tensor = (
        ref_f32_torch_tensor_cpu.unsqueeze(-2)
        .expand(mn, sf_k, sf_vec_size, l)
        .reshape(mn, sf_k * sf_vec_size, l)[:, :k, :]
        .to(device, non_blocking=True)
    )
    cute_tensor, cute_torch_tensor = cutlass_torch.cute_tensor_like(
        cute_f32_torch_tensor, dtype, is_dynamic_layout=True, assumed_align=16
    )
    cute_tensor = cutlass_torch.convert_cute_tensor(
        cute_f32_torch_tensor, cute_tensor, dtype, is_dynamic_layout=True
    )
    return ref_f32_torch_tensor, cute_tensor, cute_torch_tensor


class MaskedBatchedMatmulCuteDSL:
    def __init__(
        self,
        m: int,
        n: int,
        k: int,
        l: int,
        a_major: str,
        b_major: str,
        c_major: str,
        ab_dtype: Type[cutlass.Numeric],
        sf_dtype: Type[cutlass.Numeric],
        c_dtype: Type[cutlass.Numeric],
        alpha_dtype: Type[cutlass.Numeric],
        sf_vec_size: int,
        mma_tiler_mn: Tuple[int, int],
        cluster_shape_mn: Tuple[int, int],
        sm_count: int,
        sm_version: str,
    ):
        self._m = m
        self._n = n
        self._k = k
        self._l = l
        self._a_major = a_major
        self._b_major = b_major
        self._c_major = c_major
        self._ab_dtype = ab_dtype
        self._sf_dtype = sf_dtype
        self._c_dtype = c_dtype
        self._alpha_dtype = alpha_dtype
        self._sf_vec_size = sf_vec_size
        self._mma_tiler_mn = mma_tiler_mn
        self._cluster_shape_mn = cluster_shape_mn
        if not Sm100BlockScaledPersistentDenseGemmKernel.can_implement(
            ab_dtype,
            sf_dtype,
            sf_vec_size,
            c_dtype,
            mma_tiler_mn,
            cluster_shape_mn,
            m,
            n,
            k,
            l,
            a_major,
            b_major,
            c_major,
        ):
            raise ValueError(
                f"MaskedBatchedMatmulCuteDSL: Unsupported with {ab_dtype}, {sf_dtype}, "
                f"{sf_vec_size}, {c_dtype}, {mma_tiler_mn}, {cluster_shape_mn}, {m}, {n}, "
                f"{k}, {l}, {a_major}, {b_major}, {c_major}"
            )
        hardware_info = cutlass.utils.HardwareInfo()
        self._max_active_clusters = min(
            hardware_info.get_max_active_clusters(cluster_shape_mn[0] * cluster_shape_mn[1]),
            sm_count,
        )
        self._sm_version = sm_version

    @cute.jit
    def __call__(
        self,
        a_ptr: cute.Pointer,
        b_ptr: cute.Pointer,
        sfa_ptr: cute.Pointer,
        sfb_ptr: cute.Pointer,
        c_ptr: cute.Pointer,
        masked_m_ptr: cute.Pointer,
        dst_signals_ptr: Optional[cute.Pointer],
        alpha_ptr: Optional[cute.Pointer],
        current_stream: cuda.CUstream,
    ):
        a_tensor = cute.make_tensor(
            a_ptr,
            cute.make_ordered_layout(
                (self._m, self._k, self._l),
                order=(0, 1, 2) if self._a_major == "m" else (1, 0, 2),
            ),
        )
        b_tensor = cute.make_tensor(
            b_ptr,
            cute.make_ordered_layout(
                (self._n, self._k, self._l),
                order=(0, 1, 2) if self._b_major == "n" else (1, 0, 2),
            ),
        )
        c_tensor = cute.make_tensor(
            c_ptr,
            cute.make_ordered_layout(
                (self._m, self._n, self._l),
                order=(0, 1, 2) if self._c_major == "m" else (1, 0, 2),
            ),
        )

        def ceil_div(a, b):
            return (a + b - 1) // b

        sf_k = ceil_div(self._k, self._sf_vec_size)
        atom_m = (32, 4)
        atom_k = 4
        mma_shape_a = (
            atom_m[0],
            atom_m[1],
            ceil_div(self._m, atom_m[0] * atom_m[1]),
            atom_k,
            ceil_div(sf_k, atom_k),
            self._l,
        )
        mma_shape_b = (
            atom_m[0],
            atom_m[1],
            ceil_div(self._n, atom_m[0] * atom_m[1]),
            atom_k,
            ceil_div(sf_k, atom_k),
            self._l,
        )
        # Physical order (l, rm, rk, m32, m4, k4) -> logical (m32, m4, rm, k4, rk, l).
        sf_order = (2, 1, 4, 0, 3, 5)
        sfa_tensor = cute.make_tensor(sfa_ptr, cute.make_ordered_layout(mma_shape_a, order=sf_order))
        sfb_tensor = cute.make_tensor(sfb_ptr, cute.make_ordered_layout(mma_shape_b, order=sf_order))
        cvt_sf_MKL_to_M32x4xrm_K4xrk_L_mma_spec(sfa_tensor)
        cvt_sf_MKL_to_M32x4xrm_K4xrk_L_mma_spec(sfb_tensor)
        masked_m_tensor = cute.make_tensor(
            masked_m_ptr, cute.make_ordered_layout((self._l,), order=(0,))
        )
        alpha_tensor = (
            cute.make_tensor(alpha_ptr, cute.make_ordered_layout((self._l,), order=(0,)))
            if alpha_ptr is not None
            else None
        )
        Sm100BlockScaledPersistentDenseGemmKernel(
            sf_vec_size=self._sf_vec_size,
            mma_tiler_mn=self._mma_tiler_mn,
            cluster_shape_mn=self._cluster_shape_mn,
            sm_version=self._sm_version,
        )(
            a_tensor,
            b_tensor,
            sfa_tensor,
            sfb_tensor,
            c_tensor,
            masked_m_tensor,
            dst_signals_ptr,
            alpha_tensor,
            self._max_active_clusters,
            current_stream,
        )


@functools.cache
def get_cute_dsl_compiled_masked_gemm_kernel(
    m: int,
    n: int,
    k: int,
    l: int,
    a_major: str,
    b_major: str,
    c_major: str,
    ab_dtype: str,
    sf_dtype: str,
    c_dtype: str,
    alpha_dtype: str,
    sf_vec_size: int,
    mma_tiler_mn: Tuple[int, int],
    cluster_shape_mn: Tuple[int, int],
    sm_count: int,
    sm_version: str,
    enable_dst_signals: bool,
) -> Callable:
    def get_cute_pointers(
        input_tensors: Optional[List[Optional[torch.Tensor]]],
    ) -> List[Optional[cute.Pointer]]:
        """Wrap the data pointers of the runtime tensors (or aligned dummy
        addresses when compiling) into typed cute pointers."""
        if input_tensors is None:
            (
                a_data_ptr,
                b_data_ptr,
                sfa_data_ptr,
                sfb_data_ptr,
                c_data_ptr,
                masked_m_data_ptr,
                dst_signals_data_ptr,
                alpha_data_ptr,
            ) = [16 for _ in range(8)]
        else:
            (
                a_tensor_gpu,
                b_tensor_gpu,
                sfa_tensor_gpu,
                sfb_tensor_gpu,
                c_tensor_gpu,
                masked_m_tensor_gpu,
                dst_signals_tensor_gpu,
                alpha_tensor_gpu,
            ) = input_tensors
            (
                a_data_ptr,
                b_data_ptr,
                sfa_data_ptr,
                sfb_data_ptr,
                c_data_ptr,
                masked_m_data_ptr,
                dst_signals_data_ptr,
                alpha_data_ptr,
            ) = (
                a_tensor_gpu.data_ptr(),
                b_tensor_gpu.data_ptr(),
                sfa_tensor_gpu.data_ptr(),
                sfb_tensor_gpu.data_ptr(),
                c_tensor_gpu.data_ptr(),
                masked_m_tensor_gpu.data_ptr(),
                dst_signals_tensor_gpu.data_ptr() if dst_signals_tensor_gpu is not None else None,
                alpha_tensor_gpu.data_ptr() if alpha_tensor_gpu is not None else None,
            )
        a_ptr = make_ptr(get_cutlass_dtype(ab_dtype), a_data_ptr, cute.AddressSpace.gmem, assumed_align=16)
        b_ptr = make_ptr(get_cutlass_dtype(ab_dtype), b_data_ptr, cute.AddressSpace.gmem, assumed_align=16)
        sfa_ptr = make_ptr(get_cutlass_dtype(sf_dtype), sfa_data_ptr, cute.AddressSpace.gmem, assumed_align=16)
        sfb_ptr = make_ptr(get_cutlass_dtype(sf_dtype), sfb_data_ptr, cute.AddressSpace.gmem, assumed_align=16)
        c_ptr = make_ptr(get_cutlass_dtype(c_dtype), c_data_ptr, cute.AddressSpace.gmem, assumed_align=16)
        masked_m_ptr = make_ptr(
            cutlass.Int32, masked_m_data_ptr, cute.AddressSpace.gmem, assumed_align=16
        )
        dst_signals_ptr = (
            make_ptr(cutlass.Uint32, dst_signals_data_ptr, cute.AddressSpace.gmem, assumed_align=16)
            if dst_signals_data_ptr is not None
            else None
        )
        alpha_ptr = (
            make_ptr(
                get_cutlass_dtype(alpha_dtype), alpha_data_ptr, cute.AddressSpace.gmem, assumed_align=16
            )
            if alpha_data_ptr is not None
            else None
        )
        return [a_ptr, b_ptr, sfa_ptr, sfb_ptr, c_ptr, masked_m_ptr, dst_signals_ptr, alpha_ptr]

    compiled_kernel = cute.compile(
        MaskedBatchedMatmulCuteDSL(
            m=m,
            n=n,
            k=k,
            l=l,
            a_major=a_major,
            b_major=b_major,
            c_major=c_major,
            ab_dtype=get_cutlass_dtype(ab_dtype),
            sf_dtype=get_cutlass_dtype(sf_dtype),
            c_dtype=get_cutlass_dtype(c_dtype),
            alpha_dtype=get_cutlass_dtype(alpha_dtype),
            sf_vec_size=sf_vec_size,
            mma_tiler_mn=mma_tiler_mn,
            cluster_shape_mn=cluster_shape_mn,
            sm_count=sm_count,
            sm_version=sm_version,
        ),
        *get_cute_pointers(None),
        cuda.CUstream(torch.cuda.current_stream().cuda_stream),
    )

    def tensor_api(
        a: torch.Tensor,
        b: torch.Tensor,
        sfa: torch.Tensor,
        sfb: torch.Tensor,
        c: torch.Tensor,
        masked_m: torch.Tensor,
        dst_signals: Optional[torch.Tensor] = None,
        alpha: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if dst_signals is None:
            # Dummy buffer so the pointer pack below is always complete.
            dst_signals = torch.empty((l,), dtype=torch.uint32, device="cuda")
        current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
        compiled_kernel(
            *get_cute_pointers([a, b, sfa, sfb, c, masked_m, dst_signals, alpha]),
            current_stream,
        )
        return c

    return tensor_api


@flashinfer_api
def grouped_gemm_nt_masked(
    lhs: Tuple[torch.Tensor, torch.Tensor],
    rhs: Tuple[torch.Tensor, torch.Tensor],
    out: torch.Tensor,
    masked_m: torch.Tensor,
    ab_dtype: str,
    sf_dtype: str,
    c_dtype: str,
    sf_vec_size: int,
    *,
    dst_signals: Optional[torch.Tensor] = None,
    alpha: Optional[torch.Tensor] = None,
    **kwargs,
) -> None:
    """
    Executes a masked, batched matrix multiplication (GEMM) with scale factors and optional alpha scaling at output.

    Args:
        lhs (Tuple[torch.Tensor, torch.Tensor]): Tuple containing the left-hand side input tensor (A) and its scale factor tensor (SFA).
            - A should be in (m, k, l) order, but physically (l, m, k). For fp4 tensor with 8-bit storage, we expect the shape to be (m, k/2, l).
            - SFA should be in (m32, m4, rm, k4, rk, l) order, but physically (l, rm, rk, m32, m4, k4)
        rhs (Tuple[torch.Tensor, torch.Tensor]): Tuple containing the right-hand side input tensor (B) and its scale factor tensor (SFB).
            - B should be in (n, k, l) order, but physically (l, n, k). For fp4 tensor with 8-bit storage, we expect the shape to be (n, k/2, l).
            - SFB should be in (n32, n4, rn, k4, rk, l) order, but physically (l, rn, rk, n32, n4, k4)
        out (torch.Tensor): Output tensor to store the result, with shape (l, m, n).
        masked_m (torch.Tensor): 1D tensor of shape (l,) specifying the valid row count for each batch (used for masking).
        ab_dtype (str): Data type for A and B matrices. Supported: "float4_e2m1fn", "float8_e4m3fn", "float8_e5m2".
        sf_dtype (str): Data type for scale factors. Supported: "float8_e8m0fnu", "float8_e4m3fn".
        c_dtype (str): Data type for output matrix C. Supported: "float16", "bfloat16", "float32", "float8_e4m3fn", "float8_e5m2".
        sf_vec_size (int): Vector size for scale factors. Typically 16 or 32.
        sm_count (int, optional): Number of SMs to use. Default: max available SMs under the CTA configuration.
        mma_tiler_mn (Tuple[int, int], optional): Shape of the MMA tiler (M, N). Default: (128, 128).
        cluster_shape_mn (Tuple[int, int], optional): Shape of the CTA cluster (ClusterM, ClusterN). Default: (1, 1).
        alpha_dtype (str, optional): Data type for alpha scaling factors.
        alpha (torch.Tensor, optional): Optional 1D tensor of shape (l,) containing per-batch scaling factors. Perform per-batch scaling out = alpha * out.

    Notes:
        - Legends of the input tensors:
            * `l` is the batch size, `m/n` is the number of rows, and `k` is the number of columns.
            * `m/n32`, `m/n4`, `k4` are constant values 32, 4, 4 respectively.
            * `m32 * m4 * rm` should be same as `M`, which is `m` padded up to the nearest multiple of 128.
            * `n32 * n4 * rn` should be same as `N`, which is `n` padded up to the nearest multiple of 128.
            * `k4 * rk` should be same as `K`, which is `k / sf_vec_size` padded up to the nearest multiple of 4.
        - The function applies masking per batch using masked_m.
        - If alpha is provided, each batch output is multiplied by its corresponding alpha value. out = alpha * (A @ B).
        - The result is written to c_tensor.
    """
    a_torch, sfa_torch = lhs
    b_torch, sfb_torch = rhs
    c_torch = out
    m, k, l = a_torch.shape
    n, _, _ = b_torch.shape
    # fp4 elements are packed two per byte, so the stored K extent is halved.
    if ab_dtype == "float4_e2m1fn":
        k = k * 2
    mma_tiler_mn = kwargs.pop("mma_tiler_mn", (128, 128))
    cluster_shape_mn = kwargs.pop("cluster_shape_mn", (1, 1))
    sm_count = kwargs.pop("sm_count", None)
    if sm_count is None:
        sm_count = get_num_sm(a_torch.device)
    alpha_dtype = kwargs.pop("alpha_dtype", "float32")
    assert len(kwargs) == 0, f"Unsupported kwargs: {kwargs}"
    major, minor = get_compute_capability(a_torch.device)
    if major == 11 and minor == 0:
        raise ValueError("SM110 is not supported for cute-dsl backend.")
    get_cute_dsl_compiled_masked_gemm_kernel(
        m=m,
        n=n,
        k=k,
        l=l,
        a_major="k",
        b_major="k",
        c_major="n",
        ab_dtype=ab_dtype,
        sf_dtype=sf_dtype,
        c_dtype=c_dtype,
        alpha_dtype=alpha_dtype,
        sf_vec_size=sf_vec_size,
        mma_tiler_mn=mma_tiler_mn,
        cluster_shape_mn=cluster_shape_mn,
        sm_count=sm_count,
        sm_version=f"sm_{major}{minor}",
        enable_dst_signals=dst_signals is not None,
    )(
        a_torch,
        b_torch,
        sfa_torch,
        sfb_torch,
        c_torch,
        masked_m,
        dst_signals=dst_signals,
        alpha=alpha,
    )
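
# Worked example of the scale-factor geometry described in the docstring above
# (added): for m = 300, k = 512, sf_vec_size = 32:
#     M = round_up(300, 128) = 384        ->  rm = M / (32 * 4) = 3
#     K = round_up(512 / 32, 4) = 16      ->  rk = K / 4 = 4
# so SFA has logical shape (m32, m4, rm, k4, rk, l) = (32, 4, 3, 4, 4, l).
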
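
# Usage sketch (added; shapes follow the docstring above, and the scale-factor
# tensors are assumed to be pre-laid-out in the (m32, m4, rm, k4, rk, l) mma
# specification, e.g. the torch tensor returned by create_scale_factor_tensor):
#
#     l, m, n, k, sf_vec_size = 4, 256, 256, 512, 32
#     a = torch.randn(l, m, k, device="cuda").to(torch.float8_e4m3fn).permute(1, 2, 0)
#     b = torch.randn(l, n, k, device="cuda").to(torch.float8_e4m3fn).permute(1, 2, 0)
#     out = torch.empty(l, m, n, dtype=torch.bfloat16, device="cuda")
#     masked_m = torch.full((l,), m, dtype=torch.int32, device="cuda")
#     _, _, sfa = create_scale_factor_tensor(l, m, k, sf_vec_size,
#                                            get_cutlass_dtype("float8_e8m0fnu"), "cuda")
#     _, _, sfb = create_scale_factor_tensor(l, n, k, sf_vec_size,
#                                            get_cutlass_dtype("float8_e8m0fnu"), "cuda")
#     grouped_gemm_nt_masked((a, sfa), (b, sfb), out, masked_m,
#                            ab_dtype="float8_e4m3fn", sf_dtype="float8_e8m0fnu",
#                            c_dtype="bfloat16", sf_vec_size=sf_vec_size)
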