o
    پi                 !   @   s
  d dl Z d dlmZmZmZmZmZmZ d dlm	Z	 d dl
m  mZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlm  mZ d dlm  mZ d dl m!Z!m"Z"m#Z#m$Z$ d dlm%Z%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d d	l.m/Z/ d d
l0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9m:Z: d dl;m<Z< d dl=mZ> 	 G dd de9Z?				 			d[dee@e@e@e@f deejA deejA deeejA  deejA deBdeBdeBdeBdee@e@f dee@e@f d eCd!e@d"e@d#eDd$eDf d%d&ZEeFd'krd(eBd)ee@d*f fd+d,ZGe jHd-d.ZIeIjJd/eGd0d1d2 eIjJd3eGd4d5d2 eIjJd6eGd7d8d2 eIjJd9ejKejLd: eIjJd;ejKejLd: eIjJd<ejKdd: eIjJd=ejKe&d: eIjJd>d?d@geBd?dA eIjJdBd?dCgeBd?dA eIjJdDdCd@geBdCdA eIjJdEdCd@geBdCdA eIjJdFeCdGdHd2 eIjJdIe@dJdKd2 eIjJdLe@dMdNd2 eIjJdOdPdQdR eIjJdSdPdTdR eIM ZNeOeNjPdUkreIQdV eOeNjRdWkreIQdX eOeNjSdWkreIQdY eEeNjPeNjTeNjUeNjVeNjWeNjXeNjYeNjZeNj[eNjReNjSeNj\eNj]eNj^eNj_eNj` eadZ dS dS )\    N)OptionalTypeTupleUnionCallableLiteral)partial)cpasynctcgen05)LdMatrix8x8x16bOpLdMatrix16x16x8bOpStMatrix8x8x16bOpStMatrix16x8x8bOp)Int32Float32Boolean
const_expr)
LayoutEnum)from_dlpackmake_ptr)PipelineTmaCpAsyncUmma)
ParamsBaseArgumentsBase)TileSchedulerOptions)VarlenArgumentsVarlenManager)GemmSm90NamedBarrierGemmc                :   @   s  e Zd ZdZdZejZejZejZ		dde	e
j de	e
j deeef deeeef d	ee d
efddZdedefddZej		ddejdejdeej deej dededee dejdeej deej fddZejdejdeej deej dejdejd ejd!eej d"eej d#eej d$eej d%eej d&eej d'eej d(eej d)ed*ej d+ej!d,eej! d-ej"d.ej"d/ej"d0eej! d1eej! d2e#ej!ej"df d3e#ej!ej"df d4ej$d5ed6e
j%e& f8d7d8Zejd9e
j'j(d:e
j'j)d;ee
j'j) d<e&d=ee& d>e*d?ee
j'j)ee
j'j) f fd@dAZ+ej								ddBe
j'j(dCe
j'j(dDe
j'j)dEe
j'j)dejdFejdGejdHejd>e*dIe,dJe*dKeej dLeej dMeej- dNeej- dOeej dPeej dQeej dReej d?ee
j'j)e
j'j)ejf f(dSdTZ.ej	ddUej-dVej-dWejdXejdYejdZed[e,fd\d]Z/d^ejd_ejd?eej-ejejf fd`daZ0dbe*dcejd4ej$dde#e,ef d?eej-ejejf f
dedfZ1dUej-dgee2 dhee	e
j  diejdjejdbe*d?eej-ejejf fdkdlZ3dUej-dme2dhe	e
j dnejdoe
j!dbe*d?eej-ejejf fdpdqZ4ejdejd+ej!drej5dIe,d?e'j(f
dsdtZ6d+ej!duej5d?e'j(fdvdwZ7	ddxej!dyej5dzed?e'j(fd{d|Z8ejd}ej5d?e'j(fd~dZ9e:dejdeeeef deeeef d4ej$de	e
j de	e
j dee	e
j  d	ee dee	e
j  dee	e
j  dgee2 dmee2 dede;d deded?eeeef f"ddZ<e=dejdeeeef ded?efddZ>e=de	e
j de	e
j de	e
j dee	e
j  de?de?d?efddZ@e=de	e
j de	e
j d	ede	e
j d?ef
ddZAe=deeef deeef ded?efddZBe=dedededede	e
j de	e
j de?de?de?d?efddZCe=de	e
j de	e
j de	e
j deeef deeef dedededede?de?de?d?efddZDdS )	GemmSm100a  This class implements batched matrix multiplication (C = A x B) with support for various data types
    and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

    :param acc_dtype: Data type for accumulation during computation
    :type acc_dtype: type[cutlass.Numeric]
    :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
    :type mma_tiler_mn: Tuple[int, int]
    :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
    :type cluster_shape_mn: Tuple[int, int]

    :note: In current version, A and B tensor must have the same data type
        - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported

    :note: Supported A/B data types:
        - TFloat32
        - Float16/BFloat16
        - Int8/Uint8
        - Float8E4M3FN/Float8E5M2

    :note: Supported accumulator data types:
        - Float32 (for all floating point A/B data types)
        - Float16 (only for fp16 and fp8 A/B data types)
        - Int32 (only for uint8/int8 A/B data types)

    :note: Supported C data types:
        - Float32 (for float32 and int32 accumulator data types)
        - Int32 (for float32 and int32 accumulator data types)
        - Float16/BFloat16 (for fp16 and fp8 accumulator data types)
        - Int8/Uint8 (for uint8/int8 accumulator data types)
        - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types)

    :note: Constraints:
        - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True)
        - MMA tiler N must be 32-256, step 32
        - Cluster shape M must be multiple of 2 if use_2cta_instrs=True
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 16

    Example:
        >>> gemm = GemmSm100(
        ...     acc_dtype=Float32,
        ...     mma_tiler_mn=(128, 128),
        ...     cluster_shape_mn=(2, 2)
        ... )
        >>> gemm(mA, mB, mD, max_active_clusters, stream)
    d   NF	acc_dtypea_dtypemma_tiler_mncluster_shape_mnksf_vec_sizegather_Ac                 C   s  || _ |d dko|d dv | _|| _|d dksJ dg |dR | _|| _|du| _d| _d| _|| _|rB|d dksBJ d	| jrIt	j
jnt	j
j| _| jsSdnd
| _d| _d| _d| _d
| _| j| j | _| jd | _t| j| _tjj| jt| j| j| jg| jR   | _dS )a  Initializes the configuration for a Blackwell dense GEMM kernel.

        This configuration includes several key aspects:

        1.  MMA Instruction Settings (tcgen05):
            - acc_dtype: Data types for MMA accumulator.
            - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler.
            - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant
              with cta_group=2 should be used.

        2.  Cluster Shape:
            - cluster_shape_mnk: The (ClusterM, ClusterN) shape of the CTA cluster.

        :param acc_dtype: Data type of the accumulator.
        :type acc_dtype: type[cutlass.Numeric]
        :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mnk: Tuple (ClusterM, ClusterN) shape of the cluster.
        :type cluster_shape_mnk: Tuple[int, int]
        r      )      zCluster shape K must be 1NTFz'Cluster shape N must be 1 for gather A    )r   r(   r&         )r    use_2cta_instrsr#   	mma_tilerr$   blockscaledis_persistentpingpongr%   r
   CtaGroupTWOONE	cta_groupnum_ab_load_warps	occupancyepilog_warp_idmma_warp_idab_load_warp_idepi_load_warp_idscheduler_warp_idlennum_epi_warpscutearch	WARP_SIZEthreads_per_cta)selfr    r!   r"   r#   r$   r%    rC   D/home/ubuntu/.local/lib/python3.10/site-packages/quack/gemm_sm100.py__init__   sB   

zGemmSm100.__init__epilogue_argsvarlen_argsc                 C   s  d}| j d | j d || jj f| _| jd | jrdnd t| jd d| jd f| _t| j	 rKt
| j| j| j| j| j| j dd | _d| _n/t
| j| j| j| j| j| j| jdd | _t
| j| j| j| j| jtjj| jdd | _d}| jd | jd | jd | f| _ t| j	r| jd | jd | jd | f| _nd| _| j d t| jjj | j d | j d f| _tt| j| jjjf| _ t| j	rtt| j| jjjf| _!nd| _!t| j jd | _"| j#r| j"dksJ t| j jd | _$| j"dk| _%| j$dk| _&t| j	r%t| j!jd | _'| j'dk| _(t
j)| j| j| j*dur4| j*nt+j,| j-dur?| j-nt.j/| j0| j1d| _2| j#sOdn	|j3durWd	nd
}| 4| j| j | j| j2| j| j5| j| j| j-| j1| j*| j0||t.j67d| j8 | j9\| _:| _;| _<| _=d| _>| j#sdn
|j3durdn| j;| _?t
@| j| j | j| j;| _A| jA| _Bt| j#rtCD| j| j | j| j;| _Bt
E| j| j | j5| j;| _Fd| _Gt| j-durt
H| j-| j*| j2| j<| _Gd| _It| j1dur t
H| j1| j0| j2| j=| _It| j	r!tJK| j| j | j| j;| _LtJM| j| j | j| j;| _Nnd\| _L| _Nt| j	 r;| O| j| j | j:| _PdS d}|| _PdS )a2  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout
        - Computing tensor memory allocation columns
        r'   r   r(   r&      Nr+   )layout_c	elem_ty_cvarlen_mvarlen_ksm_NN   )Qr-   r!   widthmma_inst_shape_mnkr,   r>   round_upmma_inst_shape_mnk_sfbr   r.   sm100_utilsmake_trivial_tiled_mmaa_major_modeb_major_moder    r4   	tiled_mmatiled_mma_sfb"make_blockscaled_trivial_tiled_mmasf_dtyper$   r
   r1   r3   mma_tiler_sfbsizethr_idshapecta_tile_shape_mnktiled_dividemake_layoutr#   cluster_layout_vmnkcluster_layout_sfb_vmnknum_mcast_ctas_ar%   num_mcast_ctas_b
is_a_mcast
is_b_mcastnum_mcast_ctas_sfbis_sfb_mcastcompute_epilogue_tile_shaped_layoutr   	ROW_MAJORd_dtypecutlassBFloat16c_layoutc_dtypeepi_tilemCuSeqlensM_compute_stagesb_dtypeutilsget_smem_capacity_in_bytesr?   r6   num_acc_stageab_stage	epi_stageepi_c_stagesched_stagea_prefetch_stagemake_smem_layout_aa_smem_layout_stageda_smem_load_layout_stagedquack_sm100_utilsmake_smem_layout_cpasync_amake_smem_layout_bb_smem_layout_stagedepi_smem_layout_stagedmake_smem_layout_epiepi_c_smem_layout_stagedblockscaled_utilsmake_smem_layout_sfasfa_smem_layout_stagedmake_smem_layout_sfbsfb_smem_layout_staged_compute_num_tmem_alloc_colsnum_tmem_alloc_cols)rB   rF   rG   mma_inst_bits_kmma_inst_tile_kprefetch_A_idxSM100_TMEM_CAPACITY_COLUMNSrC   rC   rD   _setup_attributes   s0  
	








zGemmSm100._setup_attributesmAmBmDmCscheduler_argsstreammSFAmSFBc           *         s  t jr|	dur|
dusJ |j_|j_|dur|jnd_|dur(|jnd_|	dur2|	jnd_t	|_
t	|_|durJt	|nd_|durVt	|nd_t	| _t	| _t jjkr}tdj dj t |du rt }|jdujksJ dd fdd||fD \}}|| t jrt|jj}t|	j|}	t|jj}t|
j|}
tjjj}t j!d}t j"d}d	\}}t j rt#$j%jj}tj&j'|||j(jj)j|jt*u r
t+j,ndd
\}}t#-j%jj}tj&j.|||j(jj)j|jt*u r/t+j,ndd
\}}d	\}}d	\}}t jrt#$j%jj}t j/d}tj&j'||	|j(jj)jt+j0d
\}}t#1j%jj}t j2d}tj&j.||
|j3j4j5jt+j0d
\}}t6j|_7t j r j7t6j|7  _7t jrt6j|}t6j|} j7|| 7  _7 j7|9  _7d	\} }!t |durj8|j9j:t;|dr|j<sdndd\} }!d	\}"}#t |durj8|j=j:dd\}"}#>|t?@|}$jA|jBdud}%C|||||}&|%@|&}'|%D|'|jE}(d_F|dur<tGj9nd|durItGj=ndt jrTjnt+jHt jrctGj/ndt jrqtGj2ndd t jrjI|jBdurjJd njJd   tjKG  fddd})|)_LMjj4|t j r|n|||||||| |!|"|#|$j)j5j!jNj"j/j2j9j=j:|'|%jO|(jPddgj%|dd dS )a  Execute the GEMM operation in steps:
        - Setup static attributes before smem/grid/tma computation
        - Setup TMA load/store atoms and tensors
        - Compute grid size with regard to hardware constraints
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param mA: Input tensor A
        :type mA: cute.Tensor
        :param mB: Input tensor B
        :type mB: cute.Tensor
        :param mD: Output tensor D
        :type mD: cute.Tensor
        :param max_active_clusters: Maximum number of active clusters
        :type max_active_clusters: cutlass.Constexpr
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :raises TypeError: If input data types are incompatible with the MMA instruction.
        :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled.
        NzType must match: z != c                    s   t  fdd jD S )Nc                 3   s4    | ]}t |st j|d  jj dn|V  qdS )rH   )divbyN)r>   	is_staticassumeelement_typerP   ).0strC   rD   	<genexpr>  s
    "
z7GemmSm100.__call__.<locals>.<lambda>.<locals>.<genexpr>)tuplestrider   rC   r   rD   <lambda>  s    z$GemmSm100.__call__.<locals>.<lambda>c              	      s6   g | ]}|d urt |jt j|j |dnd qS )N)r   )r>   make_tensoriteratorrb   r_   )r   r   )
new_striderC   rD   
<listcomp>  s    z&GemmSm100.__call__.<locals>.<listcomp>NNNr   rN   )internal_typeadd_to_outputstoreadd)op_typeload)rK      r   r&   c                       s  e Zd ZU ejjejjd f e	d< ejjejj
d f e	d< ejjejjd f e	d< ejjejjd f e	d< ejjejjd f e	d< ejjejf e	d< eje	d< ee	d	< ejjejje f d
f e	d< ejjejjjdur}jnef jf e	d< ejjejjjdurjnef jf e	d< e	d< ejjejjjejjf jf e	d< ejjejjjejjf jf e	d< ejjejjf jf e	d< ejjejjf jf e	d< dS )z)GemmSm100.__call__.<locals>.SharedStorager&   ab_pipeline_array_ptrepi_pipeline_array_ptracc_pipeline_array_ptrsched_pipeline_array_ptra_prefetch_pipeline_array_ptr
tile_counttmem_dealloc_mbar_ptrtmem_holding_buf   sAIdxNsDsCepisAsBsSFAsSFB)__name__
__module____qualname__r>   structMemRangero   Int64rz   __annotations__r|   ry   r}   r~   r   Alignrn   buffer_align_bytesrr   epi_get_smem_structr!   cosizer   outerrv   r   rC   )a_idx_smem_sizeepi_c_smem_sizeepi_smem_sizeepilogue_paramsrB   r[   sfa_smem_sizesfb_smem_sizerC   rD   SharedStorageq  s^   
 
 r   r(   )gridblockclusterr   min_blocks_per_mp)Qr   r.   r   r!   rv   rn   rr   r[   r   from_tensora_layoutb_layoutrl   rq   mma_major_moderV   rW   	TypeErrorr   mAIdxr%   r   r   tile_atom_to_shape_SFr_   r$   r>   r   r   r]   rX   r^   slice_r   r   rT   cluster_shape_to_tma_atom_Ar#   nvgpumake_tiled_tma_atom_Ar-   rc   r   ro   TFloat32cluster_shape_to_tma_atom_Bmake_tiled_tma_atom_Br   Int16cluster_shape_to_tma_atom_SFBr   r\   rY   rd   size_in_bytesnum_tma_load_bytes_make_tma_epi_atoms_and_tensorsr   rs   hasattrr   r   epi_to_underlying_argumentsr   to_underlying_argumentsget_scheduler_classrt   get_scheduler_argumentsget_grid_shapemax_active_clustersr   r   Float8E8M0FNUr~   r`   r   shared_storagekernelr   launchrA   )*rB   r   r   r   r   rF   r   rG   r   r   r   
sfa_layout
sfb_layoutatom_thr_sizea_smem_layoutb_smem_layout
tma_atom_atma_tensor_aa_opb_op
tma_atom_btma_tensor_btma_atom_sfatma_tensor_sfatma_atom_sfbtma_tensor_sfbsfa_opsfa_smem_layoutsfb_opsfb_smem_layoutsfa_copy_sizesfb_copy_size
tma_atom_dtma_tensor_d
tma_atom_ctma_tensor_cvarlen_paramsTileSchedulerClstile_sched_argstile_sched_paramsr   r   rC   )	r   r   r   r   r   rB   r[   r   r   rD   __call__  s`  
"




	














"/
$zGemmSm100.__call__rX   rY   r   mA_mklr  mB_nklr  mSFA_mklr  mSFB_nklr  mD_mnlr  mC_mnlr   r  rc   rd   r   a_smem_load_layoutr   r  r  epi_smem_layoutepi_c_smem_layoutrs   r  r  c                 C   s  t |jdu}t |jdu}|r|rJ t | jr|s|sJ t |du}t |du} tjtj }!|!| jkrO||||	||fD ]}"t |"durNt	
|" qAt|jjdk}#tj \}$}%}%|$t|jj }&|&dk}'tjtj }(tj \})}%}%tj }*|*| j}+|+j},|+j}-|#r|!| jkrd}.tj|,|. | j|||+j |'d}/d}0t | r| jt|d|+j d}0| j||+j  d}1d}2d}3t |j!dur| j"| j#|+j$ | d	}2|+j%&| j'f}3d}4t | jr| (|+j) }4|+j*j&|j+|j,d
}5|+j*j&|j+|j,d
}6|+j-j&|j+|j,d
}7d}8t | jr8|r%| j.d n| j.d }9t/|9| j0f}:|+j1&|:}8d\};}<t | j2rN|+j3&|};|+j4&|}<d}=t |r_|+j5j&|j+|j,d
}=d}>t | rp|+j6j&|j+|j,d
}>| 7||+}?|8|&}@t | j2r|8|&nd}A|9| j:dd }B|;t<|B| j=}Ct>j?||| j@tA|s|jBdu r|jd n|jBjd tA|jd d}DtC|j?||3|2}tDjEtFtGjHtjjItJ| jKg| jLR  d}Ed}Ft | rtDjEtFtGjMdtjjI d}F|!| jkrd}G|DN|||G |DO }H|DP }I|Q|(}Jd}Kt | j2r|Q|(}Kd\}L}Md\}N}Ot | jRp.| jSp.|#rWt	jT||Jdd}Lt	jT||Jdd}Mt | j2rWt	jT||Jdd}Nt	jT||Kdd}O| }P|PU }QtDVtDjWjX| jY}Rt |rp|DZ  t[d}S|Qj\r|Qj]}T|Td }U|D^|U| j_| j`|G |Td t|jj |Td |Td f}Vd}Wt | j r|Da||U}Xtb|Xtc| j:ddg|Vd df}Wtb|Dd||Utc| j:ddg|Vd df}Yt | j2rtb|Da||Utc| j:ddg|Vd df}Ztb|Dd|
|Utc| j:ddg|Vd df}[|De|G |Df|U}\t/t|dj}]d}^t | j r7|@g|W}_thji||Jd |]|_|6|L|Hd\}^}%}%|@j|Y}`t | j2rL|@g|Z}a|Aj|[}bthji||Jd t/t|dj|`|7|M|Id\}c}%}%d\}d}et | j2rthji||Jd |]|a|;d|Nd\}d}%}%t/t|dj}fthji|	|Kd |f|b|<d|Od\}e}%}%tk|\| j.d }g| l|/|R|^|c|g|d|e}Rt |Fdur|Sr|Fm  t[d}S|Pn  |Po }Q|Qj\sx|/p|R t | jr|!| jd kr|!| j| jq k r| }P|PU }QtDVtDjWjX| jY}RtDVtDjWjr| j0}h|Qj\r|Qj]}T|Td }U|Ds|U}it |r|}Xn|sJ tb|| j.d f|Td df}X|Dt|U}j|Df|U}\| u|jv| j_| jqd d }ktj d | jd d  })|k8|)}ld\}^}mt |r|4w|h thjx|l|X|6|8d|hjyf |j|Td | j.d   |\d}^tjz  tj{  |4||h W d   n	1 sw   Y  |h}  nthj~|l|X|6|8|j|Td | j.d   |\d\}^}mtC|m|4}mtk|\| j.d }g| |/|R|h|^|m|g\}R}h|Pn  |Po }Q|Qj\st |j!dup| jr|!| jkrd}nt t|dkrtj dk}n| j.d }o| j.d }pd\}q}r}st | jr=thjtAddd}t|t8tj }q|q|8}r|qt|r9|on|p}s||nd}P|PU }Qd}ut | jrWtDVtDjWjX| j0}u|Qj\rt | jrl|Qj]}T|Td }U|Ds|U}it |rtb|i|of|Td f}v|q|v}w|Dt|U}j|j|Td |o  }xtd|rjd ft[}ytj|rjd ddD ]}z|sd|zf |xk |yd|zf< q|4|u tj|q|w|rdd|ujyf |yd |4|u |u}  nt|i|pf}v|q|v}w|Df|U}\tk|\|p}gtj|gd ddD ]#}{|4|u t|q|wdd|{f |rdd|ujyf  |4|u |u}  qd|gk rl|gd }{|\|{|p  }|td|rjd ft[}}tj|rjd ddD ]}z|sd|zf ||k |}d|zf< q:|4|u tj|t|wdd|{f |rdd|ujyf |}d |4|u |u}  |Pj|nd |Pjn|nd |Po }Q|Qj\s[|nr|Pp  t |dur|!| jkrtDVtDjWjX| j}~t[d}| }P|PU }Q|Qj\r|Qj]}T|Td }U| ||D||U| j.dd ||>|T\}}%}th||0}|r|F  t[d}t tj|dgd}tj|ddD ]}|0|~ |||~d |0|~ |~}  q|Pn  |Po }Q|Qj\s|0p|~ |!| jKkr|E  tjj| jd |-d!}||5}||7}t||Cj}t | j2rtj|t| | jd"}t|| j:| jt|d#}t||}tj|t| t| | jd"}t|| j:| jt|d#}t||}| |;|\}}}| |<|\}}}nd\}}d\}}}d\}}}| }P|PU }QtDVtDjWjr| jY}tDVtDjWjX| j=}|Qj\r|Qj]}T|Td }U|Df|U}tk|| j:d }g|ddd|jyf }| |/|1|||||||g|'|(||||||||\}}}|Pn  |Po }Q|Qj\s|1p| |!| jKk 	r|!| jLd kr%tjj| j|-|#d$ |E  t[|!| jLd k}G|D|| ||G |D }|D }tjj| jd |-d!}t||Cj}tDjEtFtGj| jtjjI d}|)}| ||||#\}}}t|j| j}| || j| j||=|\}}}d\}}}d}t |dur| || j| j|>|j|\}}}}| }P|PU }QtDVtDjWjr| j=}|  }tDVtDjWjr| j}t |r|DZ  |Qj\	r|Qj]}T|Td }U| ||j|U\}}|D|U| j|||G |ddddd|jyf }|1w| |D|G d}t |	r | j||D||U| j.dd ||=|T|d%\}}%}%d}t|dt|}|Df|U}tC| j|||||	o>|dkd&}| ||?||0||d|||||||||||||T|D||P||G\}}%tj{  |1|| W d   n	1 	svw   Y  |}  |Pn  |Po }Q|Qj\s|!| jLd k	rtjj|#d$ |  |!| jLd k	rt |#	rtj|,|(dA  tj|,d tjj|| j|#d$ |G	r|p  dS dS dS )'zW
        GPU device kernel performing the Persistent batched GEMM computation.
        Nr&   r       )rX   rc   ab_pipeline_mbar_ptris_leader_cta)NNr   )c_smem_layoutepi_pipeline_mbar_ptr)rc   acc_pipeline_mbar_ptr)sched_pipeline_mbar_ptrhas_C)swizzlerN   r(   )len_m_staticlen_k_static)
barrier_idnum_threadsT)
mcast_moder*   )r   r   Nr   )	cta_coord
cta_layout
src_tensor
dst_tensor
mcast_masktma_desc_ptr)r   Nr   r   )r0  r1  r2  r3  filter_zerosr4  F)limit_mlimit_kNNN)r.  is_async)is_scheduler_warpunroll_fullpredunrollmode)src_idxproducer_stater   )	alignmentptr_to_buffer_holding_addrdtyper   )
is_two_cta)r5  )	clear_acc)r   cu_seqlens_mcu_seqlens_kr%   r>   r?   make_warp_uniformwarp_idxr9   r	   prefetch_descriptorr]   r^   r_   	block_idxblock_idx_in_cluster
thread_idxro   rw   SmemAllocatorallocater   r   r   mbarrier_initmake_ab_pipeliner   data_ptrmake_epi_pipeliner   r   make_acc_pipeliner   tile_count_semaphoremake_sched_pipeliner#   r   r   
get_tensorr}   make_a_prefetch_pipeliner   r   r   innerr   r`   rb   r~   r   r.   r   r   r   r   epi_get_smem_tensors	get_slicepartition_shape_Cr-   make_fragment_Cappendry   r   createnum_epi_tensormapsr   r   r   pipelineNamedBarrierintr   TmemPtrr@   r<   r8   r7   EpilogueLoadinit_tensormap_ABget_tma_desc_a_ptrget_tma_desc_b_ptrget_flat_coordrg   rh   create_tma_multicast_maskinitial_work_tile_infomake_pipeline_statePipelineUserTypeProducerrz   fence_tensormap_initr   is_valid_tiletile_idxupdate_tensormap_ABr   r   offset_batch_A
local_tileselectoffset_batch_Bfence_tensormap_update_ABlen_kpartition_A
copy_utilstma_get_copy_fnpartition_Bceil_divload_ABarriveadvance_to_next_workget_current_workproducer_tailr5   Consumeroffset_batch_AIdxlen_m_make_gmem_tiled_copy_Ar   consumer_waitgather_m_get_copy_fnindex	sync_warp	elect_oneconsumer_releaseadvancegather_k_get_copy_fnload_A_gather_Ar;   tiled_copy_1dlane_idxpartition_Dpartition_Smake_identity_tensormake_fragmentrangeproducer_acquirecopyproducer_commitflat_dividefetch_next_workr:   r|   epilog_gmem_copy_and_partitionoffset_batch_epitma_producer_copy_fnarrive_and_waitretrieve_tmem_ptrr    make_fragment_Amake_fragment_Br   layout
recast_ptrr
   find_tmem_tensor_col_offsetr[   r   make_tmem_layout_sfar$   make_tmem_layout_sfbmainloop_s2t_copy_and_partitionmma
alloc_tmemr   init_tensormap_epiepi_get_tma_atomsget_tma_desc_d_ptrget_tma_desc_epi_ptrsEpiloguer=   epilog_tmem_copy_and_partitionepilog_smem_store_and_partitionrl   rn   epilog_smem_load_and_partitionrq   rr   make_epi_store_pipeline&epi_get_tensormap_update_shapes_ordersupdate_tensormap_epifence_tensormap_update_epigroup_modesrankepi_load_acc_subtileepiloguerelinquish_tmem_alloc_permitmbarrier_arrivembarrier_waitdealloc_tmem)rB   rX   rY   r   r  r  r  r  r  r  r  r  r  r  r  r   r  rc   rd   r   r  r   r  r  r   r!  rs   r  r  rK   rL   has_Dr)  rO  tma_atomr,   bidx_mma_tile_coord_vr$  cta_rank_in_clustertidxsmemstorager   r   num_tmem_dealloc_threadsab_pipelineepi_pipelineacc_pipelinesched_pipeliner   a_prefetch_pipelinesA_mmar   r   r   a_idx_smem_dima_idx_smem_layoutr   r   r   r   epi_smem_tensorsthr_mmathr_mma_sfb	acc_shapetCtAcc_fakevarlen_managertmem_alloc_barrierepi_load_barrieris_tma_warptma_desc_a_ptrtma_desc_b_ptrblock_in_cluster_coord_vmnkblock_in_cluster_coord_sfb_vmnka_mcast_maskb_mcast_masksfa_mcast_masksfb_mcast_masktile_scheduler	work_tileab_producer_statedo_epi_load_barrier_arrivetile_coord_mnkl	batch_idxmma_tile_coord_mnlgA_mkmA_mkgB_nkgSFA_mklgSFB_nklr~  a_cta_layoutcopy_AtCgAtCgBtCgSFAtCgSFBcopy_Bcopy_SFAcopy_SFBsfb_cta_layout
k_tile_cnta_prefetch_consumer_statemAIdx_mkr  tiled_copy_A
thr_copy_A
prefetch_Ar;  tile_Mtile_Kthr_copy_AIdxtAsAIdxtAcAIdxtiled_copy_AIdxa_prefetch_producer_stategAIdxtAgAIdxm_limit	tApAIdx_mmk_tilek_limit	tApAIdx_kepi_producer_statedo_epi_load_barrier_wait	copy_C_fnbGS_gCcopy_Cepi_tile_numepi_idxacc_tmem_ptrtCrAtCrBtCtAcc_basesfa_tmem_ptrtCtSFA_layouttCtSFAsfb_tmem_ptrtCtSFB_layouttCtSFBtiled_copy_s2t_sfatCsSFA_compact_s2ttCtSFA_compact_s2ttiled_copy_s2t_sfbtCsSFB_compact_s2ttCtSFB_compact_s2tab_consumer_stateacc_producer_statek_lentCtAcctma_desc_d_ptrtma_desc_epi_ptrsepilogue_barrierepi_tidxtiled_copy_t2rtTR_tAcc_basetTR_rAcctTR_rDtiled_copy_r2stRS_rDtRS_sDtRS_rCtSR_rCtSR_sCtiled_copy_s2racc_consumer_stateepi_store_pipelineepi_read_state
epi_shapes
epi_orderstTR_tAcccopy_Dload_acc_subtilerC   rC   rD   r     s  $
































	  














	B



















:


















#










	
	V
  zGemmSm100.kernel
a_pipelinea_producer_stater  r  r  r  returnc                 C   s2  t d}d|k r||}d}tj|d ddD ]@}	|j}
d}t|d ur0||	|
|f}|  |||| ||	|
g|R   || |  t d}|	d |k rX||}qd|k r|d }	|j}
d}t|d ury||	|
|ddf}|  |||| ||	|
g|R ddi || |  ||fS )	NTr   Fr(   r@  rC   r>  r?  )	r   producer_try_acquirero   r  r  r   r  r  producer_cpasync_commit)rB   rE  rF  r  r  r  r  peek_a_empty_statusr  r  smem_idxprefetch_outrC   rC   rD   r    s<   



zGemmSm100.load_A_gather_Ar  r  r*  r+  r  r  accr$  r  r   r#  r$  r'  r%  r(  r&  r)  c              	   C   s  t |d u}t |r>tdd ||fD sJ tdd ||fD s$J tdd ||fD s1J tdd ||fD s>J t | joD| j}td}d|	k rW|
sR|rW||}|
r^|| |tj	j
d tj|d	gd
}tj|	ddD ]}t |r|
s||| tj  |jj|j|d@ d W d    n1 sw   Y  |
r||| t |rd d d d |jf}|| }|| }t||| t||| tj|ddD ]=}d d ||jf}t |rd d |f}|tj	j|| j |tj	j|| j t|||| || | |tj	j
d q|| |  td}|d |	k r2|
s-|r2||}qu|
r;|| |  |||fS )Nc                 s       | ]}|d uV  qd S NrC   r   xrC   rC   rD   r   Y      z GemmSm100.mma.<locals>.<genexpr>c                 s   rN  rO  rC   rP  rC   rC   rD   r   Z  rR  c                 s   rN  rO  rC   rP  rC   rC   rD   r   [  rR  c                 s   rN  rO  rC   rP  rC   rC   rD   r   \  rR  Tr   Fr&   rB  r(   r@     )dst_rankr<  )r   allr%   r,   r   consumer_try_waitr  setr
   Field
ACCUMULATEr>   r]   ro   r  r  r?   r  sync_object_fullarrive_mbarrierr  r  SFAr   SFBgemmr  r  r  )rB   r  r  r*  r+  rX   r  r  rM  r  r$  r  r   r#  r$  r'  r%  r(  r&  r)  r.   need_nonleader_ctapeek_ab_full_statusnum_k_blocksr  s2t_stage_coordtCsSFA_compact_s2t_stagedtCsSFB_compact_s2t_staged	k_blk_idxk_blk_coordsf_kblock_coordrC   rC   rD   r  @  sb   







zGemmSm100.mmar2  r6  rB  r4  r7  r  rK  c           	      C   sH   |st ||d d d |f | ||}||  d S |d d S )Ng        )r>   r  retiler   r   fill)	rB   r2  r6  rB  r4  r7  r  rK  tRS_rAccrC   rC   rD   r    s
   
zGemmSm100.epi_load_acc_subtilesSFtSFc                 C   sj   t |}t |}t t| j| j}t||}|d}|	|}t
||}	||}
||	|
fS )a  
        Make tiledCopy for smem to tmem load for scale factor tensor, then use it to partition smem memory (source) and tensor memory (destination).

        :param sSF: The scale factor tensor in smem
        :type sSF: cute.Tensor
        :param tSF: The scale factor tensor in tmem
        :type tSF: cute.Tensor

        :return: A tuple containing (tiled_copy_s2t, tCsSF_compact_s2t, tCtSF_compact_s2t) where:
            - tiled_copy_s2t: The tiled copy operation for smem to tmem load for scale factor tensor(s2t)
            - tCsSF_compact_s2t: The partitioned scale factor tensor in smem
            - tSF_compact_s2t: The partitioned scale factor tensor in tmem
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        r   )r>   r6  make_copy_atomr
   Cp4x32x128bOpr4   r[   make_s2t_copyra  r  get_s2t_smem_desc_tensorr  )rB   rk  rl  tCsSF_compacttCtSF_compactcopy_atom_s2ttiled_copy_s2tthr_copy_s2ttCsSF_compact_s2t_tCsSF_compact_s2ttCtSF_compact_s2trC   rC   rD   r    s   





z)GemmSm100.mainloop_s2t_copy_and_partitionr  tAccr,   c                 C   s   t | j| jdur| jntj| jdur| jntj| j	||}t
|d |}t||d }||}||}	t
| jd | jd f}
t
|
|}||}t
|d j| j	}||	|fS )aw  
        Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).

        :param tidx: The thread index in epilogue warp groups
        :type tidx: Int32
        :param tAcc: The accumulator tensor to be copied and partitioned
        :type tAcc: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param use_2cta_instrs: Whether use_2cta_instrs is enabled
        :type use_2cta_instrs: bool

        :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where:
            - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
            - tTR_tAcc: The partitioned accumulator tensor
            - tTR_rAcc: The accumulated tensor in register used to hold t2r results
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        N)rN   r   r   N)NNr   r   r   r   r(   )NNNr   r   )rT   get_tmem_load_opr`   rl   r   rm   rn   ro   rp   r    r>   r  r
   make_tmem_copyra  r  r  r  r  r_   )rB   r  ry  rs   r,   copy_atom_t2rtAcc_epir2  thr_copy_t2rrB  cAcccAcc_epitTR_cAccr4  rC   rC   rD   r    s"   	



z(GemmSm100.epilog_tmem_copy_and_partitionrl   rI  r5  r   c                 C   sn   t |dur|ntj|dur|ntj| j|}t||}|	|}	|dur+|	
|nd}
||}|||
fS )a  
        Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).

        :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
        :type tiled_copy_t2r: cute.TiledCopy
        :param tTR_rD: The partitioned accumulator tensor
        :type tTR_rD: cute.Tensor
        :param tidx: The thread index in epilogue warp groups
        :type tidx: Int32
        :param sD: The shared memory tensor to be copied and partitioned
        :type sD: cute.Tensor
        :type sepi: cute.Tensor

        :return: A tuple containing (tiled_copy_r2s, tRS_rD, tRS_sD) where:
            - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s)
            - tRS_rD: The partitioned tensor C (register source)
            - tRS_sD: The partitioned tensor C (smem destination)
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        N)rT   get_smem_store_opr   rm   ro   rp   r    r>   make_tiled_copy_Dra  r  rh  )rB   r2  rl   rI  r5  r   r  copy_atom_r2sr6  thr_copy_r2sr8  r7  rC   rC   rD   r    s   


z)GemmSm100.epilog_smem_store_and_partitionrq   r   tRS_rD_layoutc                 C   s   t ||| j|}|j}t|trt|j|jd}	nt|t	r-|jdv r-t
|jd d}	ntj }	t|	|}
t|
|}||}||}t||}||}||||fS )N)num_matrices	transpose)r&   r+   r&   )r  )rT   r  r    op
isinstancer   r   r  r  r   r   r>   r   CopyUniversalOprm  r  ra  r  r  rh  )rB   r2  rq   rI  r   r  r  r  store_opr  copy_atom_s2rr<  thr_copy_s2rr;  r9  r:  rC   rC   rD   r  +  s    






z(GemmSm100.epilog_smem_load_and_partitionr#  c              	   C   s   t | j r	d}n| jd d t | j rdnd }ttjj|}| j| j	 d }|}ttjj|}	t | j rJtj
j|| j||	| j|d}
|
S tj|| j||	| j|| jsYd n|s]dndd}
|
S )Nr(   r"  r&   )barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnkr   )r  r  r  r  r  r  producer_drop_count)r   r%   r5   r,   rg  CooperativeGroupAgentThreadre   rf   PipelineTmaUmmare  rz   r   r   )rB   rX   rc   r#  r$  producer_cntab_pipeline_producer_group
mcast_sizeconsumer_arrive_cntab_pipeline_consumer_grouppipeline_abrC   rC   rD   rW  L  sB   
zGemmSm100.make_ab_pipeliner'  c                 C   sJ   t t jj}| j| jrdnd }t t jj|}t jj|| j|||dS )Nr&   r(   )r  r  r  r  r  )	rg  r  r  r  r=   r,   PipelineUmmaAsyncre  ry   )rB   rc   r'  acc_pipeline_producer_groupnum_acc_consumer_threadsacc_pipeline_consumer_grouprC   rC   rD   rZ    s   zGemmSm100.make_acc_pipelinecluster_layout_mnkr(  r)  c           	      C   s   t t jj}t|}| jt| jg| j	| j
R  }|r"|d7 }|| d }t t jj|}t jj|| j||t|dkrBd dS ddS )Nr(   r   )r  r  r  r  consumer_mask)rg  r  r  r  r>   r]   r5   r<   r8   r7   r;   PipelineAsyncre  r}   r   )	rB   r  r(  r)  sched_pipeline_producer_groupcluster_sizewarps_per_ctar  sched_pipeline_consumer_grouprC   rC   rD   r\    s(   
zGemmSm100.make_sched_pipelinea_prefetch_pipeline_mbar_ptrc                 C   sH   d}t jt jj||d}| jd }t t jj|}t jj|| j||dS )Nr"  )rF  r(   )r  r  r  r  )rg  r  r  r  r5   PipelineCpAsyncre  r~   )rB   r  r  a_prefetch_producer_groupr  a_prefetch_consumer_grouprC   rC   rD   r^    s   

z"GemmSm100.make_a_prefetch_pipelinemma_tiler_mnkr`   rv   r[   rn   rr   r   )NrK   rL   smem_capacityr6   c           #      C   s  |du}t | rd}n
|d dkrdnd}t|d dkr!dnd}|
du r)dnt|d dkr4dnd}t|||d}t|||d}|	durRt|	||dnd}|
dur`t|
||dnd}t |rvt|||d}t	|||d}t
||t
|| }t |dkr|tjd	 |d  7 }t |r|t
||t
|| 7 }d
}t |dkr|tjd	 |d  d 7 }|	durt
|	|nd}|| ||| }|| }t |
durt
|
|} || | 7 }|| | | }!|!| }"||!||"  | 7 }||"||fS )a~  Computes the number of stages for A/B/C operands based on heuristics.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
        :type mma_tiler_mnk: tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param epi_tile: The epilogue tile shape.
        :type epi_tile: cute.Tile
        :param d_dtype: Data type of operand C (output).
        :type d_dtype: type[cutlass.Numeric]
        :param d_layout: Layout enum of operand D.
        :type d_layout: LayoutEnum
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
                 (ACC stages, A/B operand stages, C stages)
        :rtype: tuple[int, int, int]
        Nr&   r(   r'   r   r+   r   rL      r   rK   )r   r>   r]   rT   r   r   r   r   r   r   r   r   rP   epi_smem_bytes_per_stage)#clsrX   r  r`   rs   r!   rv   r[   r$   rn   rr   rl   rq   rF   r   r  r6   r.   ry   r{   r|   a_smem_layout_staged_oneb_smem_layout_staged_oned_smem_layout_staged_onec_smem_layout_staged_onesfa_smem_layout_staged_onesfb_smem_layout_staged_oneab_bytes_per_stagembar_helpers_bytesd_bytes_per_stageepi_bytes_per_stage	epi_bytesc_bytes_per_stageremaining_bytesrz   rC   rC   rD   ru     s   -
&

zGemmSm100._compute_stagesr-   ry   c                 C   s4   |  |dd }| t||}tj|}|S )a  
        Compute the number of tensor memory allocation columns.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler: The shape (M, N, K) of the MMA tile.
        :type mma_tiler: tuple[int, int, int]
        :param num_acc_stage: The stage of the accumulator tensor.
        :type num_acc_stage: int

        :return: The number of tensor memory allocation columns.
        :rtype: int
        Nr&   )rb  rc  r>   rd  ro   rw   get_num_tmem_alloc_cols)rX   r-   ry   r  r  r   rC   rC   rD   r   ?  s   z&GemmSm100._compute_num_tmem_alloc_colsa_majorb_majorc              	   C   s  d}|| krd}| }|t jt jt jt jt jt jt jhvrd}|tt jt	hvsA|t jkr5|t jt jt jhvsA|t	krC|t jt jhvrCd}|dur~|tkr]|tt jt jt jt jt	t jt jhvs||t jkrj|t jt jhvs||t	kr~|t jt jtt	t jt jhvr~d}|t j
u r|dkr|dksd}|S )a  
        Check if the dtypes are valid

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param d_dtype: The data type of the output tensor
        :type d_dtype: Type[cutlass.Numeric]

        :return: True if the dtypes are valid, False otherwise
        :rtype: bool
        TFNk)ro   Float16rp   r   Uint8Int8Float8E4M3FN
Float8E5M2r   r   Float4E2M1FN)r!   rv   r    rn   r  r  is_validab_dtyperC   rC   rD   is_valid_dtypesW  sj   	


zGemmSm100.is_valid_dtypesr  c                 C   s   d}| t jt jt jhvrd}|dvrd}|t jt jhvrd}|t jkr)|dkr)d}| t jt jhv r7|dkr7d}|tt jt jt jt jhvrFd}|S )aO  
        Check if the dtypes and sf_vec_size are valid combinations

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param sf_dtype: The data type of the scale factor
        :type sf_dtype: Type[cutlass.Numeric]
        :param sf_vec_size: The vector size of the scale factor
        :type sf_vec_size: int
        :param d_dtype: The data type of the output tensor
        :type d_dtype: Type[cutlass.Numeric]

        :return: True if the dtypes and sf_vec_size are valid, False otherwise
        :rtype: bool
        TF>   r   r"  r"  r   )ro   r  r  r  r   r   r  rp   )r  r[   r$   rn   r  rC   rC   rD   )is_valid_dtypes_and_scale_factor_vec_size  s(   z3GemmSm100.is_valid_dtypes_and_scale_factor_vec_sizecluster_shape_mnr.   c                 C   s   d}| d dvr
d}|s| d t dddvrd}n| d dvr!d}d	d
 }|d |d  dksG|d dksG|d dksG||d rG||d sId}|rY|d dksW|d dkrYd}|S )a  
        Check if the mma tiler and cluster shape are valid

        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]

        :return: True if the mma tiler and cluster shape are valid, False otherwise
        :rtype: bool
        Tr   )@   rH   r'   Fr(   r"  i  )rH   r'   c                 S   s   | dko| | d @ dkS )Nr   r(   rC   )rQ  rC   rC   rD   r     s    z@GemmSm100.is_valid_mma_tiler_and_cluster_shape.<locals>.<lambda>r   r+   )r  )r"   r  r.   r  is_power_of_2rC   rC   rD   $is_valid_mma_tiler_and_cluster_shape  s,   

z.GemmSm100.is_valid_mma_tiler_and_cluster_shaper  nr  ld_majorc	                 C   sV   d}	dd }
|
||dk| ||fr'|
||dk|||fr'|
||dk| ||fs)d}	|	S )a  
        Check if the tensor alignment is valid

        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param d_dtype: The data type of the output tensor
        :type d_dtype: Type[cutlass.Numeric]
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param d_major: The major axis of the C tensor
        :type d_major: str

        :return: True if the problem shape is valid, False otherwise
        :rtype: bool
        Tc                 S   s*   |rdnd}|| }d| j  }|| dkS )Nr   r(   rH   )rP   )rI  is_mode0_majortensor_shapemajor_mode_idxnum_major_elementsnum_contiguous_elementsrC   rC   rD   check_contigous_16B_alignment/	  s   
zJGemmSm100.is_valid_tensor_alignment.<locals>.check_contigous_16B_alignmentr  r  FrC   )r  r  r  r  r  rn   r  r  r  r  r  rC   rC   rD   is_valid_tensor_alignment	  s   %z#GemmSm100.is_valid_tensor_alignmentc                 C   sR   d}t | | |||	|
sd}t j||ddsd}t ||||| ||	|
|	s'd}|S )a  
        Check if the gemm can be implemented

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param d_dtype: The data type of the output tensor
        :type d_dtype: Type[cutlass.Numeric]
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]
        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param d_major: The major axis of the C tensor
        :type d_major: str

        :return: True if the gemm can be implemented, False otherwise
        :rtype: bool
        TF)r.   )r   r  r  r  )r  r    rn   r"   r  r  r  r  r  r  r  r  can_implementrC   rC   rD   r  =	  s   .zGemmSm100.can_implement)NFrN   )NNNNNNNN)F)Er   r   r   __doc__r?   r   rf  EpilogueArgumentsEpilogueParamsr   ro   Numericr   ri  r   boolrE   r   r   r>   jitTensorr   r   cudaCUstreamr  r   TiledMmaCopyAtomr   r   ParamsLayoutComposedLayoutr   Tile	Constexprr   rg  r  PipelineStater   r  r   	TiledCopyr  r  r  r  r   r  r  PointerrW  rZ  r\  r^  classmethodr   ru   staticmethodr   strr  r  r  r  r  rC   rC   rC   rD   r   h   s<   .

C G	
  &	
      L
-	
V	
#

4
*	
!2

	
 N5

+	
4

	
r   r'   r'   r&   r(   皙?r(   Fmnklr  rn   rr   r    r  r  r  c_majorr"   r  	tolerancewarmup_iterations
iterationsskip_ref_checkdynamic_persistentc           6         sD  t d t d|   t d| d| d|  t d| d| d|  t d	|	 d
|
  t d|  t d|  t d|  t d|  |rQJ d| \}}}t||||	|
||||||std| d| d| d|	 d|
 d| d| d d| d| d| d| tj stdtd d<fdd	}||||dk|dd\} }||||dk|dd\}}|||||dk|dd\}
}}|dur|||||dk|\}	}}nd\}	}g |
dR }t|||	|}t	j
 } | |
d |
d  }!|rtjdtjdd}"nd}"tt|!|"dur.tt|" tjjd d!ndd"| t tj }#t|#jt|
		|s
	 |t	jt	jt	jt	jhv rxt d#|! |! }$nt d#||}$|dur|$| }$|$! }$|! }%|t"kr|$}&nY|t	jt	jhv r|dkrd$nd%}'|dkr|||fn|||f}(t#j$|(tj%|'t#j&j'd& })t(|)d'd!j)|dkrdndd(}*||*_*t#j+|$|*|dd}*|)! }&n|$,t#-|}&tj.j/|%|&|d)d* dd+l0m1}+ ttj jd,| |  | },|}-|}.ddl2}/|/3d- |j4d.kr@|dks.J tj5d/tj6dd fd0d1}0n|du rM fd2d1}0n|, j- fd3d1}0|+|0|.|-d4}1|,|1d5  }2t d6|1d7d8|2d9 |/3d- 	
f	d:d1}3|+|3|.|-d4}4|,|4d5  }5t d;|4d7d8|5d9 dS )=a"  Execute a persistent batched dense GEMM operation on Blackwell architecture with performance benchmarking.

    This function prepares input tensors, configures and launches the persistent GEMM kernel,
    optionally performs reference validation, and benchmarks the execution performance.

    :param mnkl: Problem size (M, N, K, L)
    :type mnkl: Tuple[int, int, int, int]
    :param ab_dtype: Data type for input tensors A and B
    :type ab_dtype: Type[cutlass.Numeric]
    :param d_dtype: Data type for output tensor C
    :type d_dtype: Type[cutlass.Numeric]
    :param acc_dtype: Data type for accumulation during matrix multiplication
    :type acc_dtype: Type[cutlass.Numeric]
    :param a_major/b_major/d_major: Memory layout of tensor A/B/C
    :type a_major/b_major/d_major: str
    :param mma_tiler_mn: MMA tiling size. If not specified in the decorator parameters, the autotuner will use the
        default value of (256, 256). Otherwise, the autotuner will use the value specified in the decorator parameters.
    :type mma_tiler_mn: Tuple[int, int], optional
    :param cluster_shape_mn: Cluster shape. If not specified in the decorator parameters, the autotuner will use the
        default value of (2, 1). Otherwise, the autotuner will use the value specified in the decorator parameters.
    :type cluster_shape_mn: Tuple[int, int], optional
    :param tolerance: Tolerance value for reference validation comparison, defaults to 1e-01
    :type tolerance: float, optional
    :param warmup_iterations: Number of warmup iterations before benchmarking, defaults to 0
    :type warmup_iterations: int, optional
    :param iterations: Number of benchmark iterations to run, defaults to 1
    :type iterations: int, optional
    :param skip_ref_check: Whether to skip reference result validation, defaults to False
    :type skip_ref_check: bool, optional
    :raises RuntimeError: If CUDA GPU is not available
    :raises ValueError: If the configuration is invalid or unsupported by the kernel
    :return: Execution time of the GEMM kernel
    :rtype: float
    z2Running Blackwell Persistent Dense GEMM test with:zmnkl: z
AB dtype: z, C dtype: z, Acc dtype: zMatrix majors - A: z, B: z, C: zMma Tiler (M, N): z, Cluster Shape (M, N): zTolerance: zWarmup iterations: zIterations: zSkip reference checking: z-Dynamic persistent mode is not supported yet.zUnsupported testcase z, z$GPU is required to run this example!iW  Tc              	      s  |r| ||fn| ||f}|rdnd}|t jhv }t|}	|t jt jhvr'|	ntj}
tj||
|t jj	j
t jj d ddd|	}| }|jtjd}|t jt jhvrW|n|tj}t|dd	}||_|rr|j|rnd
ndd}tj||||d}||||fS )Nr&   r(   r   r(   r&   r   g      r(   )stdscale)permute_order	init_typeinit_configrH  r   assumed_alignr   leading_dimis_dynamic_layout)ro   r  cutlass_torchrI  r  r  torchbfloat16create_and_permute_torch_tensorTensorInitTypeGAUSSIANGaussianInitConfigtor  float32viewuint8r   r   mark_layout_dynamicconvert_cute_tensor)r  mode0mode1r  rI  r
  r_   r  is_unsignedtorch_dtype	gen_dtypetorch_tensor_cputorch_tensorf32_torch_tensortorch_tensor_viewcute_tensor)r  rC   rD   create_and_permute_tensor	  sD   


z&run.<locals>.create_and_permute_tensorr  r	  r  Nr9  r(   r   r  )rI  devicer+   r  )r[  zmkl,nkl->mnlr  r  )r  r  r   r  gh㈵>)atolrtol)do_benchr&   g      ?r  )r(   c                      s<   t j d d d d df d d d d df jt jdS )Nr   )scale_ascale_b	out_dtype)r  
_scaled_mmmTr  rC   )a_torchb_torchscale_abrC   rD   r   
  s    zrun.<locals>.<lambda>c                      s"   t  ddddddjS Nr&   r   r(   )r  matmulpermuter+  rC   )r,  r-  rC   rD   r   
  s   " c                	      s.   t ddd ddddddjS r/  )r  baddbmmr1  r+  rC   )r,  r-  c_torch_convertrC   rD   r   
  s
    )warmuprepg    eAzCuBLAS Average time: z.3fz ms, TFLOPS: z.1fc                	      s    S rO  rC   rC   )	compiled_gemmcurrent_streamepi_argsr   r   r   r   r   rG   rC   rD   r   
  s    zCute-DSL Average time: )T)7printr   r  r   r  r  is_availableRuntimeErrormanual_seedro   rw   HardwareInfoget_max_active_clusterszerosint32r   r   r   rX  r>   AddressSpacegmemr  r   r7  r  cuda_streamcompiler  r  r  r  einsumcpur   r  r  r  r  SKIPr   r  r   r  r  rI  testingassert_closetriton.testingr&  timesleeprP   onesr  )6r  r  rn   rr   r    r  r  r  r  r"   r  r  r  r  r  r  kwargsr  r  r  r"  a_refa_torch_cpub_refb_torch_cpur  d_torchd_torch_cpucc_torchr#   r^  hardware_infor   r[  torch_streamrefgpu_dref_dr  r_   f8_torch_tensorref_d_tensorr&  flopsrepeatsr4  rK  	fn_cublastiming_cublastflops_cublasfntimingtflopsrC   )r,  r-  r3  r6  r7  r8  r  r   r   r   r   r.  r   rG   rD   run|	  s  5J

3








	
rf  __main__r   rG  .c                 C   s4   zt dd | dD W S  ty   tdw )Nc                 s   s    | ]	}t | V  qd S rO  )ri  striprP  rC   rC   rD   r   
  s    z-parse_comma_separated_ints.<locals>.<genexpr>,z2Invalid format. Expected comma-separated integers.)r   split
ValueErrorargparseArgumentTypeError)r   rC   rC   rD   parse_comma_separated_ints
  s
   
rn  z.Example of Dense Persistent GEMM on Blackwell.)descriptionz--mnkl)r'   r'   rO   r(   z!mnkl dimensions (comma-separated))typedefaulthelpz--mma_tiler_mn)rH   rH   z Mma tile shape (comma-separated)z--cluster_shape_mn)r(   r(   zCluster shape (comma-separated)z
--ab_dtype)rp  rq  z	--d_dtypez	--c_dtypez--acc_dtypez	--a_majorr  r  )choicesrp  rq  z	--b_majorr  z	--d_majorz	--c_majorz--tolerancegQ?zTolerance for validationz--warmup_iterationsr)   zWarmup iterationsz--iterations   z&Number of iterations to run the kernelz--skip_ref_check
store_truezSkip reference checking)actionrr  z--dynamic_persistentzDynamic persistent kernelr+   z$--mnkl must contain exactly 4 valuesr&   z,--mma_tiler_mn must contain exactly 2 valuesz0--cluster_shape_mn must contain exactly 2 valuesPASS)r  r  r  r   r(   FF)brl  typingr   r   r   r   r   r   	functoolsr   cuda.bindings.driverbindingsdriverr  r  ro   cutlass.cuter>   cutlass.cute.nvgpur	   r
   cutlass.torchr  cutlass.pipelinerg  cutlass.utils.blackwell_helpersrw   blackwell_helpersrT    cutlass.utils.blockscaled_layoutblockscaled_layoutr   cutlass.cute.nvgpu.warpr   r   r   r   r   r   r   r   cutlass.utilsr   cutlass.cute.runtimer   r   quack.pipeliner   quack.cute_dsl_utilsr   r   quack.tile_schedulerr   quack.varlen_utilsr   r   quack.gemm_sm90r   r   quack.copy_utilsr  quack.sm100_utilsr   r   ri  r  r  floatr  rf  r   rn  ArgumentParserparseradd_argumentrI  rp   
parse_argsargsr<   r  errorr"   r  r  rn   rr   r    r  r  r  r  r  r  r  r  r  r9  rC   rC   rC   rD   <module>   s2   B                  0	



  
0


