o
    پiog                    @   sR  d dl Z d dlmZmZmZmZmZmZ d dlm	Z	 d dl
Z
d dlm  mZ d dlZd dlmZ d dlmZ d dlmZmZmZ d dlm  mZ d dlmZmZmZmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d d	l(m)Z)m*Z*m+Z+m,Z,m-Z- d d
l.m/Z/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6mZ7 	 G dd de j8Z9G dd dZ:dS )    N)TupleTypeCallableOptionalUnionLiteral)partial)cpasyncwarp	warpgroup)Int32Float32Float16Boolean
const_expr)if_generate)
LayoutEnum)
ParamsBaseArgumentsBase)TileSchedulerOptionsTileSchedulerArgumentsTileSchedulerVarlenMTileSchedulerArgumentsVarlenMTileScheduler)VarlenArgumentsVarlenManager)make_pipeline_statePipelineTmaCpAsyncc                   @   sD   e Zd Ze Ze Ze Ze Ze Z	e Z
e ZdS )NamedBarrierGemmN)__name__
__module____qualname__enumautoEpilogueEpilogueLoadMmaWG0MmaWG1EpiWG0EpiWG1TmemPtr r+   r+   C/home/ubuntu/.local/lib/python3.10/site-packages/quack/gemm_sm90.pyr   K   s    r   c                5   @   s	  e Zd ZU dZdZdZeed< eZ	e
Z				ddeej deej d	eeef d
eeeef dedededefddZde	fddZejdejdejdeej deej dededee dejfddZejdejdeej dejd ejd!ejd"eej d#eej d$eej d%eej d&e
d'ej d(ej!d)ej"d*ej"d+ej"d,ej"d-e
d.ej#e$ f$d/d0Zej	1	1dd2ej%j&d3ej%j'd4ee$ d5e$d6e(d7ee$ d8ee$ d9ej%j'fd:d;Z)ej	dd2ej%j&d3ej%j'd4e$d<ee$ d5e$d6e(d=ed9ej%j'fd>d?Z*ejd2ej%j&d@ej%j'dejdAejdBejdCejdDeej d6e(dEe(d9eej%j'ejf fdFdGZ+ejdHedIeejdJf dKe,eej-  dLej%j&dMej%j&dNej%j'dOeej%j' dPej.dQe$dRejdSeej dTeej/ dUej/dVejdWeej0 dXeej dYeej dZee$ d[ee$ d\ej1d]ed^ej%j2d_e(d`e3d9eej%j'ej%j'f f2dadbZ4dd=efdcddZ5dejdejdeej fdedfZ6ejdgejdRejdhefdidjZ7ejdHedIeejdJf dPej.dTeej/ dUej/d\ej1d]ed^ej%j2d_e(d9eejdJf fdkdlZ8dHedmeejdJf dnej1d9eejdJf fdodpZ9	1ddHedqeejdJf dRejdSeej d9eej f
drdsZ:dHedCejdejd\ej1d_e(d9d1fdtduZ;ejdHedmeejdJf dPej.dTeej/ dUej/d\ej1d9d1fdvdwZ<d1d1dxdye	d9efdzd{Z=d1d1dxdHed9e,ej fd|d}Z>d1d1dxdHed~ejde(d9e?e,e( e,e f fddZ@eAdyee	 deeeef dPej.d9efddZBdHefddZCdHed9eejdJf fddZDdEe(deEd fddZFdEe(deEd fddZGdejd9ej/fddZHdejdeeI deej deej d_e(d9eej/ejejf fddZJdejdeIdeej dejdej!d_e(d9eej/ejejf fddZK	1ddeLejej/f dejd	ej.dPej.dejd\ej1deej- d9eejejf fddZMdejdej!dej-fddZNdej!ej"B dej-fddZOdd ZPd(ej!dej-defddZQeR	ddeeeef dPeeef deej deej deeej  deeej  de	dededed9eeef fddZSeA	1	1ddeeeef deeeef deeej  deeef d1B d9eeef f
ddZTeAdeeeef dPeeef deej deIdeej deIdedeeej  deIdedeeej  deeI ded9eej"ej"ej"eej" f fddZUeAdejdej"dPeeef deEd d9eejejf f
ddǄZVeAdejdej"deeef ded9eejejf f
dd̈́ZWdddЄZXeAdeej deej deej deeej  deYdeYd9efddԄZZd1S )GemmSm90aX  
    This class implements batched matrix multiplication (C = A x B) with support for various data types
    and architectural features specific to Hopper GPUs with persistent tile scheduling and warp specialization.

    :param acc_dtype: Data type for accumulation during computation
    :type acc_dtype: type[cutlass.Numeric]
    :param tile_shape_mn: Shape of the CTA tile (M,N)
    :type tile_shape_mn: Tuple[int, int, int]
    :param cluster_shape_mnk: Cluster dimensions (M,N,K) for parallel processing
    :type cluster_shape_mnk: Tuple[int, int, int]

    :note: Data type requirements:
        - For 16-bit types: A and B must have the same data type
        - For 8-bit types: A and B can have different types (Float8E4M3FN/Float8E5M2) as long as both are 8-bit
        - Float8 types only support k-major layout

    :note: Supported data types:
        - Float16
        - BFloat16
        - Float8E4M3FN/Float8E5M2

    :note: Supported accumulation types:
        - Float32 (for all floating point inputs)

    :note: Constraints:
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 4

    Example:
        >>> gemm = GemmSm90(
        ...     acc_dtype=Float32,
        ...     tile_shape_mn=(128, 256),
        ...     cluster_shape_mnk=(1, 1, 1)
        ... )
        >>> gemm(a_tensor, b_tensor, c_tensor, stream)
    Z   r   num_epi_tensormapsFT	acc_dtypea_dtypetile_shape_mncluster_shape_mnkpingpongis_persistentfp8_fast_accumgather_Ac	                 C   s  || _ || _|| _| jr| jsJ d| o|jdk| _|| _|r+|d dks+J d|| _g |dR | _| jd | jd }	}
| js|	dvrLtd|	dv rm|	d	krVd
nd}|
d dkrb|
|ksltd|	 d| n@|
d dkrw|
d
ks|
d dkr|
dkstdn'|	dvrtd|	dkrd
n|	dkrdnd}|
d dkr|
|kstd| | js|	dkrd\}}n$|	d	kr|
dkrd\}}nd\}}n| jd d
k r| jd d nd}d}|dv r|dv sJ nd\}}||df| _	| jd | _
| jr| j
dksJ | jd | _| j
dk| _| jdk| _d| _t| j	| js$dnd | _| jr4| jdks4J | jdv s<J d| _| jd | j | _tjd| _| jsV| jndd | _| jsadnd| _| jd | _t| jd d t| j	| j  }| jr|d9 }| js| jd!krd"\| _| _n#|dk}|sd#nd$\| _| _n| jd!krd%\| _| _nd&\| _| _d | _d | _d | _d | _ d | _!d | _"d | _#d'| _$d S )(a5  
        Initializes the configuration for a Hopper dense GEMM kernel.

        This configuration includes data types for operands, tile shape, cluster configuration,
        and thread layout.

        :param acc_dtype: Data type for accumulation during computation
        :type acc_dtype: type[cutlass.Numeric]
        :param tile_shape_mn: Shape of the CTA tile (M,N)
        :type tile_shape_mn: Tuple[int, int]
        :param cluster_shape_mnk: Cluster dimensions (M,N,K) for parallel processing
        :type cluster_shape_mnk: Tuple[int, int, int]
        z+Pingpong gemm requires persistent scheduler      z'Cluster shape N must be 1 for gather A r   )@            @  z+CTA tile shape M must be 64/128/192/256/320)r<   r>   r<   r=          zIf tile_m == z2, CTA tile shape N must be divisible by 32 and <=    i   zRCTA tile shape N must be divisible by 16 and <= 256, or divisible by 32 and <= 512)r:   r;   r<   z/CTA tile shape M must be 64/128/192 if pingpongr:   r;      z0CTA tile shape N must be divisible by 16 and <= r>   )r9      )   r9   rC   )r9   rC   rD   )r9   r9   sm_90   NrD   )r@   r?   )(      )      )8      )rK         )%r0   r4   r5   widthfp8_slow_accumr7   r3   cta_tile_shape_mnk
ValueErroratom_layout_mnknum_mcast_ctas_anum_mcast_ctas_b
is_a_mcast
is_b_mcast	occupancymathprodmma_warp_groupsnum_threads_per_warp_groupthreads_per_ctacutlassutilsget_smem_capacity_in_bytessmem_capacitynum_epi_warpsnum_ab_load_warpsab_load_warp_idnum_regs_loadnum_regs_mmaab_stage	epi_stagea_smem_layout_stagedb_smem_layout_stagedepi_smem_layout_stagedepi_tileshared_storagebuffer_align_bytes)selfr0   r1   r2   r3   r4   r5   r6   r7   tile_Mtile_N
tile_N_maxatom_layout_matom_layout_nregs_per_threadheavy_register_pressurer+   r+   r,   __init__   s   (



zGemmSm90.__init__epilogue_argsc                 C   s  t j| j| j| j | j | j| jd| j	d | jd  fd| _
t| jd dkrR| jd }tjd| j	d | d |fdd}tjt| j
j| jd|dfd| _
tj| j
jd	gd
}d}| j	d | j	d || f| _	t| j| _| | j	| j| j| _| j| j	| j| j| j| j| j|tjd| j | j| j  d
\| _!| _"| _#| j$rd	nd| _%| &| j	| j| j| j| j| j| j!| j| j'| j"| j| j(| j#\| _)| _*| _+| _,dS )a  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout
        r:   r9   )tiler_mnr8   )r   rC   r9   orderN)permutation_mnkrC   moderF   r   sm_)overlap_sD_sA)-
sm90_utilsmake_trivial_tiled_mmar1   b_dtypea_layoutsm90_mma_major_modeb_layoutr0   rS   rQ   	tiled_mmar   cutemake_ordered_layoutmake_tiled_mmamake_mma_atomopsize	shape_mnkmake_layoutr3   cluster_layout_mnk$_sm90_compute_tile_shape_or_overrided_dtyperl   _compute_stagesc_dtyper^   r_   r`   archrX   r5   rg   rh   epi_c_stager4   sched_stage_make_smem_layoutsd_layoutc_layoutri   rj   rk   epi_c_smem_layout_staged)ro   rx   atom_npermutation_nmma_inst_shape_kmma_inst_tile_kr+   r+   r,   _setup_attributes  s   	
zGemmSm90._setup_attributesmAmBmDmCscheduler_argsvarlen_argsstreamc	                    s  |j _|j _|dur|j nd_|dur|j nd_t|_t|_|dur1t|nd_	|dur=t|nd_
tjjdkoLjjkrZtdj dj tjjjjkrrtdjj djj tjjdko~jjdkrtdt|du rt }|jdujksJ dd	 fd
d||fD \}}| tjd}	tjd}
d\}}tj rڈ||	jd jd fjd \}}||
jd jd fjd \}}tj|
_tj r
 jtj|	7  _d\}}t|dur.j|jjt|dr&|js(dndd\}}d\}}t|durFj|j jdd\}}!|t"#|}j$|j%dud}&|||||}|#|}|'||j(}j)r}|dur}t*jnd|durt*j nd tj+G  fddd}|_,-j.|tj r|n||||||||j/jjjj ||j0|j1ddgj|dd dS )a  Execute the GEMM operation in steps:
        - Setup static attributes
        - Setup TMA load/store atoms and tensors
        - Compute grid size
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param mA: Input tensor A
        :type mA: cute.Tensor
        :param mB: Input tensor B
        :type mB: cute.Tensor
        :param mD: Output tensor D
        :type mD: cute.Tensor
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        NrA   zType mismatch: z != zType width mismatch: r8   z#a_dtype should be float16 or float8c                    s   t  fdd jD S )Nc                 3   s4    | ]}t |st j|d  jj dn|V  qdS )r;   )divbyN)r   	is_staticassumeelement_typerO   ).0str+   r,   	<genexpr>  s
    "
z6GemmSm90.__call__.<locals>.<lambda>.<locals>.<genexpr>)tuplestrider   r+   r   r,   <lambda>  s    z#GemmSm90.__call__.<locals>.<lambda>c              	      s6   g | ]}|d urt |jt j|j |dnd qS )Nr   )r   make_tensoriteratorr   shape)r   r   )
new_strider+   r,   
<listcomp>  s    z%GemmSm90.__call__.<locals>.<listcomp>NNr   NNr   rC   r9   add_to_outputstoreadd)op_typeloadvarlen_mc                       sB  e Zd ZU ejjejjd f e	d< ejjejj
d f e	d< ejjejjd f e	d< ejjejf e	d< ejjejjjdurHjnef jf e	d< ejjejjjdurajne f jf e	d< e	d	< ejjejjjejf jf e	d
< ejjejjjejf jf e	d< dS )z(GemmSm90.__call__.<locals>.SharedStoragerC   ab_pipeline_array_ptrepi_pipeline_array_ptrsched_pipeline_array_ptr
tile_countNsDsCepisAsB)r   r    r!   r   structMemRanger^   Int64rg   __annotations__r   r   r   Alignr   rn   r   epi_get_smem_structr1   cosizeri   r   rj   r+   )epi_c_smem_sizeepi_smem_sizeepilogue_paramsro   r+   r,   SharedStorage  s<   
 r   )gridblockclusterr   min_blocks_per_mp)2r   r1   r   r   r   r   from_tensorr   r   r   r   r   rO   	TypeErrorr   mAIdxr7   r   r   slice_ri   rj   _make_tma_atoms_and_tensorsrQ   r3   size_in_bytesnum_tma_load_bytes_make_tma_epi_atoms_and_tensorsrk   rl   hasattrr   r   epi_to_underlying_argumentsr   to_underlying_argumentsget_scheduler_classmCuSeqlensMget_scheduler_argumentsget_grid_shapemax_active_clustersr5   r   r   rm   kernelr   r   launchr]   )ro   r   r   r   r   rx   r   r   r   a_smem_layoutb_smem_layout
tma_atom_atma_tensor_a
tma_atom_btma_tensor_b
tma_atom_dtma_tensor_d
tma_atom_ctma_tensor_cvarlen_paramsTileSchedulerClstile_sched_argstile_sched_paramsr   r   r+   )r   r   r   r   ro   r,   __call__a  s   







 
zGemmSm90.__call__r   r   mA_mklr   mB_nklr   mD_mnlr   mC_mnlr   r   r   r   r   epi_smem_layoutepi_c_smem_layoutr   r   c           b      C   s  t |jdu}t |jdu}|r|rJ t | jr|s|sJ t |du}t |	du}tjtj }|| jkrM||||fD ]}t |durLt	
| q?tj }|| j}| j|tdg|jR |j d}d}t |r| jt|d|j d}d}d}t |jdur| j||j |d}|j| jf}|jj|j|j d}|j!j|j|j d} d}!t |rt | j" rtj#|j$|j | j%d}"t&|"|j}!n
|j'j|j|j d}!d}#t |r|j(j|j|j d}#| )|
|}$t*j+||| j,t-|s|j.du r|jd	 n|j.jd	 t-|jd | j/|d
}%t0|j+|||}|| jkr3tj1| j2 || jkr3|| j| j3 k r3| j3dkpD|| jk}&|%4|||& |%5 }'|%6 }(tjtj7 })|8|)}*tj9||*dd}+tj9||*d	d},| j:rx|+nd	}+| j;r|,nd	},| j3dkp|| jk}-t t<|dkr|-otj7 d	k}-||-d}.|.= }/t>t?j@jA| jB}0t |r|%C  |/jDr|/jE}1|1d }2|%F|2| jG| jH|& t | j r|%I||2}3tJ|3tK| jLd	dg|1d	 df}4n9|%M|2}5t |rtJ|5| jLd	 f|1d	 f}6|}3n|sJ tN|5| jLd f}6tJ|| jLd	 f|1d	 df}3tJ|%O||2tK| jLddg|1d df}7|%P|& |%Q|2}8|%R|2}9d}:t | j rqtSjT||*d tt|dj|4||+|'d\}:};};nS| U|jV| jG| j3d }<tjW d	 tjjX| j  }=|<Y|=}>d\}:}?t |rtSjZ|>|3||6|8|1d	 | jLd	   |9d}:ntSj[|>|3||6|8|1d	 | jLd	   |9d\}:}?tSjT||*d	 tt|dj|7| |,|(d\}@};};t\|9| jLd }At | j r| ]||0|:|@|A}0n| j^||0|:|?|@|A|d}0|.j_|-d |.j`|-d |.a }/|/jDst | j/o| r'|.j`|-d |b|0 |-r3|.b  || jk r;tjc| jd te| j/ rK|d	kpW| j/oW|d	kpW|dk}&|%f|| g|
|& |%h }B|%i }CtjW \}=};};tj|=| jj }Dt | j/r|=| jj }=tj| j/s| jknd| jjd}E|Y|E| j/s|Dnd	}F|l|Fm|}G|n|Fo| }H|ptjK| jLd	dgd}Itq|I| jr}Jd}Kt | jsrtq|I| jr}Kt | j/r|Dd	kr| jtd	dd | jtd	dd t\|jd | jLd }Lt<t\| jLdd | ju}Mt>t?j@jv| jB}N| w }Ot>t?j@jv| jx}Pt>t?j@jA| jx}Q| }.d}/t | j/rt |r:|.= }/|dkry|Py|M |Qy|M t | rU|Ny|L n|%jR|/jEd d}9t\|9| jLd }A|Ny|A |.`  t |ry|.a }/t | r|.= }/n|.= }/t |r|%C  |/jDr+|/jE}1|1d }2| z|
|j|2\}R}S|%{|2| j||R|S|& |%R|2}9t\|9| jLd }A| }||N||G|H|J|K|A|D	\}N}t |r|Ad	kr|J~d t | j/r| |Dd t?jttj| jtjjX d}T|%|& d}Ut |r| j||%||2| jLdd | ju|!|1|Bd\}U};};d}Vt |r@| ||%|	|2| jLdd | ju|#|1\}W};};tS|W|}V| j%durI| j%ntj}X| || j||X|!|=\}Y}Z}[|Y|J}\t0| j|\}]t |r{| || j| j|#|Zj|=\}^}_}`}and\}^}a}_}`t | j" r|T  | |
|J||1|= | |
|$|C||O|P|Q| ju|]|Z|_d|Y|[|^|`|a|U|V|1|%|T|.|=|&\}P}Qt | j/r|&r|Ob  | jtd|D dd  t | j/ r|.`  |.a }/nL|Py|M |Qy|M t | r|Ny|L |.j`| jkd! |.a }/n+|.`  |.a }/|/jDr'|%jR|/jEd d}9t\|9| jLd }A|Ny|A |.`  |.a }/|/jDst | j/ r=|&r?|Ob  dS dS dS dS )"a7  
        GPU device kernel performing the batched GEMM computation.

        :param tma_atom_a: TMA copy atom for A tensor
        :type tma_atom_a: cute.CopyAtom
        :param mA_mkl: Input tensor A
        :type mA_mkl: cute.Tensor
        :param tma_atom_b: TMA copy atom for B tensor
        :type tma_atom_b: cute.CopyAtom
        :param mB_nkl: Input tensor B
        :type mB_nkl: cute.Tensor
        :param tma_atom_d: TMA copy atom for D tensor
        :type tma_atom_d: cute.CopyAtom
        :param mD_mnl: Output tensor D
        :type mD_mnl: cute.Tensor
        :param tiled_mma: Tiled MMA object
        :type tiled_mma: cute.TiledMma
        :param cluster_layout_mnk: CTA layout
        :type cluster_layout_mnk: cute.Layout
        :param a_smem_layout: Shared memory layout for A
        :type a_smem_layout: cute.ComposedLayout
        :param b_smem_layout: Shared memory layout for B
        :type b_smem_layout: cute.ComposedLayout
        :param epi_smem_layout: Shared memory layout for epilogue
        :type epi_smem_layout: cute.ComposedLayout
        Nr9   )r   cluster_layout_vmnkab_pipeline_mbar_ptrr   )c_smem_layoutepi_pipeline_mbar_ptr)sched_pipeline_mbar_ptrvarlen_k)swizzle)dtyper   )len_m_staticlen_k_staticr4   warp_idxr}   )is_scheduler_warprD   rC   )r   Nr   )	cta_coord
cta_layout
src_tensor
dst_tensor
mcast_masktma_desc_ptrr@   r   )limit_mlimit_k)Nr   r   r   rF   r   mma)warp_group_idxstager   )	batch_idxg        )
barrier_idnum_threads)r  )NNNNr  )advance_count)r   cu_seqlens_mcu_seqlens_kr7   r   r   make_warp_uniformr  rd   r	   prefetch_descriptorr^   r_   SmemAllocatorallocaterm   make_ab_pipeliner   r   r   data_ptrmake_epi_pipeliner   r   tile_count_semaphoremake_sched_pipeliner   r   
get_tensorr   r   outerinnerr   r5   
recast_ptrr   r   r   r   r   epi_get_smem_tensorsr   creater/   r   r   r4   r   warpgroup_reg_deallocre   rc   init_tensormap_ABget_tma_desc_a_ptrget_tma_desc_b_ptrblock_idx_in_clusterget_flat_coordmake_layout_image_maskrV   rW   r   initial_work_tile_infor   pipelinePipelineUserTypeProducerrg   fence_tensormap_initis_valid_tiletile_idxupdate_tensormap_ABr   r   offset_batch_A
local_tileselectrQ   offset_batch_AIdxflat_divideoffset_batch_Bfence_tensormap_update_ABlen_mlen_k
copy_utilstma_get_copy_fn_make_gmem_tiled_copy_Ar   
thread_idx	WARP_SIZE	get_slicegather_m_get_copy_fngather_k_get_copy_fnceil_divload_ABload_AB_gather_Afetch_next_workadvance_to_next_workget_current_workproducer_tailwarpgroup_reg_allocrf   r   init_tensormap_epiepi_get_tma_atomsget_tma_desc_d_ptrget_tma_desc_epi_ptrsr\   r[   make_fragment_Apartition_Amake_fragment_Bpartition_Bpartition_shape_Cmake_fragmentr0   rP   pingpong_barrier_arriverl   Consumermake_epi_store_pipeliner   advance_iters&epi_get_tensormap_update_shapes_ordersupdate_tensormap_epir   r  fillpingpong_barrier_syncNamedBarrierintr   r$   rb   fence_tensormap_update_epiepilog_gmem_copy_and_partitionoffset_batch_epitma_producer_copy_fnBFloat16epilog_smem_store_and_partitionretileepi_load_acc_subtileepilog_smem_load_and_partitionr   r   layoutarrive_and_waitepi_visit_accepilogue)bro   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r  has_Dhas_Cr  tma_atomsmemstorageab_pipelineepi_pipelinesched_pipeliner   r   r   r   sD_ptrr   epi_smem_tensorsvarlen_manageris_tma_warptma_desc_a_ptrtma_desc_b_ptrcta_rank_in_clusterblock_in_cluster_coord_mnka_mcast_maskb_mcast_maskr  tile_scheduler	work_tileab_producer_statetile_coord_mnklr  mA_mkgA_mkmAIdx_mkgAIdxgB_nkrE  rF  copy_A_tiled_copy_Atidx
thr_copy_A
prefetch_Acopy_B
k_tile_cnttma_desc_d_ptrtma_desc_epi_ptrsr  warp_group_thread_layoutthr_mmatCrAtCrB	acc_shapeaccacc_slowk_tile_cnt_static
c_tile_cntab_read_stateepi_store_pipelineepi_read_stateepi_producer_state
epi_shapes
epi_ordersepilogue_barriercopy_Dcopy_C	copy_C_fnd_dtype_for_layouttiled_copy_r2stRS_rDtRS_sDtRS_rAccload_acc_subtiletiled_copy_s2rtRS_rCtSR_rCtSR_sCr+   r+   r,   r     s  1











	










	
	
	x


















	











    fzGemmSm90.kernelNr}  r  r  r  r  copy_SFAcopy_SFBreturnc                 C   s   t |d u}t |r|d usJ td}	d|k r||}	tj|ddD ]N}
|||	 ||}|j}t |d urA||
||d ||
||d t |rZ||
||d ||
||d || |	  td}	|
d |k rr||}	q$|S )NTr   r9   unrolltma_bar_ptr)
r   r   producer_try_acquirer^   rangeproducer_acquireproducer_get_barrierindexproducer_commitadvance)ro   r}  r  r  r  r  r  r  blockscaledpeek_ab_empty_statusk_tiler  smem_idxr+   r+   r,   rP  !  s.   



zGemmSm90.load_ABr  r   c                 C   s  t jt j }td}	d|k r||}	tj|d ddD ]X}
d}t|d ur.||
f}|| j	t|r:|
| j
 nd k}|||	| |j}|rV||}||
||d ||
|g|R   || |  td}	|
d |k rw||}	qd|k r|d }
d}t|d ur||
ddf}|| j	t|r|
| j
 nd k}|||	| |j}|r||}||
||d ||
|g|R ddi || |  |S )	NTr   r9   r  r+   r  )predr  )r   r   r   r  r   r  r^   r  r   rd   rc   r  r  r  producer_cpasync_commitr  )ro   r}  r  r  r  r  r  r   r  r  r  prefetch_outr  r  r  r+   r+   r,   rQ  K  sP   






zGemmSm90.load_AB_gather_Ar  r  r  r  r  r  c
              	   C   s  d}
|  }t|
|}t| jr| j|	dd td}d|k r$||}|tj	j
d tj|dgd}t|D ]I}||| t  tj|dd	D ]}d d ||jf}t|||| || | |tj	j
d qLt  |  td}|d |k r||}q9t| jrtd ||  tj||dd
D ]|}||| t  t| jr|tj	j
d tj|dd	D ]}d d ||jf}t|||| || | |tj	j
d qt  t| j rt|
 ntd || |   || |  |  td}|d |k r||}qt| jr(| jd|	 dd t| j r4td tj|dd
D ]}|| |  q;t| jrU||  ||fS )Nr9   r  r  Tr   FrC   r}   )unroll_fullr  )cloneminr   r4   rh  r   consumer_try_waitsetr   Field
ACCUMULATEr   r   r^   r  consumer_waitfencer  gemmcommit_groupr  rP   
wait_groupr   r   consumer_releasera  )ro   r}  r  r   r  r  r  r  r  r  k_pipe_mmasab_release_statenum_prologue_mmapeek_ab_full_statusnum_k_blocksr  	k_blk_idxk_blk_coordr+   r+   r,   r    sr   











zGemmSm90.mmaparamsr  .r  r~  r  r  r  rl   r  r  r  tiled_copy_t2rr  r  r  r  r  r  r  r  r  r  r  r  c           *         s  t |d u}t  d utt| jd d |jd }tj|dd}t|}|j| }| 	||||||||	}t |d urjt
jt|| jddD ]} || }!re|| ||!|d || |  qL fdd}"d	}#d
\}$}%t
|D ]} || }&|	|
|  | |||&}'t |r|| t||d d d |jf | tjjtjjjtjjjd tj  tj  || W d    n1 sw   Y  |  t |d uo| | j |k r|| | j }!r|| ||!|d || |  |  ||'|
|}(||  | j! })t |#r-t | dkr(|"|$|%d |)|&}$}%t r?t"#||
|d d d |)f  t |# rK|"|)|&d qt |#rW|"|$|%d | $|||||||| ||fS )NrC   r9   )r9   r   rz   r  )src_idxproducer_statec                    sl   t jjt jjjt jjjd   rtr | |d t	fdd t	fdd   d S )Nspacer  dst_idxc                            S N)r  r+   r  r+   r,   r   (      z9GemmSm90.epilogue.<locals>.tma_store_fn.<locals>.<lambda>c                      r  r  )r  r+   r  r+   r,   r   )  r  )
r   r   fence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctaru  r   r   r  r  r  r  rx  r  r+   r,   tma_store_fn  s   z'GemmSm90.epilogue.<locals>.tma_store_fnFr   r  r   r  )%r   r   zipped_divider   rQ   r   r   r   num_tiles_executed	epi_beginr^   r  r  r   get_hier_coordr  r  r  range_constexprepi_begin_loopr  copyr  r   r  r  r  r  r  	sync_warp	elect_oner  epi_visit_subtilerh   rG  cvt_copyepi_end)*ro   r  r  r  r~  r  r  r  rl   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  ry  epi_tile_shapeepi_tile_layoutepi_tile_numnum_prev_subtilesepi_tensorsepi_idxgmem_coord_Cr  delay_tma_storesrc_idx_prevdst_idx_prev
gmem_coordepi_loop_tensorstRS_rEpi
epi_bufferr+   r  r,   rw    s   















zGemmSm90.epiloguec                 C   s   |st S tS )zPReturn the scheduler class to use. Override in subclasses for custom schedulers.)r   r   )ro   r   r+   r+   r,   r   e  s   zGemmSm90.get_scheduler_classc           	      C   s&  t |jdu rN|dur|jd n|jdu r|jd n|jjd d }t|jd | jd t|jd | jd |f}t||j|j	| j
|j|j| jd}|S |dusW| jrWJ dt|jd | jd |jjd d f}t||durw|jd n|jjd |j|j|j	| jdd | j
|j| jd	}|S )zICreate scheduler arguments. Override in subclasses for custom schedulers.NrC   r   r9   )problem_shape_ntile_mnlraster_order
group_sizer3   r'  batch_idx_permuter5   )	r  total_mr  r  r  r2   r3   r'  r5   )r   r   r   mCuSeqlensKr   rO  rQ   r   r  max_swizzle_sizer3   r'  r  r5   r7   r   r   )	ro   r   r   r   r   r   num_problemsr  r   r+   r+   r,   r   i  sL   	



z GemmSm90.get_scheduler_argumentsr  r  c                 C   s4   t t|D ]}||t| |  ||< qd S r  )r^   r  r   r   )ro   r  r  r  epi_vr+   r+   r,   rr    s   zGemmSm90.epi_load_acc_subtilec
           
      C      dS Nr+   r+   )
ro   r  r  rl   r  r  r  r  r  r  r+   r+   r,   r    s   zGemmSm90.epi_beginr  	epi_coordc                 C   r  r  r+   )ro   r  r  r  r+   r+   r,   r    s   zGemmSm90.epi_begin_loopr  c                 C      d S r  r+   )ro   r  r  r  r  r+   r+   r,   r    s   zGemmSm90.epi_visit_subtilec                 C   r  r  r+   )ro   r  r  r   r  r  r+   r+   r,   rv    s   zGemmSm90.epi_visit_accc	           	      C   r  r  r+   )	ro   r  r  rl   r  r  r  r  r  r+   r+   r,   r    s   zGemmSm90.epi_end)locipargsc                C   s   |   S r  )EpilogueParams)ro   r  r  r  r+   r+   r,   r     s   z$GemmSm90.epi_to_underlying_argumentsc                C   s   g S zSubclasses can override thisr+   )ro   r  r  r  r+   r+   r,   rX    s   zGemmSm90.epi_get_tma_atomsr  r  c                C   s   g g fS r!  r+   )ro   r  r  r  r  r  r+   r+   r,   re    s   
z/GemmSm90.epi_get_tensormap_update_shapes_ordersrQ   c                 C   r  Nr   r+   )r  rQ   rl   r+   r+   r,   epi_smem_bytes_per_stage  s   z!GemmSm90.epi_smem_bytes_per_stagec                 C   s   t jjtdf S r"  )r   r   r   r   )ro   r  r+   r+   r,   r     s   zGemmSm90.epi_get_smem_structc                 C   s   t  S r  )r   )ro   r  r|  r+   r+   r,   r-    s   zGemmSm90.epi_get_smem_tensorsr  r  r   c                 C   B   |dv sJ |dkrt jnt j}tjjt|| d| j d d S Nr$  r  rC   )r  number_of_threads)r   r&   r(   r   r   barrierrj  r\   ro   r  r  r(  r+   r+   r,   rh       

zGemmSm90.pingpong_barrier_syncc                 C   r%  r&  )r   r&   r(   r   r   barrier_arriverj  r\   r)  r+   r+   r,   ra    r*  z GemmSm90.pingpong_barrier_arrivec                 C   sP   t tj| jd ur| j nd| jd d dkrdnddt}t ||}|S )NFr9   rA   r   rF   rC   )num_matrices)	r   make_copy_atomr
   StMatrix8x8x16bOpr   is_m_major_crl   r   make_tiled_copy_C_atom)ro   r   copy_atom_Ctiled_copy_C_atomr+   r+   r,   epilog_smem_copy_atom  s   zGemmSm90.epilog_smem_copy_atomr   r	  r   c                 C   s   |d u rt j}| |}tj||| jd}t||}||}	|d ur)|		|nd }
|d ur6|j
d d n| j}|	t|j
}t|| j}|||
fS )N)	elem_ty_delem_ty_accrC   )r   	ROW_MAJORr3  r   sm90_get_smem_store_opr0   r   make_tiled_copy_SrL  partition_Dr   rl   partition_Smake_identity_tensorr`  )ro   r   r   r	  r   r  r2  copy_atom_r2sr  thr_copy_r2sr  sD_shapetRS_rD_shaper  r+   r+   r,   rp    s   


z(GemmSm90.epilog_smem_store_and_partitionr   r   tRS_rD_layoutc                 C   sX   |  |}t||}t||}	|	|}
|
|}t||}|
|}|	|||fS r  )	r3  rG  sm90_get_smem_load_opr   r8  rL  r:  r`  rq  )ro   r   r   r	  r   r@  r  r2  copy_atom_s2rr  thr_copy_s2rr  r  r  r+   r+   r,   rs  1  s   
	


z'GemmSm90.epilog_smem_load_and_partitionatommD_mnr  c                 C   sj   t |||d d }t ||}	t|jtjtjf}
|
r!||	fn|	|f\}}tj	|dt 
d|||dS )NrC   r   r9   )r  r  r  r  r  )r   r?  r  
isinstancer   r	   CopyBulkTensorTileS2GOpCopyReduceBulkTensorTileS2GOprG  rH  r   )ro   rD  rE  r2   rl   r   r  r  gDtDgD_for_tma_partitionis_s2gr  r  r+   r+   r,   rl  C  s   z'GemmSm90.epilog_gmem_copy_and_partitionr  r  c           
      C   s   t | j rdnd| jd  }ttjj|}| j| j d }||j	 t
jj }ttjj|}| js6tjnt}	|	j|| j||| j|dS )Nr9   r@   )barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk)r   r7   rc   r7  CooperativeGroupAgentThreadrT   rU   r   r   r   rK  PipelineTmaAsyncr   r.  rg   r   )
ro   r   r  r  producer_cntab_pipeline_producer_group
mcast_sizeconsumer_arrive_cntab_pipeline_consumer_grouppipeline_clsr+   r+   r,   r$  _  s    zGemmSm90.make_ab_pipeliner  r  c                 C   sJ   t t jj}| j}t t jj|}t| j|}t jj	|| j
|||dS )N)rL  rM  rN  rO  rP  )r7  rR  rS  rT  rb   r   r   r   rU  r.  r   )ro   r  r  epi_pipeline_producer_grouprY  epi_pipeline_consumer_grouptma_copy_c_bytesr+   r+   r,   r&  x  s   zGemmSm90.make_epi_pipelinec                 C   s0   | j tjj }ttjj|}tjj	| j
|dS )N)rM  rN  )rb   r   r   rK  r7  rR  rS  rT  PipelineTmaStorer.  rh   )ro   num_epi_threadsepi_store_producer_groupr+   r+   r,   rc    s
   z GemmSm90.make_epi_store_pipeliner  r  c                 C   sz   t t jj}t|}| jr|r| jndd | j | d }t t jj|}t j	j
|| j||t|dkr9d dS ddS )Nr9   rF   r   )rL  rM  rN  rO  consumer_mask)r7  rR  rS  rT  r   r   r4   r[   rc   PipelineAsyncr.  r   r   )ro   r   r  r  sched_pipeline_producer_groupcluster_sizerY  sched_pipeline_consumer_groupr+   r+   r,   r(    s*   
zGemmSm90.make_sched_pipeliner   r   r   ra   rX   r   c                 C   s"  |d dkrdnd}|
rd}n|durt ||j d nd}|| ||| }|| }|du r2dn	|d dkr:dnd}|durN|t ||j d | 7 }t |d}t |d	}t ||j d t ||j d  }d
}||	 | | }|| }|
s|dkr||||  | 7 }|||fS )a  Computes the number of stages for A/B/C operands based on heuristics.

        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
                 (A/B operand stages, epilogue stages)
        :rtype: Tuple[int, int]
        r9   rA   rF   rC   r   Nr8   Nr   Nr   NNrN   )r   r   rO   r#  r   )clsrQ   rl   r1   r   r   r   rx   ra   rX   r   rh   	epi_bytesd_bytes_per_stageepi_bytes_per_stager   a_shapeb_shapeab_bytes_per_stagembar_helpers_bytesremaining_bytesrg   r+   r+   r,   r     s,     &
zGemmSm90._compute_stagesrS   r   epi_tile_overridec                 C   s  |dur|S | d d dkr0|d dkr0t dtj| dgd}t dtj| dgd}||fS | d d dkrZ|d dkrZt dtj| dgd}t dtj| dgd}||fS |dure|jdkred	nd}t d	tj| dgd}t |tj| dgd}||fS )
aV  Compute the epilogue tile shape or use override if provided.

        :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param element_type: Data type of elements
        :type element_type: type[cutlass.Numeric]
        :param is_cooperative: Whether to use cooperative approach
        :type is_cooperative: bool
        :param epi_tile_override: Optional override for epilogue tile shape
        :type epi_tile_override: Tuple[int, int] or None

        :return: Computed epilogue tile shape
        :rtype: Tuple[int, int]
        Nr   r;   r9   r}   r@   r<   r8   r:   )rY   gcdr   r   rO   )rQ   rS   r   rr  tile_mtile_nn_perfr+   r+   r,   r     s   z-GemmSm90._sm90_compute_tile_shape_or_overrider   r   rg   rh   r   c                 C   s  t | d}| tjjk}| tjjk}| |rdnd }tt||||}t j	|t 
|||r5dndd}t | d}| |rDdnd }tt||||}t j	|t 
|||r^dndd}d	}|d	urpt||||	}d	}|
d	ur|d	us|J t|
|||}||||fS )
ag  Create shared memory layouts for A, B, and C tensors.

        :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param epi_tile: Epilogue tile shape
        :type epi_tile: Tuple[int, int]
        :param a_dtype: Data type for matrix A
        :type a_dtype: type[cutlass.Numeric]
        :param a_layout: Layout enum for matrix A
        :type a_layout: LayoutEnum
        :param b_dtype: Data type for matrix B
        :type b_dtype: type[cutlass.Numeric]
        :param b_layout: Layout enum for matrix B
        :type b_layout: LayoutEnum
        :param ab_stage: Number of stages for A/B tensors
        :type ab_stage: int
        :param d_dtype: Data type for output matrix D
        :type d_dtype: type[cutlass.Numeric]
        :param d_layout: Layout enum for the output matrix C
        :type d_layout: LayoutEnum
        :param epi_stage: Number of epilogue stages
        :type epi_stage: int

        :return: Tuple of shared memory layouts for A, B, and C
        :rtype: Tuple[cute.ComposedLayout, cute.ComposedLayout, cute.ComposedLayout]
        rg  rC   r   )r   r9   rC   )r9   r   rC   rz   rh  r9   N)r   r   r   r   OperandMajorModeKmake_smem_layout_atomr   get_smem_layout_atomtile_to_shapeappendquack_sm90_utilsmake_smem_layout_epi)rQ   rl   r1   r   r   r   rg   r   r   rh   r   r   r   a_smem_shapea_is_k_majorb_is_k_majora_major_mode_sizea_smem_layout_atomri   b_smem_shapeb_major_mode_sizeb_smem_layout_atomrj   rk   r   r+   r+   r,   r     sP   ,



zGemmSm90._make_smem_layoutstensor_drk   r   )r   r   r   c           	      C   sv   |dv sJ t |d}t t | j|}|dkrt n|dkr&t ntt j	j
}t|| ||\}}||fS )a  Create TMA atoms and tensors for storing D or loading C.

        :param tensor_d: Output tensor D
        :type tensor_d: cute.Tensor
        :param epi_smem_layout_staged: Shared memory layout for epilogue
        :type epi_smem_layout_staged: cute.ComposedLayout
        :param epi_tile: Epilogue tile shape
        :type epi_tile: Tuple[int, int]

        :return: TMA atom and tensor for C
        :rtype: Tuple[cute.CopyAtom, cute.Tensor]
        )r   r   r   r   r   r   )r   r   compositionmake_identity_layoutr   r	   CopyBulkTensorTileG2SOprG  rH  ReductionOpADDmake_tiled_tma_atom)	r  rk   rl   r   r   d_cta_v_layoutr   r   r   r+   r+   r,   r   q  s   z(GemmSm90._make_tma_epi_atoms_and_tensorstensorsmem_layout	smem_tile	mcast_dimc                 C   s8   |dkrt  nt  }t j|| |||d\}}||fS )a  Create TMA atoms and tensors for input tensors.

        :param tensor: Input tensor (A or B)
        :type tensor: cute.Tensor
        :param smem_layout: Shared memory layout for the tensor
        :type smem_layout: cute.ComposedLayout
        :param smem_tile: Shared memory tile shape
        :type smem_tile: Tuple[int, int]
        :param mcast_dim: Multicast dimension
        :type mcast_dim: int

        :return: TMA atom and tensor
        :rtype: Tuple[cute.CopyAtom, cute.Tensor]
        r9   )num_multicast)r	   r   CopyBulkTensorTileG2SMulticastOpr  )r  r  r  r  r   rz  
tma_tensorr+   r+   r,   r     s   
z$GemmSm90._make_tma_atoms_and_tensorsr;   c                 C   s   t jtjtjjd||d}||j }d| }t | jd | }||kr+t	
||}t j|| |f|dfd}	|tjkr^t | jd | }
|
|krQt	
|
|}
t j|
||
 fd|
fd}	|tjkrjt d|fnt |df}t ||	|S )N)
cache_mode)num_bits_per_copyrN   rC   r9   r   r   )r   r-  r	   	CopyG2SOpLoadCacheModeGLOBALrO   r   rQ   rY   rs  r   r   r6  make_tiled_copy_tv)ro   r	  
major_moder  	copy_bitsatom_async_copy
copy_elemsloads_per_cache_lineshape_dim_1thread_layoutshape_dim_0value_layoutr+   r+   r,   rI    s2   


z GemmSm90._make_gmem_tiled_copy_Aa_majorb_majorc                 C   s   d}| t tjtjtjhvrd}|t tjtjtjhvrd}|tt hvr$d}|dtt tjtjtjhvr3d}| jdkr>| |kr>d}| j|jkrFd}| jdkrO|dksX|jdkrZ|dkrZd}|S )a  
        Check if the dtypes are valid

        :param a_dtype: The data type of tensor A
        :type a_dtype: Type[cutlass.Numeric]
        :param b_dtype: The data type of tensor B
        :type b_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param d_dtype: The data type of the output tensor
        :type d_dtype: Type[cutlass.Numeric]
        :param a_major: major mode of tensor A
        :type a_major: str
        :param b_major: major mode of tensor B
        :type b_major: str

        :return: True if the dtypes are valid, False otherwise
        :rtype: bool
        TFNrA   r8   k)r   r^   ro  Float8E4M3FN
Float8E5M2r   rO   )r1   r   r0   r   r  r  is_validr+   r+   r,   is_valid_dtypes  sB   $zGemmSm90.is_valid_dtypes)FTFFr   )T)Fr  )r;   )[r   r    r!   __doc__r   r/   rj  r   r   EpilogueArgumentsr   r   r   r^   Numericr   boolrw   r   r   jitTensorr   r   r   cudaCUstreamr   r   TiledMmaCopyAtomr   ParamsLayoutComposedLayout	Constexprr   r7  rc  PipelineStater   rP  rQ  r  listPointerTile	TiledCopyThrCopyCoordri  r   rw  r   r   rr  r  r  r  rv  r  r   rX  r   re  staticmethodr#  r   r-  r   rh  ra  r3  r   rp  rs  r   rl  r$  r&  rc  r(  classmethodr   r   r   r   r   rI  strr  r+   r+   r+   r,   r-   W   s  
 $
	
 [	 0	
    		
)		>	
V
	
 
5	



	







	





	

?
(
	
[
!

"r-   );r"   typingr   r   r   r   r   r   	functoolsr   rY   cuda.bindings.driverbindingsdriverr  r^   cutlass.cuter   cutlass.pipeliner7  cutlass.cute.nvgpur	   r
   r   cutlass.utils.hopper_helpersr_   hopper_helpersr   r   r   r   r   r   cutlass.cutlass_dslr   cutlass.utilsr   quack.cute_dsl_utilsr   r   quack.tile_schedulerr   r   r   r   r   quack.varlen_utilsr   r   quack.pipeliner   r   quack.copy_utilsrG  quack.sm90_utilsr}  IntEnumr   r-   r+   r+   r+   r,   <module>   s,    &