o
    c۷iJ                    @   sZ  d dl Z d dlmZmZmZmZmZmZ d dlm	Z	 d dl
Z
d dlm  mZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d dlm  mZ d dlmZmZm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d d	l(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d d
l/m0Z0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7mZ8 	 G dd de j9Z:G dd dZ;dS )    N)TupleTypeCallableOptionalUnionLiteral)partial)pipeline_init_arrivepipeline_init_wait)cpasyncwarp	warpgroup)Int32Float32Float16Boolean
const_expr)
LayoutEnum)
ParamsBaseArgumentsBase)TileSchedulerOptionsTileSchedulerArgumentsTileSchedulerVarlenMTileSchedulerArgumentsVarlenMTileSchedulerPersistenceMode)VarlenArgumentsVarlenManager)make_pipeline_statePipelineTmaCpAsyncc                   @   sD   e Zd Ze Ze Ze Ze Ze Z	e Z
e ZdS )NamedBarrierGemmN)__name__
__module____qualname__enumautoEpilogueEpilogueLoadMmaWG0MmaWG1EpiWG0EpiWG1TmemPtr r-   r-   E/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/gemm_sm90.pyr    L   s    r    c                3   @   s  e Zd ZdZdZeZeZ				dde	e
j de	e
j deeef deeeef d	ed
ededefddZdefddZejdejdejdeej deej dededee dejfddZejdejdeej dejdejdejd eej d!eej d"eej d#eej d$ed%ejd&ej d'ej!d(ej!d)ej!d*ej!d+ed,e
j"e# f$d-d.Zej	/	/dd0e
j$j%d1e
j$j&d2ee# d3e#d4e'd5ee# d6ee# d7e
j$j&fd8d9Z(ej	dd0e
j$j%d1e
j$j&d2e#d:ee# d3e#d4e'd;ed7e
j$j&fd<d=Z)ejd0e
j$j%d>e
j$j&d?e#d@ejdAeej d4e'dBe'd7e
j$j&fdCdDZ*ejdEedFeejdGf dHe
j$j%dIe
j$j%dJe
j$j&dKee
j$j& dLej+dMe#dNejdOeej dPeej, dQej,dRejdSeej- dTeej dUeej dVee# dWee# dXej.dYedZe
j$j/d[e'd\e0d7ee
j$j&e
j$j&f f0d]d^Z1dd;efd_d`Z2dejdejdeej fdadbZ3ejdcejdNejddefdedfZ4ejdEedFeejdGf dLej+dPeej, dQej,dXej.dYedZe
j$j/d[e'd7eejdGf fdgdhZ5dEedieejdGf djej.d7eejdGf fdkdlZ6	/ddEedmeejdGf dNejdOeej d7eej f
dndoZ7dEed@ejdejdXej.d[e'd7d/fdpdqZ8ejdEedieejdGf dLej+dPeej, dQej,dXej.d7d/fdrdsZ9d/d/dtdued7efdvdwZ:d/d/dtdEed7e;ej fdxdyZ<e=duee dzeeeef dLej+d7efd{d|Z>dEefd}d~Z?dEed7eejdGf fddZ@dBe'deAd fddZBdBe'deAd fddZCdejd7ej,fddZDdejdeeE de	e
j deej d[e'd7eej,ejejf fddZFdejdeEde	e
j dejde
j d[e'd7eej,ejejf fddZGdeHejej,f dejdej+dLej+dejdXej.d7eejejf fddZIdejdej dejJfddZKdej ej!B dejJfddZLdd ZMd&ej dejJdefddZNeOdzeeeef dLeeef de	e
j de	e
j dee	e
j  dee	e
j  dededed7eeef fddZPe=	/	/ddzeeeef deeeef dee	e
j  deeef d/B d7eeef f
ddZQe=dzeeeef dLeeef de	e
j deEde	e
j deEdedee	e
j  deEdedee	e
j  deeE ded7eej!ej!ej!eej! f fddZRe=dejdej!dLeeef deAd d7eejejf f
ddZSe=dejdej!deeef ded7eejejf f
ddÄZTdddƄZUe=de	e
j de	e
j de	e
j dee	e
j  deVdeVd7efddʄZWd/S )GemmSm90aX  
    This class implements batched matrix multiplication (C = A x B) with support for various data types
    and architectural features specific to Hopper GPUs with persistent tile scheduling and warp specialization.

    :param acc_dtype: Data type for accumulation during computation
    :type acc_dtype: type[cutlass.Numeric]
    :param tile_shape_mn: Shape of the CTA tile (M,N)
    :type tile_shape_mn: Tuple[int, int, int]
    :param cluster_shape_mnk: Cluster dimensions (M,N,K) for parallel processing
    :type cluster_shape_mnk: Tuple[int, int, int]

    :note: Data type requirements:
        - For 16-bit types: A and B must have the same data type
        - For 8-bit types: A and B can have different types (Float8E4M3FN/Float8E5M2) as long as both are 8-bit
        - Float8 types only support k-major layout

    :note: Supported data types:
        - Float16
        - BFloat16
        - Float8E4M3FN/Float8E5M2

    :note: Supported accumulation types:
        - Float32 (for all floating point inputs)

    :note: Constraints:
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 4

    Example:
        >>> gemm = GemmSm90(
        ...     acc_dtype=Float32,
        ...     tile_shape_mn=(128, 256),
        ...     cluster_shape_mnk=(1, 1, 1)
        ... )
        >>> gemm(a_tensor, b_tensor, c_tensor, stream)
    Z   FT	acc_dtypea_dtypetile_shape_mncluster_shape_mnkpingpongis_persistentfp8_fast_accumgather_Ac	                 C   s  || _ || _|| _| jr| jsJ d| o|jdk| _|| _|r+|d dks+J d|| _g |dR | _| jd | jd }	}
| js|	dvrLtd|	dv rm|	d	krVd
nd}|
d dkrb|
|ksltd|	 d| n@|
d dkrw|
d
ks|
d dkr|
dkstdn'|	dvrtd|	dkrd
n|	dkrdnd}|
d dkr|
|kstd| | js|	dkrd\}}n$|	d	kr|
dkrd\}}nd\}}n| jd d
k r| jd d nd}d}|dv r|dv sJ nd\}}||df| _	| jd | _
| jr| j
dksJ | jd | _| j
dk| _| jdk| _d| _t| j	| js$dnd | _| jr4| jdks4J | jdv s<J d| _| jd | j | _tjd| _| jsV| jndd | _| jsadnd| _| jd | _t| jd d t| j	| j  }| jr|d9 }| js| jd!krd"\| _| _n#|dk}|sd#nd$\| _| _n| jd!krd%\| _| _nd&\| _| _d | _d | _d | _d | _ d | _!d | _"d | _#d'| _$d S )(a5  
        Initializes the configuration for a Hopper dense GEMM kernel.

        This configuration includes data types for operands, tile shape, cluster configuration,
        and thread layout.

        :param acc_dtype: Data type for accumulation during computation
        :type acc_dtype: type[cutlass.Numeric]
        :param tile_shape_mn: Shape of the CTA tile (M,N)
        :type tile_shape_mn: Tuple[int, int]
        :param cluster_shape_mnk: Cluster dimensions (M,N,K) for parallel processing
        :type cluster_shape_mnk: Tuple[int, int, int]
        z+Pingpong gemm requires persistent scheduler      z'Cluster shape N must be 1 for gather A r   )@            @  z+CTA tile shape M must be 64/128/192/256/320)r=   r?   r=   r>          zIf tile_m == z2, CTA tile shape N must be divisible by 32 and <=    i   zRCTA tile shape N must be divisible by 16 and <= 256, or divisible by 32 and <= 512)r;   r<   r=   z/CTA tile shape M must be 64/128/192 if pingpongr;   r<      z0CTA tile shape N must be divisible by 16 and <= r?   )r:      )   r:   rD   )r:   rD   rE   )r:   r:   sm_90   NrE   )rA   r@   )(      )      )8      )rL         )%r1   r5   r6   widthfp8_slow_accumr8   r4   cta_tile_shape_mnk
ValueErroratom_layout_mnknum_mcast_ctas_anum_mcast_ctas_b
is_a_mcast
is_b_mcast	occupancymathprodmma_warp_groupsnum_threads_per_warp_groupthreads_per_ctacutlassutilsget_smem_capacity_in_bytessmem_capacitynum_epi_warpsnum_ab_load_warpsab_load_warp_idnum_regs_loadnum_regs_mmaab_stage	epi_stagea_smem_layout_stagedb_smem_layout_stagedepi_smem_layout_stagedepi_tileshared_storagebuffer_align_bytes)selfr1   r2   r3   r4   r5   r6   r7   r8   tile_Mtile_N
tile_N_maxatom_layout_matom_layout_nregs_per_threadheavy_register_pressurer-   r-   r.   __init__   s   (



zGemmSm90.__init__epilogue_argsc                 C   s  t j| j| j| j | j | j| jd| j	d | jd  fd| _
t| jd dkrR| jd }tjd| j	d | d |fdd}tjt| j
j| jd|dfd| _
tj| j
jd	gd
}d}| j	d | j	d || f| _	t| j| _| | j	| j| j| _| | j	| j| j| j| j| j|tjd| j | j	\| _ | _!| _"| j#rd	nd| _$| %| j	| j| j| j| j| j| j | j| j&| j!| j| j'| j"\| _(| _)| _*| _+dS )a  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout
        r;   r:   )tiler_mnr9   )r   rD   r:   orderN)permutation_mnkrD   moderG   r   sm_),
sm90_utilsmake_trivial_tiled_mmar2   b_dtypea_layoutsm90_mma_major_modeb_layoutr1   rT   rR   	tiled_mmar   cutemake_ordered_layoutmake_tiled_mmamake_mma_atomopsize	shape_mnkmake_layoutr4   cluster_layout_mnk$_sm90_compute_tile_shape_or_overrided_dtyperm   _compute_stagesc_dtyper_   r`   ra   archrY   rh   ri   epi_c_stager5   sched_stage_make_smem_layoutsd_layoutc_layoutrj   rk   rl   epi_c_smem_layout_staged)rp   ry   atom_npermutation_nmma_inst_shape_kmma_inst_tile_kr-   r-   r.   _setup_attributes  s~   	
zGemmSm90._setup_attributesmAmBmDmCscheduler_argsvarlen_argsstreamc	                    s  |j _|j _|dur|j nd_|dur|j nd_t|_t|_|dur1t|nd_	|dur=t|nd_
tjjdkoLjjkrZtdj dj tjjjjkrrtdjj djj tjjdko~jjdkrtdt|du rt }|jdujksJ |jdu}	|jdu}
dtjfd	d
fdd||fD \}}| tjd}tjd}d\}}tj r|
rjstj|ddn||jd jd fjd \}}|
rtj|ddn||jd jd fjd \}}tj|_tj r- jtj|7  _d\}}t|dur\j |	rEtj|dddn|j!j"t#|drT|j$sVdndd\}}d\}}t|durtj |j%j"dd\}}&|t'(|}j)|	d}*||||||}|(|}|+||j,}|durt-j!nd|durt-j%nd tj.G  fddd}|_/0j1|tj r|n||||||||j2jjj!j%||j3|j4ddgj|dd dS )a  Execute the GEMM operation in steps:
        - Setup static attributes
        - Setup TMA load/store atoms and tensors
        - Compute grid size
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param mA: Input tensor A
        :type mA: cute.Tensor
        :param mB: Input tensor B
        :type mB: cute.Tensor
        :param mD: Output tensor D
        :type mD: cute.Tensor
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        NrB   zType mismatch: z != zType width mismatch: r9   z#a_dtype should be float16 or float8tc                    s   t  fdd jD S )Nc                 3   s4    | ]}t |st j|d  jj dn|V  qdS )r<   )divbyN)r   	is_staticassumeelement_typerP   ).0sr   r-   r.   	<genexpr>  s
    "
z8GemmSm90.__call__.<locals>.new_stride.<locals>.<genexpr>)tuplestrider   r-   r   r.   
new_stride  s   z%GemmSm90.__call__.<locals>.new_stridec              	      s6   g | ]}|d urt |jt j|j |dnd qS )Nr   )r   make_tensoriteratorr   shape)r   r   )r   r-   r.   
<listcomp>  s    z%GemmSm90.__call__.<locals>.<listcomp>NNr   NNr:   )
ragged_dimr   rD   T)r   	ptr_shiftadd_to_outputstoreadd)op_typeloadvarlen_mc                       sF  e Zd ZU ejjejjd f e	d< ejjejj
d f e	d< ejjejjd f e	d< ejjejd f e	d< ejjejjjdurJjnef jf e	d< ejjejjjdurcjne f jf e	d	< e	d
< ejjejjjejf jf e	d< ejjejjjejf jf e	d< dS )z(GemmSm90.__call__.<locals>.SharedStoragerD   ab_pipeline_array_ptrepi_pipeline_array_ptrsched_pipeline_array_ptrrG   
sched_dataNsDsCepisAsB)r!   r"   r#   r   structMemRanger_   Int64rh   __annotations__r   r   r   Alignr   ro   r   epi_get_smem_structr2   cosizerj   r   rk   r-   )epi_c_smem_sizeepi_smem_sizeepilogue_paramsrp   r-   r.   SharedStorage  s<   
 r   )gridblockclusterr   min_blocks_per_mp)5r   r2   r   r   r   r   from_tensorr   r   r   r   r   rP   	TypeErrorr   mAIdxr8   mCuSeqlensMmCuSeqlensKr   Tensorr   slice_rj   rk   _make_tma_atoms_and_tensors
copy_utilscreate_ragged_tensor_for_tmarR   r4   size_in_bytesnum_tma_load_bytes_make_tma_epi_atoms_and_tensorsrl   rm   hasattrr   r   epi_to_underlying_argumentsr   to_underlying_argumentsget_scheduler_classget_scheduler_argumentsget_grid_shapemax_active_clustersr   r   rn   kernelr   r   launchr^   )rp   r   r   r   r   ry   r   r   r   r   varlen_ka_smem_layoutb_smem_layout
tma_atom_atma_tensor_a
tma_atom_btma_tensor_b
tma_atom_dtma_tensor_d
tma_atom_ctma_tensor_cvarlen_paramsTileSchedulerClstile_sched_argstile_sched_paramsr   r   r-   )r   r   r   r   rp   r.   __call__]  s   










zGemmSm90.__call__r   r   mA_mklr   mB_nklr   mD_mnlr   mC_mnlr   r   r   r   r   epi_smem_layoutepi_c_smem_layoutr   r   c           [      C   s@  t |jdu}t |jdu}|r|rJ t | jr|s|sJ t |du}t |	du}tjtj }|| jkrM||||fD ]}t |durLt	
| q?tj }|| j}| j|tdg|jR |j d}d}t |r| jt|d|j d}d}d}t | jr| j||j |d}|jd| jf}t| jdd d	d
 |j j|j!|j"d}|j#j|j!|j"d} d}!t |r|j$j|j!|j"d}!d}"t |r|j%j|j!|j"d}"| &|
|}#t'j(|t)|s|j*du r|jd n|j*jd t)|jd d}$t+|j(|||}t,| jdd d || jkrtj-| j. || jkr|| j| j/ k r| j/dkp8|| jk}%tjtj0 }&|1|&}'tj2||'dd}(tj2||'dd})| j3r]|(nd}(| j4re|)nd})| j/dkpq|| jk}*t t5|dkr|*otj0 dk}*| }+|+6 },t7t8j9j:| j;}-|,j<r|,j=}.|.d }/t | j r|$>||/}0t?|0t@| jAddg|.d df}1n9|$B|/}2t |rt?|2| jAd f|.d f}3|}0n|sJ tC|2| jAd f}3t?|| jAd f|.d df}0t?|$D||/t@| jAddg|.d df}4|$E|/}5|$F|/}6d}7t | j r;tGjH||'d tt|dj|1||(d\}7}8}8nS| I|jJ| jK| j/d }9tjL d tjjM| j  }:|9N|:};d\}7}<t |rxtGjO|;|0||3|5|.d | jAd   |6d}7ntGjP|;|0||3|5|.d | jAd   |6d\}7}<tGjH||'d tt|dj|4| |)d\}=}8}8tQ|6| jAd }>t | j r| R||-|7|=|>}-n| jS||-|7|<|=|>|d}-|+jT|*d |+U },|,j<st | jVo| r|*r|+W|, |+U },|X|- |*r|+X  || jk rtjY| jZ t[| jV r|dkp | jVo |dkp |dk}%tjL \}:}8}8tj|:| j\ }?t | jVr>|:| j\ }:tjt | jV rJ| j]nd| j\d}@|N|@| jVsY|?nd}At^_|A| jA|| \}B}C}Dd}Et | j`ryta|Bj| jb}Et+t^jc||B|C|D}Ft | jVr|?dkr| jdddd | jdddd tQ|jd | jAd }Gt5tQ| jAdd | je}Ht7t8j9jf| j;}I| g }Jt7t8j9jf| jh}Kt7t8j9j:| jh}L| }+|+6 },t | jVr|dkr|Ki|H |Li|H t | r|Ii|G n|$jF|,j=d d}6tQ|6| jAd }>|Ii|> |+T  |+U },|,j<r|,j=}.|.d }/|$F|/}6tQ|6| jAd }>| j||I|F|B|E|>|?}It |rL|>dkrL|Bkd t | jVrX| l|?d t8jmtntojp| jqtjjM d }Md}Nt |r| r||$s||/| jAdd | je|!|.\}N}8}8d}Ot |r| r||$s|	|/| jAdd | je|"|.\}P}8}8tGt|P|}O| judur| juntjv}Q| w|| jx|Q|!|:\}R}S}TtC|B|Sjy}Ut+| jz|U}Vt |r| {|| j|| j}|"|Sjy|:\}W}X}Y}Znd!\}W}Z}X}Y| ~|
|B||.|: | |
|#||J|K|L| je|V|S|Xd|R|T|W|Y|Z|N|O|.|$|M|+|:|%\}K}Lt | jVr*|%r!|JX  | jdd|? dd" t | jV r:|+T  |+U },nL|Ki|H |Li|H t | r[|Ii|G |+jT| j]d# |+U },n+|+T  |+U },|,j<r|$jF|,j=d d}6tQ|6| jAd }>|Ii|> |+T  |+U },|,j<st | jV r|%r|JX  dS dS dS dS )$a7  
        GPU device kernel performing the batched GEMM computation.

        :param tma_atom_a: TMA copy atom for A tensor
        :type tma_atom_a: cute.CopyAtom
        :param mA_mkl: Input tensor A
        :type mA_mkl: cute.Tensor
        :param tma_atom_b: TMA copy atom for B tensor
        :type tma_atom_b: cute.CopyAtom
        :param mB_nkl: Input tensor B
        :type mB_nkl: cute.Tensor
        :param tma_atom_d: TMA copy atom for D tensor
        :type tma_atom_d: cute.CopyAtom
        :param mD_mnl: Output tensor D
        :type mD_mnl: cute.Tensor
        :param tiled_mma: Tiled MMA object
        :type tiled_mma: cute.TiledMma
        :param cluster_layout_mnk: CTA layout
        :type cluster_layout_mnk: cute.Layout
        :param a_smem_layout: Shared memory layout for A
        :type a_smem_layout: cute.ComposedLayout
        :param b_smem_layout: Shared memory layout for B
        :type b_smem_layout: cute.ComposedLayout
        :param epi_smem_layout: Shared memory layout for epilogue
        :type epi_smem_layout: cute.ComposedLayout
        Nr:   )r   cluster_layout_vmnkab_pipeline_mbar_ptrr   )c_smem_layoutepi_pipeline_mbar_ptr)sched_pipeline_mbar_ptrr   rG   T)cluster_shape_mn
is_relaxed)swizzler   )len_m_staticlen_k_static)r  r~   rE   rD   )r   Nr   )	cta_coord
cta_layout
src_tensor
dst_tensor
mcast_maskrA   r   )limit_mlimit_k)Nr   r   r   )is_scheduler_warpr   mma)warp_group_idxstager   )	batch_idxg        )
barrier_idnum_threads)NNNNr  )advance_count)r   cu_seqlens_mcu_seqlens_kr8   r   r   make_warp_uniformwarp_idxre   r   prefetch_descriptorr_   r`   SmemAllocatorallocatern   make_ab_pipeliner   r   r   data_ptrmake_epi_pipeliner   r   r6   make_sched_pipeliner   r   
get_tensorr   r	   r4   r   outerinnerr   r   r   epi_get_smem_tensorsr   creater   r   r   r
   setmaxregister_decreaserf   rd   block_idx_in_clusterget_flat_coordmake_layout_image_maskrW   rX   r   initial_work_tile_infor   pipelinePipelineUserTypeProducerrh   is_valid_tiletile_idxoffset_batch_A
local_tileselectrR   offset_batch_AIdxflat_divideoffset_batch_Blen_mlen_kr   tma_get_copy_fn_make_gmem_tiled_copy_Ar   r   
thread_idx	WARP_SIZE	get_slicegather_m_get_copy_fngather_k_get_copy_fnceil_divload_ABload_AB_gather_Aadvance_to_next_workget_current_workr5   write_work_tile_to_smemproducer_tailsetmaxregister_increaserg   r   r]   r\   quack_sm90_utilspartition_fragment_ABCrQ   make_rmem_tensorr1   
gemm_w_idxpingpong_barrier_arriverm   Consumermake_epi_store_pipeliner   advance_itersr  fillpingpong_barrier_syncNamedBarrierintr    r&   rc   epilog_gmem_copy_and_partitionoffset_batch_epitma_producer_copy_fnr   BFloat16epilog_smem_store_and_partitionr   layoutepi_load_acc_subtileepilog_smem_load_and_partitionr   r   epi_visit_accepilogue)[rp   r   r   r  r   r  r   r  r   r  r   r   r   r   r   r  r  r   r   r   r   has_Dhas_Cr&  tma_atomsmemstorageab_pipelineepi_pipelinesched_pipeliner   r   r   r   r   epi_smem_tensorsvarlen_manageris_tma_warpcta_rank_in_clusterblock_in_cluster_coord_mnka_mcast_maskb_mcast_maskr  tile_scheduler	work_tileab_producer_statetile_coord_mnklr  mA_mkgA_mkmAIdx_mkgAIdxgB_nkrC  rD  copy_A_tiled_copy_Atidx
thr_copy_A
prefetch_Acopy_B
k_tile_cntr  warp_group_thread_layoutthr_mmaacctCrAtCrBacc_slowmma_fnk_tile_cnt_static
c_tile_cntab_read_stateepi_store_pipelineepi_read_stateepi_producer_stateepilogue_barriercopy_Dcopy_C	copy_C_fnd_dtype_for_layouttiled_copy_r2stRS_rDtRS_sDtRS_rAccload_acc_subtiletiled_copy_s2rtRS_rCtSR_rCtSR_sCr-   r-   r.   r     s  1





















	
	

	i





























v  4zGemmSm90.kernelNro  r{  r  r  r  copy_SFAcopy_SFBreturnc                 C   s   t |d u}t |r|d usJ td}	d|k r||}	tj|ddD ]N}
|||	 ||}|j}t |d urA||
||d ||
||d t |rZ||
||d ||
||d || |	  td}	|
d |k rr||}	q$|S )NTr   r:   unrolltma_bar_ptr)
r   r   producer_try_acquirer_   rangeproducer_acquireproducer_get_barrierindexproducer_commitadvance)rp   ro  r{  r  r  r  r  r  blockscaledpeek_ab_empty_statusk_tiler  smem_idxr-   r-   r.   rM    s.   



zGemmSm90.load_ABr  r   c                 C   s  t jt j }td}	d|k r||}	tj|d ddD ]R}
d}t|d ur.||
f}|| j	|
| j
  k}|||	| |j}|rP||}||
||d ||
|g|R   || |  td}	|
d |k rq||}	qd|k r|d }
d}t|d ur||
ddf}|| j	t|r|
| j
 nd k}|||	| |j}|r||}||
||d ||
|g|R ddi || |  |S )	NTr   r:   r  r-   r  )predr  )r   r   r%  r&  r   r  r_   r  r   re   rd   r  r  r  producer_cpasync_commitr  )rp   ro  r{  r  r  r  r  r   r&  r  r  prefetch_outrt  r  r  r-   r-   r.   rN     sL   






zGemmSm90.load_AB_gather_Ar  r  r  r  r  c                 C   s   d}|  }	t||}
t| jr| j|dd td}d|k r$||}td}t|
D ](}|	|| ||j
|j
|d td}|  td}|d |k rU||}q-t| jrgtd ||  tj|
|ddD ]V}|	|| t| jrtd}||j
|j
|d td}t| j rt| ntd || |   ||	 |  |	  td}|d |k r||}qot| jr| jd| dd t| j rtd tj|
ddD ]}||	 |	  qt| jr||  |S )	Nr:   r  r!  Tr   )A_idxB_idx	zero_initFr  )cloneminr   r5   r]  r   consumer_try_waitr_   r  consumer_waitr  r  rQ   r   
wait_groupr   r   consumer_releaserX  )rp   ro  r  r  r  r  r  r  k_pipe_mmasab_release_statenum_prologue_mmapeek_ab_full_statusr  r  r-   r-   r.   r  9  s`   














zGemmSm90.mmaparamsrr  .rp  r  r  r  rm   r  r  r  tiled_copy_t2rr  r  r  r  r  r  r  r|  rs  r  r  rt  c           &      C   s  t |
d u}t |d u}tt| jd d |jd }tj|dd}t|}|j| }| 	|||||||||	}t |d urjt
jt|| jddD ]} || }!|re|| ||!|d || |  qLt
|D ]} || }"||	|  | |||"}#t |r|| t||d d d |jf | tj  tj  tj  || W d    n1 sw   Y  |  t |d uo| | j |k r|| | j }!|r|| ||!|d || |  | ||#|	|
}$|r|  |  ||  | j }%t |rt ||	|d d d |%f  tj  |  |r2t |r.||%|"d |  qo| !|||||||| ||fS )NrD   r:   )r:   r   r{   r  )src_idxproducer_state)r  dst_idx)"r   r   zipped_divider   rR   r   r   r   num_tiles_executed	epi_beginr_   r  r  r   get_hier_coordr  r  r  range_constexprepi_begin_loopr  copyr  r   fence_view_async_shared	sync_warp	elect_oner  epi_visit_subtilearrive_and_waitri   r   cvt_copyepi_end)&rp   r  rr  rp  r  r  r  rm   r  r  r  r  r  r  r  r  r  r  r  r|  rs  r  ry  r  rt  rk  rj  epi_tile_shapeepi_tile_layoutepi_tile_numnum_prev_subtilesepi_tensorsepi_idxgmem_coord_C
gmem_coordepi_loop_tensorstRS_rEpi
epi_bufferr-   r-   r.   ri  ~  s   















zGemmSm90.epiloguec                 C   s   |st S tS )zPReturn the scheduler class to use. Override in subclasses for custom schedulers.)r   r   )rp   r   r-   r-   r.   r     s   zGemmSm90.get_scheduler_classc                 C   sr  t | j r
tj}nt | jdkrtj}nt |jdur tj}ntj}t |j	du rp|dur3|j
d n|jdu r=|j
d n|jj
d d }t|j
d | jd t|j
d | jd |f}	t|	|j|j| j|j|j|d}
|
S |dus~|jdus~| jr~J dt|j
d | jd |j	j
d d f}	t|	|dur|j
d n|jj
d |j	|j|j| jdd | j|j|d	}
|
S )zICreate scheduler arguments. Override in subclasses for custom schedulers.d   NrD   r   r:   )problem_shape_ntile_mnlraster_order
group_sizer4   tile_count_semaphorebatch_idx_permutepersistence_mode)	r  total_mr#  r  r  r3   r4   r  r  )r   r6   r   NONEr   CLCr  DYNAMICSTATICr   r   r   r   rL  rR   r   r  max_swizzle_sizer4   r  mPostActr8   r   r   )rp   r   r   r   r   r   ry   r  num_problemsr  r   r-   r-   r.   r     sZ   




z GemmSm90.get_scheduler_argumentsr  r  c                 C   s   t |d d d |f | d S N)r   autovec_copy)rp   r  r  r  r-   r-   r.   rf  0  s   zGemmSm90.epi_load_acc_subtilec
           
      C      dS Nr-   r-   )
rp   r  rr  rm   r  r  r|  rs  r  r  r-   r-   r.   r  4  s   zGemmSm90.epi_beginr  	epi_coordc                 C   r  r  r-   )rp   r  r  r  r-   r-   r.   r  C  s   zGemmSm90.epi_begin_loopr  c                 C      d S r  r-   )rp   r  r  r  r  r-   r-   r.   r  H  s   zGemmSm90.epi_visit_subtilec                 C   r  r  r-   )rp   r  r  r   r|  r  r-   r-   r.   rh  Q  s   zGemmSm90.epi_visit_accc	           	      C   r  r  r-   )	rp   r  r  rm   r  r  r|  rs  r  r-   r-   r.   r  [  s   zGemmSm90.epi_end)locipargsc                C   s   |   S r  )EpilogueParams)rp   r  r  r  r-   r-   r.   r   i  s   z$GemmSm90.epi_to_underlying_argumentsc                C   s   g S )zSubclasses can override thisr-   )rp   r  r  r  r-   r-   r.   epi_get_tma_atomsn  s   zGemmSm90.epi_get_tma_atomsrR   c                 C   r  Nr   r-   )r  rR   rm   r-   r-   r.   epi_smem_bytes_per_staget  s   z!GemmSm90.epi_smem_bytes_per_stagec                 C   s   t jjtdf S r  )r   r   r   r   )rp   r  r-   r-   r.   r   |  s   zGemmSm90.epi_get_smem_structc                 C   s   t  S r  )r   )rp   r  rn  r-   r-   r.   r1    s   zGemmSm90.epi_get_smem_tensorsr  r  r   c                 C   B   |dv sJ |dkrt jnt j}tjjt|| d| j d d S Nr  r  rD   )r  number_of_threads)r    r(   r*   r   r   barrierr_  r]   rp   r  r  r  r-   r-   r.   r]       

zGemmSm90.pingpong_barrier_syncc                 C   r  r  )r    r(   r*   r   r   barrier_arriver_  r]   r  r-   r-   r.   rX    r  z GemmSm90.pingpong_barrier_arrivec                 C   sP   t tj| jd ur| j nd| jd d dkrdnddt}t ||}|S )NFr:   rB   r   rG   rD   )num_matrices)	r   make_copy_atomr   StMatrix8x8x16bOpr   is_m_major_crm   r   make_tiled_copy_C_atom)rp   r   copy_atom_Ctiled_copy_C_atomr-   r-   r.   epilog_smem_copy_atom  s   zGemmSm90.epilog_smem_copy_atomr   dtyper   c                 C   s   |d u rt j}| |}tj||| jd}t||}||}	|d ur)|		|nd }
|d ur6|j
d d n| j}|	t|j
}t|| j}|||
fS )N)	elem_ty_delem_ty_accrD   )r   	ROW_MAJORr  r   sm90_get_smem_store_opr1   r   make_tiled_copy_SrI  partition_Dr   rm   partition_Smake_identity_tensorrV  )rp   r   r   r  r   r  r  copy_atom_r2sr  thr_copy_r2sr  sD_shapetRS_rD_shaper  r-   r-   r.   rd    s   


z(GemmSm90.epilog_smem_store_and_partitionr   r   tRS_rD_layoutc                 C   sX   |  |}t||}t||}	|	|}
|
|}t||}|
|}|	|||fS r  )	r  r   sm90_get_smem_load_opr   r  rI  r  rV  retile)rp   r   r   r  r   r  r  r  copy_atom_s2rr  thr_copy_s2rr  r  r  r-   r-   r.   rg    s   
	


z'GemmSm90.epilog_smem_load_and_partitionatommD_mnc                 C   sh   t |||d d }t ||}t|jtjtjf}	|	r!||fn||f\}
}tj	|dt 
d|
|dS )NrD   r   r:   )r  r  r  r  )r   r>  r  
isinstancer   r   CopyBulkTensorTileS2GOpCopyReduceBulkTensorTileS2GOpr   rE  r   )rp   r#  r$  r3   rm   r   r|  gDtDgD_for_tma_partitionis_s2gr  r  r-   r-   r.   r`    s   
z'GemmSm90.epilog_gmem_copy_and_partitionr  r	  c           
   	   C   s   t | j rdnd| jd  }ttjj|}| j| j d }||j	 t
jj }ttjj|}| js6tjnt}	|	j|| j||| j|ddS )Nr:   rA   T)barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk
defer_sync)r   r8   rd   r8  CooperativeGroupAgentThreadrU   rV   r   r   r   rH  PipelineTmaAsyncr   r2  rh   r   )
rp   r   r  r	  producer_cntab_pipeline_producer_group
mcast_sizeconsumer_arrive_cntab_pipeline_consumer_grouppipeline_clsr-   r-   r.   r*    s"   zGemmSm90.make_ab_pipeliner
  r  c                 C   sL   t t jj}| j}t t jj|}t| j|}t jj	|| j
|||ddS )NT)r+  r,  r-  r.  r/  r1  )r8  r2  r3  r4  rc   r   r   r   r5  r2  r   )rp   r
  r  epi_pipeline_producer_groupr9  epi_pipeline_consumer_grouptma_copy_c_bytesr-   r-   r.   r,    s   zGemmSm90.make_epi_pipelinec                 C   s0   | j tjj }ttjj|}tjj	| j
|dS )N)r,  r-  )rc   r   r   rH  r8  r2  r3  r4  PipelineTmaStorer2  ri   )rp   num_epi_threadsepi_store_producer_groupr-   r-   r.   rZ    s
   z GemmSm90.make_epi_store_pipeliner  r   c                 C   sz   t t jj}t|}| jr|r| jndd | j | }t t jj|}t j	j
|| j||t|dkr8d ddS dddS )Nr:   rG   r   T)r+  r,  r-  r.  consumer_maskr1  )r8  r2  r3  r4  r   r   r5   r\   rd   PipelineAsyncr2  r   r   )rp   r   r  r   sched_pipeline_producer_groupcluster_sizer9  sched_pipeline_consumer_groupr-   r-   r.   r-    s*   
zGemmSm90.make_sched_pipeliner   r   r   rb   rY   c
                 C   s  |d dkrdnd}
|durt ||j d nd}|| ||| }||
 }|du r-dn	|d dkr5dnd}|durI|t ||j d | 7 }t |d}t |d	}t ||j d t ||j d  }d
}||	 | | }|| }|dkr|
|||  | 7 }
||
|fS )a  Computes the number of stages for A/B/C operands based on heuristics.

        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
                 (A/B operand stages, epilogue stages)
        :rtype: Tuple[int, int]
        r:   rB   rG   rD   Nr9   r   Nr   Nr   NNrO   )r   r   rP   r   r   )clsrR   rm   r2   r   r   r   ry   rb   rY   ri   d_bytes_per_stageepi_bytes_per_stage	epi_bytesr   a_shapeb_shapeab_bytes_per_stagembar_helpers_bytesremaining_bytesrh   r-   r-   r.   r   2  s&     &
zGemmSm90._compute_stagesrT   r   epi_tile_overridec                 C   s  |dur|S | d d dkr0|d dkr0t dtj| dgd}t dtj| dgd}||fS | d d dkrZ|d dkrZt dtj| dgd}t dtj| dgd}||fS |dure|jdkred	nd}t d	tj| dgd}t |tj| dgd}||fS )
aV  Compute the epilogue tile shape or use override if provided.

        :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param element_type: Data type of elements
        :type element_type: type[cutlass.Numeric]
        :param is_cooperative: Whether to use cooperative approach
        :type is_cooperative: bool
        :param epi_tile_override: Optional override for epilogue tile shape
        :type epi_tile_override: Tuple[int, int] or None

        :return: Computed epilogue tile shape
        :rtype: Tuple[int, int]
        Nr   r<   r:   r~   rA   r=   r9   r;   )rZ   gcdr   r   rP   )rR   rT   r   rR  tile_mtile_nn_perfr-   r-   r.   r   l  s   z-GemmSm90._sm90_compute_tile_shape_or_overrider   r   rh   ri   r   c                 C   s  t | d}| tjjk}| tjjk}| |rdnd }tt||||}t j	|t 
|||r5dndd}t | d}| |rDdnd }tt||||}t j	|t 
|||r^dndd}d	}|d	urpt||||	}d	}|
d	ur|d	us|J t|
|||}||||fS )
ag  Create shared memory layouts for A, B, and C tensors.

        :param cta_tile_shape_mnk: CTA tile shape (M,N,K)
        :type cta_tile_shape_mnk: Tuple[int, int, int]
        :param epi_tile: Epilogue tile shape
        :type epi_tile: Tuple[int, int]
        :param a_dtype: Data type for matrix A
        :type a_dtype: type[cutlass.Numeric]
        :param a_layout: Layout enum for matrix A
        :type a_layout: LayoutEnum
        :param b_dtype: Data type for matrix B
        :type b_dtype: type[cutlass.Numeric]
        :param b_layout: Layout enum for matrix B
        :type b_layout: LayoutEnum
        :param ab_stage: Number of stages for A/B tensors
        :type ab_stage: int
        :param d_dtype: Data type for output matrix D
        :type d_dtype: type[cutlass.Numeric]
        :param d_layout: Layout enum for the output matrix C
        :type d_layout: LayoutEnum
        :param epi_stage: Number of epilogue stages
        :type epi_stage: int

        :return: Tuple of shared memory layouts for A, B, and C
        :rtype: Tuple[cute.ComposedLayout, cute.ComposedLayout, cute.ComposedLayout]
        rG  rD   r   )r   r:   rD   )r:   r   rD   r{   rH  r:   N)r   r   r   r   OperandMajorModeKmake_smem_layout_atomr   get_smem_layout_atomtile_to_shapeappendrT  make_smem_layout_epi)rR   rm   r2   r   r   r   rh   r   r   ri   r   r   r   a_smem_shapea_is_k_majorb_is_k_majora_major_mode_sizea_smem_layout_atomrj   b_smem_shapeb_major_mode_sizeb_smem_layout_atomrk   rl   r   r-   r-   r.   r     sP   ,



zGemmSm90._make_smem_layoutstensor_drl   r   )r   r   r   c           	      C   sv   |dv sJ t |d}t t | j|}|dkrt n|dkr&t ntt j	j
}t|| ||\}}||fS )a  Create TMA atoms and tensors for storing D or loading C.

        :param tensor_d: Output tensor D
        :type tensor_d: cute.Tensor
        :param epi_smem_layout_staged: Shared memory layout for epilogue
        :type epi_smem_layout_staged: cute.ComposedLayout
        :param epi_tile: Epilogue tile shape
        :type epi_tile: Tuple[int, int]

        :return: TMA atom and tensor for C
        :rtype: Tuple[cute.CopyAtom, cute.Tensor]
        )r   r   r   r   r   r   )r   r   compositionmake_identity_layoutr   r   CopyBulkTensorTileG2SOpr&  r'  ReductionOpADDmake_tiled_tma_atom)	rf  rl   rm   r   r  d_cta_v_layoutr   r   r   r-   r-   r.   r     s   z(GemmSm90._make_tma_epi_atoms_and_tensorstensorsmem_layout	smem_tile	mcast_dimc                 C   s8   |dkrt  nt  }t j|| |||d\}}||fS )a  Create TMA atoms and tensors for input tensors.

        :param tensor: Input tensor (A or B)
        :type tensor: cute.Tensor
        :param smem_layout: Shared memory layout for the tensor
        :type smem_layout: cute.ComposedLayout
        :param smem_tile: Shared memory tile shape
        :type smem_tile: Tuple[int, int]
        :param mcast_dim: Multicast dimension
        :type mcast_dim: int

        :return: TMA atom and tensor
        :rtype: Tuple[cute.CopyAtom, cute.Tensor]
        r:   )num_multicast)r   ri   CopyBulkTensorTileG2SMulticastOprl  )rn  ro  rp  rq  r   rl  
tma_tensorr-   r-   r.   r     s   
z$GemmSm90._make_tma_atoms_and_tensorsr<   c                 C   s   t jtjtjjd||d}||j }d| }t | jd | }||kr+t	
||}t j|| |f|dfd}	|tjkr^t | jd | }
|
|krQt	
|
|}
t j|
||
 fd|
fd}	|tjkrjt d|fnt |df}t ||	|S )N)
cache_mode)num_bits_per_copyrO   rD   r:   r   r   )r   r
  r   	CopyG2SOpLoadCacheModeGLOBALrP   r   rR   rZ   rS  r   r   r  make_tiled_copy_tv)rp   r  
major_moder   	copy_bitsatom_async_copy
copy_elemsloads_per_cache_lineshape_dim_1thread_layoutshape_dim_0value_layoutr-   r-   r.   rF  6  s2   


z GemmSm90._make_gmem_tiled_copy_Aa_majorb_majorc                 C   s   d}| t tjtjtjhvrd}|t tjtjtjhvrd}|tt hvr$d}|dtt tjtjtjhvr3d}| jdkr>| |kr>d}| j|jkrFd}| jdkrO|dksX|jdkrZ|dkrZd}|S )a  
        Check if the dtypes are valid

        :param a_dtype: The data type of tensor A
        :type a_dtype: Type[cutlass.Numeric]
        :param b_dtype: The data type of tensor B
        :type b_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param d_dtype: The data type of the output tensor
        :type d_dtype: Type[cutlass.Numeric]
        :param a_major: major mode of tensor A
        :type a_major: str
        :param b_major: major mode of tensor B
        :type b_major: str

        :return: True if the dtypes are valid, False otherwise
        :rtype: bool
        TFNrB   r9   k)r   r_   rc  Float8E4M3FN
Float8E5M2r   rP   )r2   r   r1   r   r  r  is_validr-   r-   r.   is_valid_dtypesT  s.   $zGemmSm90.is_valid_dtypes)FTFFr   )T)Fr  )r<   )Xr!   r"   r#   __doc__r   r   EpilogueArgumentsr   r  r   r_   Numericr   r_  boolrx   r   r   jitr   r   r   r   r   cudaCUstreamr  r   TiledMmaCopyAtomr   ParamsLayoutComposedLayout	Constexprr   r8  rC  PipelineStater   rM  rN  r  Tile	TiledCopyThrCopyCoordr^  r   ri  r   r   rf  r  r  r  rh  r  r   listr  staticmethodr   r   r1  r   r]  rX  r  r   rd  rg  r   r`  Pointerr*  r,  rZ  r-  classmethodr   r   r   r   r   rF  strr  r-   r-   r-   r.   r/   X   s   $
	
 Y	 <	
   A		
'		8	D
	
n
?	



	












	

9
(
	
[
!

"r/   )<r$   typingr   r   r   r   r   r   	functoolsr   rZ   cuda.bindings.driverbindingsdriverr  r_   cutlass.cuter   cutlass.pipeliner8  r	   r
   cutlass.cute.nvgpur   r   r   cutlass.utils.hopper_helpersr`   hopper_helpersr   r   r   r   r   r   cutlass.utilsr   quack.cute_dsl_utilsr   r   quack.tile_schedulerr   r   r   r   r   r   quack.varlen_utilsr   r   quack.pipeliner   r   quack.copy_utilsr   quack.sm90_utilsrT  IntEnumr    r/   r-   r-   r-   r.   <module>   s,     &