o
    پitG                    @   sJ  d dl mZmZmZmZ d dlZd dlmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlm  mZ d dlm  mZ d dlmZmZ d dlmZmZmZmZmZmZmZ ddeddfddZd d	l m!Z! d d
l"m#Z# d dl$m%Z%m&Z&m'Z' ej(ddddededede)ddf
ddZ*	ddededdfddZ+	 G dd dZ,dS )    )OptionalTupleTypeUnionN)cpasynctcgen05)PointerInt32Float16BFloat16Float32Float8E4M3FN
Float8E5M2lock_ptrreturnc                 C   s   t j| ||d dS )zE
    arrive a spin lock when the lock_ptr is a multimem address.
    locipN)distributedmultimem_red_relaxed_gpu_add1)r   r   r    r   _/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/cute_dsl/gemm_allreduce_two_shot.pyspin_lock_multimem_arrive   s   r   )nvvm)T)MemOrderKindMemScopeKindAtomicOpKindr   expected_val	reset_valscopec                C   s   |dkr4d}||kr2t jt tj| jt|j||dt|j||dt	j
tj||d	}||ks
dS dS |dkrfd}||krht jt tj| jt|j||dt|j||dt	j
tj||d	}||ks>dS dS dS )z|
    wait on a spin lock until the expected count is reached. Reset flag to reset_val if the expected count is reached.
    gpur   r   )b	mem_order	syncscoper   r   sysN)r   	atomicrmwr   i32r   CASllvm_ptrr	   ir_valuer   ACQUIREr   GPUSYS)r   r   r   r    r   r   resultr   r   r   spin_lock_atom_cas_acquire_wait*   s@   r/   barrier
barrier_mcc                 C   sz   t j \}}}t j \}}	}
|||  || |	  }tj|| ||d t jt jjj t	| | |dd||d dS )z'
    barrier for inter-gpu sm-wise
    r   r   r%   )r   r   r    r   r   N)
cutearch	block_idxgrid_dimr   multimem_red_release_sys_add1fence_proxy	ProxyKindaliasr/   )r0   r1   	num_ranksr   r   bidxbidybidzbdimxbdimy_pidr   r   r   "sm_wise_inter_gpu_multimem_barrierU   s   
rB   c                #   @   s  e Zd ZdZ		d\deej dedee	e	f dee	e	f def
d	d
Z
dd Zdd Zejdd dddfdejdejdejdejdejdejdejdejfddZejdejdejdejdejd ejd!eej d"ejd#ejd$ejd%ejd&eejejdf d'ejd(ejdejdejdejdejf"d)d*Zd+ejd,ejd-ejd'ejdeejef d.eejejejf fd/d0Zd1ejd2ejd+ejd3ejd.eejejejf f
d4d5Z d+ejd6eejejf d-ejd'ejd3ejd.eejejejf fd7d8Z!e"dejd9ee	e	e	f d:eej d;eej d'ejd<eej d=ej#d>e	d?e	ded.ee	e	e	f fd@dAZ$e"dejdBee	e	e	f dee	e	f dejd.eejee	e	e	f f f
dCdDZ%e"dejdEee	e	e	f dFe	d.e	fdGdHZ&e"	d]dIeej deej d<eej dJe'd.ef
dKdLZ(e"dedee	e	f dee	e	f d.efdMdNZ)e"	d]dOe	dPe	dQe	dRe	dIeej d<eej dSe'dTe'dUe'dJe'd.efdVdWZ*e"dededOe	dPe	dee	e	f d.efdXdYZ+e"	d]dIeej deej d<eej dedee	e	f dee	e	f dedOe	dPe	dQe	dRe	dSe'dTe'dUe'dJe'd.ef dZd[Z,dS )^PersistentDenseGemmKernelag	  This class implements batched matrix multiplication (C = A x B) with support for various data types
    and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

    :param acc_dtype: Data type for accumulation during computation
    :type acc_dtype: type[cutlass.Numeric]
    :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation
    :type use_2cta_instrs: bool
    :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
    :type mma_tiler_mn: Tuple[int, int]
    :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
    :type cluster_shape_mn: Tuple[int, int]
    :param use_tma_store: Whether to use Tensor Memory Access (TMA) for storing results
    :type use_tma_store: bool
    :param all_reduce: All-reduce mode, can be "none", "two_shot"
    :type all_reduce: str

    :note: In current version, A and B tensor must have the same data type
        - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported

    :note: Supported A/B data types:
        - TFloat32
        - Float16/BFloat16
        - Int8/Uint8
        - Float8E4M3FN/Float8E5M2

    :note: Supported accumulator data types:
        - Float32 (for all floating point A/B data types)
        - Float16 (only for fp16 and fp8 A/B data types)
        - Int32 (only for uint8/int8 A/B data types)

    :note: Supported C data types:
        - Float32 (for float32 and int32 accumulator data types)
        - Int32 (for float32 and int32 accumulator data types)
        - Float16/BFloat16 (for fp16 and fp8 accumulator data types)
        - Int8/Uint8 (for uint8/int8 accumulator data types)
        - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types)

    :note: Constraints:
        - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True)
        - MMA tiler N must be 32-256, step 32
        - Cluster shape M must be multiple of 2 if use_2cta_instrs=True
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 16

    Example:
        >>> gemm = PersistentDenseGemmKernel(
        ...     acc_dtype=cutlass.Float32,
        ...     use_2cta_instrs=True,
        ...     mma_tiler_mn=(128, 128),
        ...     cluster_shape_mn=(2, 2)
        ... )
        >>> gemm(a_tensor, b_tensor, c_tensor, max_active_clusters, stream)
    nonesm_100	acc_dtypeuse_2cta_instrsmma_tiler_mncluster_shape_mnuse_tma_storec                 C   s   || _ || _|| _|| _g |dR | _|| _|rtjjntjj	| _
|| _d| _d| _d| _d| _d| _d| _|dkrA|| _d| _dt| j| jg| j| jR  | _d	| _d| _d
| _d| _t|| _d| _d	| _|dkr}tj | _tj | _dS dS )ar  Initializes the configuration for a Blackwell dense GEMM kernel.

        This configuration includes several key aspects:

        1.  MMA Instruction Settings (tcgen05):
            - acc_dtype: Data types for MMA accumulator.
            - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler.
            - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant
              with cta_group=2 should be used.

        2.  Cluster Shape:
            - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.

        3. Output C tensor store mode:
            - use_tma_store: Boolean indicating whether to use Tensor Memory Access (TMA) for storing results.

        :param acc_dtype: Data type of the accumulator.
        :type acc_dtype: type[cutlass.Numeric]
        :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
        :type mma_tiler_mn: Tuple[int, int]
        :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant.
        :type use_2cta_instrs: bool
        :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
        :type cluster_shape_mn: Tuple[int, int]
        :param use_tma_store: Use Tensor Memory Access (TMA) or normal store for output C tensor.
        :type use_tma_store: bool
        :param all_reduce: All-reduce mode, can be "none", "two_shot"
        :type all_reduce: str
           )r   rK               r   rD   )         	       r   rL   rM   N) rF   rG   rI   rH   	mma_tilerrJ   r   CtaGroupTWOONE	cta_group
all_reduce	occupancyepilog_warp_idmma_warp_idtma_warp_idall_reduce_warp_idlenthreads_per_ctacta_sync_bar_idepilog_sync_bar_idtmem_ptr_sync_bar_idall_reduce_sync_bar_idutilsget_smem_capacity_in_bytessmem_capacityr:   rank_idtorchr   get_world_sizeget_rank)selfrF   rG   rH   rI   rJ   rZ   
sm_versionr   r   r   __init__   sL   (	z"PersistentDenseGemmKernel.__init__c                 C   sh   | j \}}|| jrdnd dvrdS | jd | jrdnd dkr"dS | jd dkr2| jd dkr2dS dS )NrL   rK   @      Fr   rN   T)mma_tile_shape_mnrG   rI   )rm   mma_mmma_nr   r   r   is_validH  s   
z"PersistentDenseGemmKernel.is_validc                 C   s  t | j| j| j| j| j| jdd }tj	|j
dgd}d}| jd | jd || f| _| jd t	|jj | jd | jd f| _ttg | jdR |jjf| _t	| jjd | _t	| jjd | _| jdk| _| jdk| _t| jrt | j| j| j| j| _n| jdd | _| || j| j| j| j| j| j| j | j!| j
\| _"| _#| _$t %|| j| j| j#| _&t '|| j| j| j#| _(| jrt )| j| j| j| j$nd| _*| +|| j| j"| _,dS )a2  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout
        - Computing tensor memory allocation columns
        NrL   moderN   r   rK   )-sm100_utilsmake_trivial_tiled_mmaa_dtypea_major_modeb_major_moderF   rY   rU   r2   size	shape_mnkthr_idshapecta_tile_shape_mnktiled_dividemake_layoutrI   cluster_layout_vmnknum_mcast_ctas_anum_mcast_ctas_b
is_a_mcast
is_b_mcastcutlass
const_exprrJ   compute_epilogue_tile_shaperG   c_layoutc_dtypeepi_tile_compute_stagesb_dtyperh   r[   num_acc_stagenum_ab_stagenum_c_stagemake_smem_layout_aa_smem_layout_stagedmake_smem_layout_bb_smem_layout_stagedmake_smem_layout_epic_smem_layout_staged_compute_num_tmem_alloc_colsnum_tmem_alloc_cols)rm   	tiled_mmamma_inst_shape_kmma_inst_tile_kr   r   r   _setup_attributesR  s   


z+PersistentDenseGemmKernel._setup_attributesc                 C   s   | S )Nr   xr   r   r   <lambda>  s    z"PersistentDenseGemmKernel.<lambda>Nar"   cmax_active_clustersepilogue_opc_mcbarrier_flagbarrier_flag_mcc
                    s  |j _|j _|j _tj| _tj| _	tj|_
tjjkr:tdj dj   tjjj	jjjdd }
t|
jj}tj|
j}tjd}tjj|||j|
jj|j tju r}tj ndd\}}t!j|
j}tj"d}tjj#|||j|
jj|j tju rtj ndd\}}t$j|}t$j|}|| | _%d}d}tj&rtj'd}t()t(* ||j+\}},|j-j|\_.}d_/j&rt0j'j1nd	 tj2G  fd
dd}|_34|
|||||j&r|n|jjj"j'j+j.||||	j5|j6ddgg jdR |d dS )a  Execute the GEMM operation in steps:
        - Setup static attributes before smem/grid/tma computation
        - Setup TMA load/store atoms and tensors
        - Compute grid size with regard to hardware constraints
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param a: Input tensor A
        :type a: cute.Tensor
        :param b: Input tensor B
        :type b: cute.Tensor
        :param c: Output tensor C
        :type c: cute.Tensor
        :param c_mc: Output symmetric tensor C_mc, any write or read to a multicast tensor will be broadcasted to all GPUs
        :type c_mc: cute.Tensor
        :param max_active_clusters: Maximum number of active clusters
        :type max_active_clusters: cutlass.Constexpr
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :param epilogue_op: Optional elementwise lambda function to apply to the output tensor
        :type epilogue_op: cutlass.Constexpr
        :raises TypeError: If input data types are incompatible with the MMA instruction.
        :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled.
        zType must match: z != NrL   )NNNr   )internal_typeNNr      r   c                       s  e Zd ZU ejjejjf e	d< ejjejjf e	d< ejjejj
f e	d< ejjejj
f e	d< eje	d< eje	d< ejjejjj f jf e	d< ejjejjjejjf jf e	d< ejjejjjejjf jf e	d	< d
S )z9PersistentDenseGemmKernel.__call__.<locals>.SharedStorageab_full_mbar_ptrab_empty_mbar_ptracc_full_mbar_ptracc_empty_mbar_ptrtmem_dealloc_mbar_ptrtmem_holding_bufsCsAsBN)__name__
__module____qualname__r2   structMemRanger   Int64r   __annotations__r   r	   Alignr   buffer_align_bytesr{   cosizer   outerr   r   r   c_smem_sizerm   r   r   SharedStorage6  s<   
 

r   rK   )gridblockclusterstream)7element_typer{   r   r   rf   
LayoutEnumfrom_tensormma_major_moder|   r}   r   r   r   	TypeErrorr   ry   rz   rF   rY   rU   r2   r~   r   r   cluster_shape_to_tma_atom_ArI   slice_r   nvgpumake_tiled_tma_atom_Ar   r   TFloat32cluster_shape_to_tma_atom_Br   make_tiled_tma_atom_Bsize_in_bytesnum_tma_load_bytesrJ   r   r   make_tiled_tma_atomCopyBulkTensorTileS2GOpr   _compute_gridr   tile_sched_paramsr   r   r   r   shared_storagekernellaunchra   )rm   r   r"   r   r   r   r   r   r   r   r   atom_thr_sizea_opa_smem_layout
tma_atom_atma_tensor_ab_opb_smem_layout
tma_atom_btma_tensor_ba_copy_sizeb_copy_size
tma_atom_ctma_tensor_cepi_smem_layoutr   r   r   r   r   __call__  s   &



z"PersistentDenseGemmKernel.__call__r   r   mA_mklr   mB_nklr   mC_mnlr   r   r   r   r   r   c                 C   sL  t j }t j|}|| jkr%t| t| t| j	r%t| t 
|jjdk}t j \}}}|t 
|jj }|dk}t jt j }||}t j \}}}t }|| j}|j}|j} ttjj}!| j| j d }"ttjj|"}#tjj|j  | j!|!|#| j"|d}$ttjj}%t#| j$|rdnd }&ttjj|&}'tj%j|j&  | j'|%|'|d}(|r|| jkrd})t j(  t j)||) W d   n1 sw   Y  t j*  t 
| j+dkrt j,  | j	r|j-j.|j/|j0dnd}*|j1j.|	j/|	j0d}+|j2j.|
j/|
j0d},d}-d}.t| j3p#| j4p#|r6tj5||dd	}-tj5||dd	}.t 6|t 7| j8d
d}/t 6|t 7| j8dd}0t 6|t 7| j8dd}1t j
|/dgd}2|9|}3|3:|/}4|3;|0}5|3<|1}6t =t 7|dj}7t>||d |7t ?|+ddt ?|4dd\}8}9t =t 7|dj}:t>||d |:t ?|,ddt ?|5dd\};}<|@|+}=|A|,}>|B| j8dd }?|Ct D|?| j'}@t 
| j+dkrt jE  n
t jjF| jG| jHd || jkrtjI|t j t jJ }A|AK }BtLtjMjN| j!}C|BjOr|BjP}D|Dd t 
|jj |Dd |Dd f}E|9d|Ed d|Ed f }F|<d|Ed d|Ed f }G|CQ  tRd}H|CjS|2k rU|$T|C}HtjUd|2dddD ]K}I|$V|C|H t jW||Fd|CjSf |8d|CjXf |$Y|C|-d t jW||Gd|CjSf |;d|CjXf |$Y|C|.d |CZ  tRd}H|CjS|2k r|$T|C}Hq^|A[  |A\ }B|BjOs|$]|C || j^krdt#| j^g| j$R  }Jt jjF| j_|Jd t jj`| jad| d}Kt b|K|@jc}LtjI|t j t jJ }A|AK }BtLtjMjd| j!}MtLtjMjN| j'}N|BjOr|BjP}D|Dd t 
|jj |Dd |Dd f}E|Lddd|NjXf }O|MQ  tRd}P|MjS|2k rD|rD|$e|M}P|rL|(V|N |ftgjhjid tU|2D ]W}I|r|$j|M|P t j
|=dgd}QtjU|QddD ]}Rdd|R|MjXf}St k||O|=|S |>|S |O |ftgjhjid qr|$l|M |MZ  tRd}P|MjS|2k r|r|$e|M}PqX|r|(m|N |NZ  |A[  |A\ }B|BjOs|(]|N || j^k r|| j$d krt jjn| jo| |d dt#| j^g| j$R  }Jt jjF| j_|Jd t jj`| jad| d}Kt b|K|@jc}L|}T| p|T|L|6||\}U}V}Wd}Xd}Yd}Zd}[d}\d}]d}^d}_t| j	rPt q|Wj| jr}X| s|U|X|T|*\}Y}[}\| t|T||6||*\}}]}^n| t|T|U|6||*\}Z}X}_tjI|t j t jJ }A|AK }BtLtjMjd| j'}`d}at| j	rttjjdt#| j$ dt#| j$ }btjuj| jv|bd}a|BjOrI|BjP}D|Dd t 
|jj |Dd |Dd f}Ed}cd}dt| j	r|^dddg|ER  }cn|_dddddg|ER  }d|Vddddd|`jXf }e|(j|` t ?|edt w|e}et| j	rt ?|cdt w|c}cn
t ?|ddt w|d}dt j
|ejdgd}f|Ajx|f }gtU|fD ]}h|eddd|hf }it W|U|i|W t| j	r|Yy|Wz }j||j{| jr}j|[||j |g|h | jv }kt W|Y|[|\ddd|kf  t jj}t jj~jt jjjd dt#| j$ }lt jjF| j|ld || j$d krt W||]d|kf |cd|hf  |am  |aV  t jjF| j|ld q |Wz }j||j{| jr}j|X||j t W|Z|X|dddd|hf  q t j(  |(l|` W d   n	1 sw   Y  |`Z  t| jdkr=t|Ajt 
| j+ t j  }m|| j$d kr=t jjddd t j(  |j|m }nt j  t|n t j}t jj~j W d   n	1 s8w   Y  |A[  |A\ }B|BjOs|| j$d krXt jj|d dt#| j$ }lt jjF| j|ld || j$d kr|rt j||dA  t j|d t jj|K| jo|d t| j	r|a]  t| jdkr|| jd kr| j}ot| j}pt j }qtjI|t j t jJ }A|AK }Bd|jj }r| j8d |r }st#| jt jj|s  }tt j=|t|sf|sdfd }ut j=d|rf|rdfd }vt t j |j}wt |w|u|v}x|x9|| jd d  }y|BjOrW|BjP}Dt|Ajt 
| j+ t j  }m|Dd t 
|jj |Dd |Dd f}E|| jd krht j(  |j|m }ntj|n|pdd!d" W d   n	1 scw   Y  t jjF| jdt#| j d t 6|t 7| j8dd}z|3<|z}{|{d#ddg|ER  }|| j8d t 
|jj }}t|}| j }~t |||~| j8d f}t 7|d#|odff}|y|}|j\}}}t|D ]}t|D ]x}|d||f j}d$\}}}}t| jrtkrt|\}}}}nKt| jrtkrt|\}}}}n8t| jrtkrt|\}}}}n%t| jrtkr,t|\}}}}nt| jrtkr>t|\}}}}t||||| qАq|A[  |A\ }B|BjOst jjF| jdt#| j d t 
|Ajjt 
| j+ }|| jd krt j(  t|j| |j| | j W d   dS 1 sw   Y  dS dS dS dS )%zW
        GPU device kernel performing the Persistent batched GEMM computation.
        rL   r   rK   )barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk)r   r   r   r   r   rT   N)swizzle)
mcast_mode)Nr   NNNN)r   NNr   rM   rw   )r   r   Nr   )r   Nr   r   )
barrier_idnumber_of_threads)unroll)tma_bar_ptr
mcast_mask   )	alignmentptr_to_buffer_holding_addrFT)unroll_full)
is_two_cta)r   r   )spacetwo_shot)readrr   )strider!   )r   r   r    NN)r   r   r   r   )r2   r3   warp_idxmake_warp_uniformr^   r   prefetch_descriptorr   r   rJ   r~   r   r   r4   block_idx_in_clusterget_flat_coord
thread_idxrf   SmemAllocatorallocater   r   r   pipelineCooperativeGroupAgentThreadr   r   PipelineTmaUmmacreater   data_ptrr   r   r`   r\   PipelineUmmaAsyncr   r   	elect_onembarrier_initmbarrier_init_fencerI   cluster_arrive_relaxedr   
get_tensorr   innerr   r   r   r   create_tma_multicast_mask
local_tiler   rU   	get_slicepartition_Apartition_Bpartition_Cr   tma_partitiongroup_modesmake_fragment_Amake_fragment_Bpartition_shape_Cmake_fragment_Cappendcluster_waitr0   rb   ra   StaticPersistentTileSchedulerr5   initial_work_tile_infomake_pipeline_statePipelineUserTypeProduceris_valid_tiletile_idxreset_countBooleancountproducer_try_acquirerangeproducer_acquirecopyindexproducer_get_barrieradvanceadvance_to_next_workget_current_workproducer_tailr]   rd   retrieve_tmem_ptrrF   make_tensorlayoutConsumerconsumer_try_waitsetr   Field
ACCUMULATEconsumer_waitgemmconsumer_releaseproducer_commit
alloc_tmemr   epilog_tmem_copy_and_partitionmake_fragmentr   epilog_smem_copy_and_partitionepilog_gmem_copy_and_partitionPipelineTmaStorer   ranknum_tiles_executedretileloadtostorer7   r8   async_sharedSharedSpace
shared_ctarc   rZ   r	   _current_work_linear_idxcp_async_bulk_wait_groupiteratorfence_acq_rel_gpur   r9   relinquish_tmem_alloc_permitmbarrier_arrivembarrier_waitdealloc_tmemr_   ri   r:   lane_idxr   width	WARP_SIZEmake_copy_atomr   CopyUniversalOpmake_tiled_copy_tvr   spin_lock_atom_cas_relaxed_waitre   intzipped_dividepartition_Srange_constexprr
   multimem_ld_reduce_8xf16r   multimem_ld_reduce_4xf32r   multimem_ld_reduce_8xbf16r   multimem_ld_reduce_16xe4m3r   multimem_ld_reduce_16xe5m2multimem_st_4xb32paramsproblem_layout_ncluster_mnlrB   )rm   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  rG   r;   r<   r=   mma_tile_coord_vis_leader_ctacta_rank_in_clusterblock_in_cluster_coord_vmnktidxr@   smemstorager   r   ab_pipeline_producer_groupnum_tma_producerab_pipeline_consumer_groupab_pipelineacc_pipeline_producer_groupnum_acc_consumer_threadsacc_pipeline_consumer_groupacc_pipelinenum_tmem_dealloc_threadsr   r   r   a_full_mcast_maskb_full_mcast_maskgA_mklgB_nklgC_mnl
k_tile_cntthr_mmatCgAtCgBtCgCa_cta_layouttAsAtAgAb_cta_layouttBsBtBgBtCrAtCrB	acc_shapetCtAcc_fake
tile_sched	work_tileab_producer_statecur_tile_coordmma_tile_coord_mnl
tAgA_slice
tBgB_slicepeek_ab_empty_statusk_tiletmem_ptr_read_threadstmem_ptrtCtAcc_baseab_consumer_stateacc_producer_statetCtAccpeek_ab_full_statusnum_kblocks
kblock_idxkblock_coordepi_tidxtiled_copy_t2rtTR_tAcc_basetTR_rAcctTR_rCtiled_copy_r2s	simt_atomtRS_rCtRS_sCbSG_sCbSG_gC_partitionedtTR_gC_partitionedacc_consumer_state
c_pipelinec_producer_groupbSG_gCtTR_gCtTR_tAccsubtile_cntnum_prev_subtilessubtile_idxtTR_tAcc_mnacc_vecc_bufferepilog_threadstile_idflagri   r:   lane_idatom_val
atom_thr_n
atom_thr_m
thr_layout
val_layoutcopy_atom_loadtiled_copy_fakethr_copy_fakegC_mctCgC_mctCgC_mc_slicecta_mma_tile_mm_local_ranktCgC_mc_slice_partitionedtCgC_mc_local_rankfrgC_mcatomloop_mloop_nijmc_ptrr   yzwlast_flag_idxr   r   r   r   s  sh  






	












	

F







Z	


	

	










	  






O
$zz PersistentDenseGemmKernel.kernelrz  tAccr  r   c                 C   s   t | j| j| j| j||}t|d |}t	||d }|
|}	|	|}
t|d |}|	|}t|d j| j}||
|fS )a  
        Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param tAcc: The accumulator tensor to be copied and partitioned
        :type tAcc: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param use_2cta_instrs: Whether use_2cta_instrs is enabled
        :type use_2cta_instrs: bool

        :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where:
            - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
            - tTR_tAcc: The partitioned accumulator tensor
            - tTR_rAcc: The accumulated tensor in register used to hold t2r results
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        )r  r   r   N)NNr   r   r   r  r   r   NNNNNNr   r   r   r   r   )ry   get_tmem_load_opr   r   r   rF   r2   flat_divider   make_tmem_copyr   rl  partition_DrN  r   )rm   rz  r  r  r   rG   copy_atom_t2rtAcc_epir  thr_copy_t2rr  
gC_mnl_epir  r  r   r   r   rM    s2   	



z8PersistentDenseGemmKernel.epilog_tmem_copy_and_partitionr  r  r   c           
      C   sJ   t | j| j| j|}t||}||}||}|	|}	||	|fS )a  
        Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).

        :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
        :type tiled_copy_t2r: cute.TiledCopy
        :param tTR_rC: The partitioned accumulator tensor
        :type tTR_rC: cute.Tensor
        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor
        :type sepi: cute.Tensor

        :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where:
            - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s)
            - tRS_rC: The partitioned tensor C (register source)
            - tRS_sC: The partitioned tensor C (smem destination)
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        )
ry   get_smem_store_opr   r   rF   r2   make_tiled_copy_Dr   r  rT  )
rm   r  r  rz  r   copy_atom_r2sr  thr_copy_r2sr  r  r   r   r   rO    s   



z8PersistentDenseGemmKernel.epilog_smem_copy_and_partitionr  c                 C   s   t |d |}t| jr1|}t |dd}t |dd}	t|dt d||	\}
}||
|fS |}|	|}|
|}t |d j| j}t t j | j}|||fS )aK  Make tiledCopy for global memory store, then use it to:
        - partition register array (source) and global memory (destination) for none TMA store version;
        - partition shared memory (source) and global memory (destination) for TMA store version.

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param atom: The copy_atom_c to be used for TMA store version, or tiled_copy_t2r for none TMA store version
        :type atom: cute.CopyAtom or cute.TiledCopy
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor

        :return: A tuple containing either:
            - For TMA store: (tma_atom_c, bSG_sC, bSG_gC) where:
                - tma_atom_c: The TMA copy atom
                - bSG_sC: The partitioned shared memory tensor C
                - bSG_gC: The partitioned global tensor C
            - For non-TMA store: (simt_atom, tTR_rC, tTR_gC) where:
                - simt_atom: The SIMT copy atom
                - tTR_rC: The register tensor C
                - tTR_gC: The partitioned global tensor C
        :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]
        r  r   rL   rK   r  )r2   r  r   r   rJ   r%  r   r$  r   r   r  rN  r   r   rf  r   rg  )rm   rz  r  r  r   r   gC_epir   sC_for_tma_partitiongC_for_tma_partitionr  r  r  r  r  r  r  r   r   r   rP  -  s.   #



z8PersistentDenseGemmKernel.epilog_gmem_copy_and_partitionmma_tiler_mnkr{   r   r   r   rh   r[   c
                 C   s   d}
|	rdnd}t | ||d}t | ||d}|	r"t |||dnd}t||t|| }d}|	r:t||nd}|| }|| ||  | }|	r`|||| |  |||   ||  7 }|
||fS )a  Computes the number of stages for A/B/C operands based on heuristics.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
        :type mma_tiler_mnk: tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param epi_tile: The epilogue tile shape.
        :type epi_tile: cute.Tile
        :param c_dtype: Data type of operand C (output).
        :type c_dtype: type[cutlass.Numeric]
        :param c_layout: Layout enum of operand C.
        :type c_layout: utils.LayoutEnum
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int
        :param use_tma_store: Whether TMA store is enabled.
        :type use_tma_store: bool

        :return: A tuple containing the computed number of stages for:
                 (ACC stages, A/B operand stages, C stages)
        :rtype: tuple[int, int, int]
        rL   r   rK   Nr   )ry   r   r   r   r2   r   )r   r  r{   r   r   r   r   rh   r[   rJ   r   r   a_smem_layout_stage_oneb_smem_layout_staged_onec_smem_layout_staged_oneab_bytes_per_stagembar_helpers_bytesc_bytes_per_stagec_bytesr   r   r   r   r   m  s^   )




z)PersistentDenseGemmKernel._compute_stagesr   c           
      C   sT   t |d}t j| |d}|d j}g |dR }t||}tj||}	||	fS )a
  Use persistent tile scheduler to compute the grid size for the output tensor C.

        :param c: The output tensor C
        :type c: cute.Tensor
        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: tuple[int, int, int]
        :param cluster_shape_mn: Shape of each cluster in M, N dimensions.
        :type cluster_shape_mn: tuple[int, int]
        :param max_active_clusters: Maximum number of active clusters.
        :type max_active_clusters: cutlass.Constexpr

        :return: A tuple containing:
            - tile_sched_params: Parameters for the persistent tile scheduler.
            - grid: Grid shape for kernel launch.
        :rtype: Tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]]
        r   )tiler)r   r   rK   )r2   r   rk  r   rf   PersistentTileSchedulerParamsr,  get_grid_shape)
r   r   rI   r   c_shapegcnum_ctas_mnlcluster_shape_mnlr   r   r   r   r   r     s   
z'PersistentDenseGemmKernel._compute_gridrU   r   c                 C   s2   |  |dd }| t||}t|}|S )a  
        Compute the number of tensor memory allocation columns.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler: The shape (M, N, K) of the MMA tile.
        :type mma_tiler: tuple[int, int, int]
        :param num_acc_stage: The stage of the accumulator tensor.
        :type num_acc_stage: int

        :return: The number of tensor memory allocation columns.
        :rtype: int
        NrL   )r(  r)  r2   r*  rf   get_num_tmem_alloc_cols)r   rU   r   r  r  r   r   r   r   r     s   
z6PersistentDenseGemmKernel._compute_num_tmem_alloc_colsab_dtyperZ   c              	   C   s(  d}| t jt jt jt jt jt jt jhvrd}|t jt jt j	hvs<|t jkr/| t jt jt jhvs<|t j	kr>| t jt jhvr>d}|t jkrW|t jt jt jt jt jt j	t jt jhvsy|t jkrd|t jt jhvsy|t j	kr{|t jt jt jt j	t jt jhvr{d}t 
|dko|t jt jt jt jt jhvrd}|S )a  
        Check if the dtypes are valid

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]

        :return: True if the dtypes are valid, False otherwise
        :rtype: bool
        TFrD   )r   r
   r   r   Uint8Int8r   r   r   r	   r   )r  rF   r   rZ   rv   r   r   r   is_valid_dtypes  sz   	





z)PersistentDenseGemmKernel.is_valid_dtypesc                 C   s   d}| s
|d dv s| r|d dv sd}|d t dddvr d}|d | r'd	nd dkr.d}d
d }|d |d  dksT|d dksT|d dksT||d rT||d sVd}|S )a	  
        Check if the mma tiler and cluster shape are valid

        :param use_2cta_instrs: Whether to use 2 CTA groups
        :type use_2cta_instrs: bool
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]

        :return: True if the mma tiler and cluster shape are valid, False otherwise
        :rtype: bool
        Tr   rp   )rr      FrK   rT   i  rL   c                 S   s   | dko| | d @ dkS )Nr   rK   r   r   r   r   r   r     s    zPPersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape.<locals>.<lambda>r   )r7  )rG   rH   rI   rv   is_power_of_2r   r   r   $is_valid_mma_tiler_and_cluster_shapee  s,   

z>PersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shapemnkla_majorb_majorc_majorc
                 C   sz   d}
dd }|||dk| ||fr'|||dk|||fr'|||dk| ||fs)d}
|	dkr;| d d	kr;|d d	kr;d}
|
S )
a  
        Check if the tensor alignment is valid

        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the problem shape is valid, False otherwise
        :rtype: bool
        Tc                 S   s*   |rdnd}|| }d| j  }|| dkS )Nr   rK   rr   )rd  )dtypeis_mode0_majortensor_shapemajor_mode_idxnum_major_elementsnum_contiguous_elementsr   r   r   check_contigous_16B_alignment  s   
zZPersistentDenseGemmKernel.is_valid_tensor_alignment.<locals>.check_contigous_16B_alignmentr  r  FrD   rr   r   r   )r  r  r  r  r  r   r  r  r  rZ   rv   r  r   r   r   is_valid_tensor_alignment  s   & z3PersistentDenseGemmKernel.is_valid_tensor_alignmentc                 C   sL   d}|d | r	dnd |d f}|s$||d  dkr"||d  dks$d}|S )ah  
        Check if the epilogue store option is valid

        :param use_2cta_instrs: Whether to use 2 CTA groups
        :type use_2cta_instrs: bool
        :param use_tma_store: Whether to use TMA store
        :type use_tma_store: bool
        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]

        :return: True if the epilogue store option is valid, False otherwise
        :rtype: bool
        Tr   rL   rK   Fr   )rG   rJ   r  r  rH   rv   cta_tile_shape_mnr   r   r   is_valid_epilog_store_option  s    z6PersistentDenseGemmKernel.is_valid_epilog_store_optionc                 C   sz   d}t | ||sd}t |||sd}t |||	|
| |||||
s$d}t |||||s/d}t dvr;|dkr;d}|S )a  
        Check if the gemm can be implemented

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param acc_dtype: The data type of the accumulator
        :type acc_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param use_2cta_instrs: Whether to use 2 CTA groups
        :type use_2cta_instrs: bool
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]
        :param use_tma_store: Whether to use TMA store
        :type use_tma_store: bool
        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The number of columns in the C tensor
        :type l: int
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the gemm can be implemented, False otherwise
        :rtype: bool
        TF)rL   rN   rR   rD   )rC   r  r  r  r  distrk   )r  rF   r   rG   rH   rI   rJ   r  r  r  r  r  r  r  rZ   can_implementr   r   r   r    s$   5
z'PersistentDenseGemmKernel.can_implement)rD   rE   )rD   )-r   r   r   __doc__r   r   Numericboolr   rj  ro   rv   r   r2   jitTensor	Constexprr   r   TiledMmaCopyAtomr   LayoutComposedLayoutr   Tilerf   r  r	   r4  	TiledCopyrM  rO  rP  staticmethodr   r   r   r   strr  r  r  r  r  r   r   r   r   rC      s   <


Z
h	
 9	
      [
?
%
@	
b
$V

*	
8
$

	
rC   r  )-typingr   r   r   r   rj   torch.distributedr   r  r   cutlass.cuter2   cutlass.utilsrf   cutlass.pipeliner  cutlass.utils.blackwell_helpersblackwell_helpersry   cutlass.utils.distributedcutlass.cute.nvgpur   r   cutlass.cute.typingr   r	   r
   r   r   r   r   r   cutlass._mlir.dialectsr   cutlass.cutlass_dslr   cutlass._mlir.dialects.nvvmr   r   r   r"  r,  r/   rB   rC   r   r   r   r   <module>   sN    $+
P