o
    ԰iG                    @   s   d dl mZmZmZmZ d dlm  mZ d dl	Z	d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlm  mZ d dlm  mZ d dlmZ d dlm  mZ d dlZdZ	 G dd dZG dd	 d	ZdS )
    )OptionalTypeTupleUnionN)cpasynctcgen05)from_dlpackg+eG?c                '   @   s<  e Zd ZdZdeej dedee	e	f dee	e	f fddZ
dd	 Zejd
d fdejdejdejdejdejdejdejdejfddZejdejdejdejdejdejdeej deej dejdejdejd ejd!ejd"eejejd#f d$eejejd#f d%ejd&ejd'ejdejdejf&d(d)Zd*ejd+ejd,ejd-ejd%ejd&ejdeejef d.eej ejejejf fd/d0Z!d1ej d2ejd3ejd4ejd*ejd5ejd6ejd.eej ejejejejejf fd7d8Z"d*ejd9eejej f d:eejej f d,ejd-ejd%ejd&ejd5ejd6ejd.eejejejejejejf fd;d<Z#e$dejd=ee	e	e	f d>eej d?eej d%ejd&ejd@eej dAej%dBeej dCej%dDe	dEe	d.ee	e	e	f fdFdGZ&e$dejdHee	e	e	f dee	e	f dejd.eejee	e	e	f f f
dIdJZ'e$dejdKee	e	e	f dLe	d.e	fdMdNZ(d#S )OPersistentDenseGemmKernela  This class implements batched matrix multiplication (C = A x B) with support for various data types
    and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

    :param acc_dtype: Data type for accumulation during computation
    :type acc_dtype: type[cutlass.Numeric]
    :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation
    :type use_2cta_instrs: bool
    :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
    :type mma_tiler_mn: Tuple[int, int]
    :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
    :type cluster_shape_mn: Tuple[int, int]
    :note: This kernel always uses Tensor Memory Access (TMA) for storing results.

    :note: In current version, A and B tensor must have the same data type
        - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported

    :note: Supported A/B data types:
        - TFloat32
        - Float16/BFloat16
        - Int8/Uint8
        - Float8E4M3FN/Float8E5M2

    :note: Supported accumulator data types:
        - Float32 (for all floating point A/B data types)
        - Float16 (only for fp16 and fp8 A/B data types)
        - Int32 (only for uint8/int8 A/B data types)

    :note: Supported C data types:
        - Float32 (for float32 and int32 accumulator data types)
        - Int32 (for float32 and int32 accumulator data types)
        - Float16/BFloat16 (for fp16 and fp8 accumulator data types)
        - Int8/Uint8 (for uint8/int8 accumulator data types)
        - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types)

    :note: Constraints:
        - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True)
        - MMA tiler N must be 32-256, step 32
        - Cluster shape M must be multiple of 2 if use_2cta_instrs=True
        - Cluster shape M/N must be positive and power of 2, total cluster size <= 16

    Example:
        >>> gemm = PersistentDenseGemmKernel(
        ...     acc_dtype=cutlass.Float32,
        ...     use_2cta_instrs=True,
        ...     mma_tiler_mn=(128, 128),
        ...     cluster_shape_mn=(2, 2)
        ... )
        >>> gemm(a_tensor, b_tensor, c_tensor, max_active_clusters, stream)
    	acc_dtypeuse_2cta_instrsmma_tiler_mncluster_shape_mnc                 C   s   || _ |d dk| _|| _g |dR | _|rtjjntjj| _d| _	d| _
d| _d| _dt| j| jg| j
R  | _d| _d| _d| _td	| _d
S )aY  Initializes the configuration for a Blackwell dense GEMM kernel.

        This configuration includes several key aspects:

        1.  MMA Instruction Settings (tcgen05):
            - acc_dtype: Data types for MMA accumulator.
            - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler.
            - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant
              with cta_group=2 should be used.

        2.  Cluster Shape:
            - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.

        3. Output C tensor store mode:
            - TMA store is always enabled for output tensors.

        :param acc_dtype: Data type of the accumulator.
        :type acc_dtype: type[cutlass.Numeric]
        :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
        :type mma_tiler_mn: Tuple[int, int]
        :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant.
        :type use_2cta_instrs: bool
        :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
        :type cluster_shape_mn: Tuple[int, int]
        r         )r   r                   r   sm_100N)r
   r   r   	mma_tilerr   CtaGroupTWOONE	cta_group	occupancyepilog_warp_idmma_warp_idtma_warp_idlenthreads_per_ctacta_sync_bar_idepilog_sync_bar_idtmem_ptr_sync_bar_idutilsget_smem_capacity_in_bytessmem_capacityselfr
   r   r   r    r)   b/home/ubuntu/.local/lib/python3.10/site-packages/cudnn/gemm_swiglu/dense_gemm_persistent_swiglu.py__init__   s   !z"PersistentDenseGemmKernel.__init__c                 C   s0  t | j| j| j| j| j| jdd }tj	|j
dgd}d}| jd | jd || f| _| jd | jd d | jd f| _| jd t	|jj | jd | jd f| _| jd t	|jj | jd | jd f| _ttg | jdR |jjf| _t	| jjd | _t	| jjd | _| jdk| _| jdk| _t | j| j| j| j| _t | j| j| j| j| _|  || j| j| j!| j| j| j| j| j| j| j"| j#\| _$| _%| _&| _'t (|| j| j| j%| _)t *|| j| j!| j%| _+t ,| j| j| j| j&| _-t ,| j| j| j| j'| _.| /|| j| j$| _0dS )a2  Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B
        - Computing epilogue subtile
        - Setting up A/B/C stage counts in shared memory
        - Computing A/B/C shared memory layout
        - Computing tensor memory allocation columns
        Nr   moder   r   r   )1sm100_utilsmake_trivial_tiled_mmaa_dtypea_major_modeb_major_moder
   r   r   cutesize	shape_mnkmma_tiler_cthr_idshapecta_tile_shape_mnkcta_tile_shape_mnk_ctiled_dividemake_layoutr   cluster_layout_vmnknum_mcast_ctas_anum_mcast_ctas_b
is_a_mcast
is_b_mcastcompute_epilogue_tile_shaper   ab12_layout
ab12_dtypeepi_tilec_layoutc_dtype
epi_tile_c_compute_stagesb_dtyper&   r   num_acc_stagenum_ab_stagenum_ab12_stagenum_c_stagemake_smem_layout_aa_smem_layout_stagedmake_smem_layout_bb_smem_layout_stagedmake_smem_layout_epiab12_smem_layout_stagedc_smem_layout_staged_compute_num_tmem_alloc_colsnum_tmem_alloc_cols)r(   	tiled_mmamma_inst_shape_kmma_inst_tile_kr)   r)   r*   _setup_attributes   s   
z+PersistentDenseGemmKernel._setup_attributesc                 C      | dt |  d  S Nr   Tmathexpxr)   r)   r*   <lambda>L      z"PersistentDenseGemmKernel.<lambda>abab12calphamax_active_clustersstreamepilogue_opc	                    s  |j _|j _|j _|j _tj| _	tj| _
tj|_|j _tj|_tjjkrItdj dj   tjj	j
jjjdd }	t|	jj}
tj|	j}tjd}tjj|||j|	j j|j tj!u rtj"ndd\}}t#j|	j}tj$d}tjj%|||j|	j j|j tj!u rtj"ndd\}}t&j|}t&j|}|| |
 _'t(t)|jj*}t(t)|jj+}tj,d}tj-d}t./t.0 |||\}}t./t.0 |||\}}1|j2j|\_3}d_4t5j,j6 t5j-j6tj7G  fd	d
d
}|_89|	||||||||j jj$j,j-j*j+j3||j:|j;ddgg jdR j8& |d dS )a  Execute the GEMM operation in steps:
        - Setup static attributes before smem/grid/tma computation
        - Setup TMA load/store atoms and tensors
        - Compute grid size with regard to hardware constraints
        - Define shared storage for kernel
        - Launch the kernel asynchronously

        :param a: Input tensor A
        :type a: cute.Tensor
        :param b: Input tensor B
        :type b: cute.Tensor
        :param ab12: Output tensor AB12 (full GEMM result)
        :type ab12: cute.Tensor
        :param c: Output tensor C (SwiGLU result)
        :type c: cute.Tensor
        :param max_active_clusters: Maximum number of active clusters
        :type max_active_clusters: cutlass.Constexpr
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :param epilogue_op: Optional elementwise lambda function to apply to the output tensor
        :type epilogue_op: cutlass.Constexpr
        :raises TypeError: If input data types are incompatible with the MMA instruction.
        :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled.
        zType must match: z != Nr   )NNNr   )internal_typeNNr      c                       s&  e Zd ZU ejjejjf e	d< ejjejjf e	d< ejjejj
f e	d< ejjejj
f e	d< eje	d< eje	d< ejjejjj f jf e	d< ejjejjjf jf e	d< ejjejjjejjf jf e	d	< ejjejjjejjf jf e	d
< dS )z9PersistentDenseGemmKernel.__call__.<locals>.SharedStorageab_full_mbar_ptrab_empty_mbar_ptracc_full_mbar_ptracc_empty_mbar_ptrtmem_dealloc_mbar_ptrtmem_holding_bufsAB12sCsAsBN)__name__
__module____qualname__r3   structMemRangecutlassInt64rL   __annotations__rK   Int32AlignrD   buffer_align_bytesrG   r0   cosizerP   outerrJ   rR   r)   ab12_smem_sizec_smem_sizer(   r)   r*   SharedStorage  sF   
 

	
r   r   )gridblockclustersmemrk   )<element_typer0   rJ   rD   rG   r$   
LayoutEnumfrom_tensormma_major_moder1   r2   rC   rF   r   
const_expr	TypeErrorr[   r.   r/   r
   r   r   r3   r4   r7   r8   cluster_shape_to_tma_atom_Ar   slice_rP   nvgpumake_tiled_tma_atom_Ar=   Float32TFloat32cluster_shape_to_tma_atom_BrR   make_tiled_tma_atom_Bsize_in_bytesnum_tma_load_bytescompositionmake_identity_layoutrE   rH   rT   rU   r   make_tiled_tma_atomCopyBulkTensorTileS2GOp_compute_gridr9   tile_sched_paramsr   r   r   r}   shared_storagekernellaunchr    )r(   re   rf   rg   rh   ri   rj   rk   rl   rX   atom_thr_sizea_opa_smem_layout
tma_atom_atma_tensor_ab_opb_smem_layout
tma_atom_btma_tensor_ba_copy_sizeb_copy_sizeab12_cta_v_layoutc_cta_v_layoutepi_smem_layoutepi_smem_layout_ctma_atom_ab12tma_tensor_ab12
tma_atom_ctma_tensor_cr   r   r)   r   r*   __call__B  s   %


%
z"PersistentDenseGemmKernel.__call__rX   r   mA_mklr   mB_nklr   r   	mAB12_mnlmC_mnlr=   rP   rR   rT   NrU   rE   rH   r   c                 C   s  t j }t j|}|| jkr$t| t| t| t| t |jj	dk}t j
 \}}}|t |jj	 }|dk}t jt j }|
|}t j \}}}t }|| j} | j}!| j}"ttjj}#| j| j d }$ttjj|$}%tjj| j | j|#|%| j|
d}&ttjj}'t | j!|rdnd }(ttjj|(})tj"j| j# | j$|'|)|
d}*|r|| jkrd}+t j%  t j&|!|+ W d   n1 sw   Y  t j'  t | j(dkrt j)  | j*j+|j,|j-d},| j.j+|j,|j-d}-| j/j+|j,|j-d}.| j0j+|j,|j-d}/d}0d}1t12| j3p'| j4p'|r:tj5|
|dd	}0tj5|
|dd	}1t 6|t 7| j8d
d}2t 6|t 7| j8dd}3t 6|t 7| j8dd}4t 6|	t 7| j9dd}5t j|2dgd}6|:|}7|7;|2}8|7<|3}9|7=|4}:|7=|5};t j \}}}t >t 7|
dj	}<t?||d |<t @|.ddt @|8dd\}=}>t >t 7|
dj	}?t?||d |?t @|/ddt @|9dd\}@}A|A|.}B|B|/}C|C| j8dd }D|Dt E|D| j$}Et | j(dkrt jF  n
t jjG| jH| jId || jkrtjJ|t j
 t jK }F|FL }GtMtjNjO| j}H|GjPr|GjQ}I|Id t |jj	 |Id |Id f}J|>d|Jd d|Jd f }K|Ad|Jd d|Jd f }L|HR  t1Sd}M|HjT|6k rr|&U|H}Mt1jVd|6dddD ]K}N|&W|H|M t jX||Kd|HjTf |=d|HjYf |&Z|H|0d t jX||Ld|HjTf |@d|HjYf |&Z|H|1d |H[  t1Sd}M|HjT|6k r|&U|H}Mq{|F\  |F] }G|GjPs1|&^|H || j_krdt | j_g| j!R  }Ot jjG| j`|Od t jja| jbd|"d}Pt c|P|Ejd}QtjJ|t j
 t jK }F|FL }GtMtjNje| j}RtMtjNjO| j$}S|GjPr|GjQ}I|Id t |jj	 |Id |Id f}J|Qddd|SjYf }T|RR  t1Sd}U|RjT|6k ra|ra|&f|R}U|ri|*W|S |gthjijjd t1jVd|6dddD ]W}N|r|&k|R|U t j|Bdgd}Vt1jV|VddD ]}Wdd|W|RjYf}Xt l||T|B|X |C|X |T |gthjijjd q|&m|R |R[  t1Sd}U|RjT|6k r|r|&f|R}Uqz|r|*n|S |S[  |F\  |F] }G|GjPs,|*^|S || j_k r|| j!d krt jjo| jp|"|d dt | j_g| j!R  }Ot jjG| j`|Od t jja| jbd|"d}Pt c|P|Ejd}Q|}Y| q|Y|Q|:|;|||\}Z}[}\}]d}^d}_d}`d}ad}bd}cd}dd}ed}fd}gd}ht r|\j	| js}^t r|\j	| js}it r|\j	| jt}_| u|Z|^|i|_|Y|,|-\}`}a}j}b}c}d| v|Y|||:|;|||,|-	\}}}e}f}g}htjJ|t j
 t jK }F|FL }GtMtjNje| j$}kttjjdt | j! }ltjwj| jx|ld}m|GjPr|GjQ}I|Id t |jj	 |Id |Id f}J|gdddg|JR  }n|hdddg|JR  }o|[ddddd|kjYf }p|*k|k t @|pdt y|p}pt @|ndt y|n}nt @|odt y|o}ot j|pj	dgd}q|Fjz|q }rt1Vd|qdD ]A}s|pddd|sf }t|pddd|sd f }ut X|Z|u|] t X|Z|t|\ |`{|\| }v|`{|]| }w|v| }v|w| }wdt j}~d|w t d | jb}xt r|xj	t1j}y|y|x t1t |yj	D ]}zt j|y|z |y|z< q|y| }{|{|w }{|v|{ | jt}||v| js}v|w| js}w|a|v |j|w |b|| |r|s | jx }}|r|s d | jx }~|r|sd  | j }t X|`|a|cddd|}f  t X|`|j|cddd|~f  t X|`|b|dddd|f  t jjt jjjt jjjd dt | j! }t jjG| j|d || j!d krit X||ed|}f |nd|sf  t X||ed|~f |nd|sd f  t X||fd|f |od|sd f  |mn  |mW  t jjG| j|d q1t j%  |*m|k W d   n	1 sw   Y  |k[  |F\  |F] }G|GjPs|| j!d krt jj|d dt | j! }t jjG| j|d || j!d kr|rt j|!|dA  t j|!d t jj|P| jp|d |m^  dS dS )zW
        GPU device kernel performing the Persistent batched GEMM computation.
        r   r   r   )barrier_storage
num_stagesproducer_groupconsumer_grouptx_countcta_layout_vmnk)r   r   r   r   r   r   N)swizzle)
mcast_mode)Nr   NNNN)r   NNrn   r   r,   )r   r   Nr   )r   Nr   r   )
barrier_idnumber_of_threads)unroll)tma_bar_ptr
mcast_mask   )	alignmentptr_to_buffer_holding_addrFT)unroll_full)
is_two_cta)r   r   )space)r3   archwarp_idxmake_warp_uniformr   r   prefetch_descriptorr4   r7   r8   	block_idxblock_idx_in_clusterget_flat_coord
thread_idxr$   SmemAllocatorallocater   rt   ru   pipelineCooperativeGroupAgentThreadr>   r?   PipelineTmaUmmacreaterp   data_ptrrL   r   r   r   PipelineUmmaAsyncrr   rK   	elect_onembarrier_initmbarrier_init_fencer   cluster_arrive_relaxedrv   
get_tensorr   innerrw   rx   ry   r   r   r@   rA   create_tma_multicast_mask
local_tiler   r   r6   	get_slicepartition_Apartition_Bpartition_Cr<   tma_partitiongroup_modesmake_fragment_Amake_fragment_Bpartition_shape_Cmake_fragment_Cappendcluster_waitbarrierr!   r    StaticPersistentTileSchedulergrid_diminitial_work_tile_infomake_pipeline_statePipelineUserTypeProduceris_valid_tiletile_idxreset_countBooleancountproducer_try_acquirerangeproducer_acquirecopyindexproducer_get_barrieradvanceadvance_to_next_workget_current_workproducer_tailr   r#   retrieve_tmem_ptrr
   make_tensorlayoutConsumerconsumer_try_waitsetr   Field
ACCUMULATEconsumer_waitgemmconsumer_releaseproducer_commit
alloc_tmemrW   epilog_tmem_copy_and_partitionmake_rmem_tensorrD   rG   epilog_smem_copy_and_partitionepilog_gmem_copy_and_partitionPipelineTmaStorerM   ranknum_tiles_executedretileloadr_   exp2LOG2_Etor   storerange_constexpr
rcp_approxrN   fence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctar"   relinquish_tmem_alloc_permitmbarrier_arrivembarrier_waitdealloc_tmem)r(   rX   r   r   r   r   r   r   r   r   r=   rP   rR   rT   rU   rE   rH   r   rl   ri   r   r   bidxbidybidzmma_tile_coord_vis_leader_ctacta_rank_in_clusterblock_in_cluster_coord_vmnktidx_r   storagert   ru   ab_pipeline_producer_groupnum_tma_producerab_pipeline_consumer_groupab_pipelineacc_pipeline_producer_groupnum_acc_consumer_threadsacc_pipeline_consumer_groupacc_pipelinenum_tmem_dealloc_threadsrv   rw   rx   ry   a_full_mcast_maskb_full_mcast_maskgA_mklgB_nkl	gAB12_mnlgC_mnlk_block_cntthr_mmatCgAtCgBtCgAB12tCgCa_cta_layouttAsAtAgAb_cta_layouttBsBtBgBtCrAtCrB	acc_shapetCtAcc_fake
tile_sched	work_tileab_producer_statecur_tile_coordmma_tile_coord_mnl
tAgA_slice
tBgB_slicepeek_ab_empty_statusk_blocktmem_ptr_read_threadstmem_ptrtCtAcc_baseab_consumer_stateacc_producer_statetCtAccpeek_ab_full_statusnum_kphases
kphase_idxkphase_coordepi_tidxtiled_copy_t2rtTR_tAcc_basetTR_rAcc	tTR_rAcc1	tTR_rAB12tTR_rCtiled_copy_r2s	tRS_rAB12tRS_rC	tRS_sAB12tRS_sC	bSG_sAB12bSG_sCbSG_gAB12_partitionedbSG_gC_partitionedtTR_rAB12_1tRS_rAB12_1acc_consumer_statec_producer_group
c_pipeline	bSG_gAB12bSG_gCtTR_tAccsubtile_cntnum_prev_subtilessubtile_idxtTR_tAcc_mntTR_tAcc_mn1acc_vec0acc_vec1gate_rcpresigate	acc_vec_cab12_buffer0ab12_buffer1c_bufferepilog_threadsr)   r)   r*   r     s&  







	












	


<







T	
	


"








    z PersistentDenseGemmKernel.kernelr=  tAccrM  rN  returnc                 C   s   t | j| j| j| j||}t|d |}	t	||	d }
|

|}||	}t|d |}||}t|d j| j}t|d j| j}|
|||fS )az  
        Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param tAcc: The accumulator tensor to be copied and partitioned
        :type tAcc: cute.Tensor
        :param gAB12_mnl: The global tensor AB12
        :type gAB12_mnl: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param epi_tile_c: The epilogue tiler for C
        :type epi_tile_c: cute.Tile
        :param use_2cta_instrs: Whether use_2cta_instrs is enabled
        :type use_2cta_instrs: bool

        :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where:
            - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
            - tTR_tAcc: The partitioned accumulator tensor
            - tTR_rAcc: The accumulated tensor in register used to hold t2r results
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        )NNr   r   N)NNr   r   r   r  r   r   NNN)NNNr   r   r   r   r   )r.   get_tmem_load_opr9   rC   rD   r
   r3   flat_divider   make_tmem_copyr   partition_Spartition_Dr  r8   )r(   r=  r  rM  rN  rE   rH   r   copy_atom_t2rtAcc_epirs  thr_copy_t2rr  gAB12_mnl_epi	tTR_gAB12ru  rv  r)   r)   r*   r    s(   #	


z8PersistentDenseGemmKernel.epilog_tmem_copy_and_partitionrs  rw  r  rx  rv   rw   c                 C   sn   t | j| j| j|}t||}	|	|}
|
|}|
|}|		|}|		|}|		|}|	|||||fS )a  
        Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).

        :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r)
        :type tiled_copy_t2r: cute.TiledCopy
        :param tTR_rAB12: The partitioned accumulator tensor for AB12
        :type tTR_rAB12: cute.Tensor
        :param tTR_rAB12_1: The partitioned accumulator tensor for AB12 (second tile)
        :type tTR_rAB12_1: cute.Tensor
        :param tTR_rC: The partitioned accumulator tensor for C
        :type tTR_rC: cute.Tensor
        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param sAB12: The shared memory tensor for AB12
        :type sAB12: cute.Tensor
        :param sC: The shared memory tensor for C
        :type sC: cute.Tensor

        :return: A tuple containing (tiled_copy_r2s, tRS_rAB12, tRS_rAB12_1, tRS_rC, tRS_sAB12, tRS_sC) where:
            - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s)
            - tRS_rAB12: The partitioned tensor AB12 (register source)
            - tRS_sAB12: The partitioned tensor AB12 (smem destination)
            - tRS_rC: The partitioned tensor C (register source)
            - tRS_sC: The partitioned tensor C (smem destination)
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor, cute.Tensor, cute.Tensor, cute.Tensor]
        )
r.   get_smem_store_oprC   rD   r
   r3   make_tiled_copy_Dr   r  r%  )r(   rs  rw  r  rx  r=  rv   rw   copy_atom_r2sry  thr_copy_r2sr|  r}  rz  r  r{  r)   r)   r*   r     s   $





z8PersistentDenseGemmKernel.epilog_smem_copy_and_partitionatom1atom2c
                 C   s   t |d |}
t |d |}|}|}t |dd}t |	dd}t |
dd}t |dd}t|dt d||\}}t|dt d||\}}||||||fS )a%  Make tiledCopy for global memory store, then use it to:
        - partition register array (source) and global memory (destination) for none TMA store version;
        - partition shared memory (source) and global memory (destination) for TMA store version.

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param atom1: The copy_atom for AB12 TMA store
        :type atom1: cute.CopyAtom or cute.TiledCopy
        :param atom2: The copy_atom for C TMA store
        :type atom2: cute.CopyAtom or cute.TiledCopy
        :param gAB12_mnl: The global tensor AB12
        :type gAB12_mnl: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler for AB12
        :type epi_tile: cute.Tile
        :param epi_tile_c: The epilogue tiler for C
        :type epi_tile_c: cute.Tile
        :param sAB12: The shared memory tensor for AB12
        :type sAB12: cute.Tensor
        :param sC: The shared memory tensor for C
        :type sC: cute.Tensor

        :return: A tuple containing:
            - tma_atom_ab12: The TMA copy atom for AB12
            - tma_atom_c: The TMA copy atom for C
            - bSG_sAB12: The partitioned shared memory tensor AB12
            - bSG_sC: The partitioned shared memory tensor C
            - bSG_gAB12: The partitioned global tensor AB12
            - bSG_gC: The partitioned global tensor C
        :rtype: Tuple[cute.CopyAtom, cute.CopyAtom, cute.Tensor, cute.Tensor, cute.Tensor, cute.Tensor]
        r  r   r   r   )r3   r  r   r   r   r<   )r(   r=  r  r  rM  rN  rE   rH   rv   rw   	gAB12_epigC_epir   r   sAB12_for_tma_partitionsC_for_tma_partitiongAB12_for_tma_partitiongC_for_tma_partitionr~  r  r  r  r)   r)   r*   r!    s.   -z8PersistentDenseGemmKernel.epilog_gmem_copy_and_partitionmma_tiler_mnkr0   rJ   rD   rC   rG   rF   r&   r   c                 C   s   d}d}d}t | ||d}t | ||d}t |||d}t ||	|d}t||t|| }d}t||}|| }t||}|| }|
| || |  | }|| | }||| ||   }|| ||  }||||fS )a  Computes the number of stages for A/B/AB12/C operands based on heuristics.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
        :type mma_tiler_mnk: tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param epi_tile: The epilogue tile shape for AB12.
        :type epi_tile: cute.Tile
        :param epi_tile_c: The epilogue tile shape for C.
        :type epi_tile_c: cute.Tile
        :param ab12_dtype: Data type of operand AB12 (full GEMM output).
        :type ab12_dtype: type[cutlass.Numeric]
        :param ab12_layout: Layout enum of operand AB12.
        :type ab12_layout: utils.LayoutEnum
        :param c_dtype: Data type of operand C (SwiGLU output).
        :type c_dtype: type[cutlass.Numeric]
        :param c_layout: Layout enum of operand C.
        :type c_layout: utils.LayoutEnum
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
                 (ACC stages, A/B operand stages, AB12 stages, C stages)
        :rtype: tuple[int, int, int, int]
        r   r   r   ro   )r.   rO   rQ   rS   r3   r   )rX   r  r0   rJ   rE   rH   rD   rC   rG   rF   r&   r   rK   rM   rN   a_smem_layout_stage_oneb_smem_layout_staged_oneab12_smem_layout_staged_onec_smem_layout_staged_oneab_bytes_per_stagembar_helpers_bytesab12_bytes_per_stage
ab12_bytesc_bytes_per_stagec_bytesrL   total_ab_smemtotal_output_smemtotal_smem_usedr)   r)   r*   rI   V  sL   /z)PersistentDenseGemmKernel._compute_stagesr9   c           
      C   sT   t |d}t j| |d}|d j}g |dR }t||}tj||}	||	fS )a  Use persistent tile scheduler to compute the grid size for the output tensor AB12.

        :param ab12: The output tensor AB12
        :type ab12: cute.Tensor
        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: tuple[int, int, int]
        :param cluster_shape_mn: Shape of each cluster in M, N dimensions.
        :type cluster_shape_mn: tuple[int, int]
        :param max_active_clusters: Maximum number of active clusters.
        :type max_active_clusters: cutlass.Constexpr

        :return: A tuple containing:
            - tile_sched_params: Parameters for the persistent tile scheduler.
            - grid: Grid shape for kernel launch.
        :rtype: Tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]]
        rn   )tiler)r   r   r   )r3   r   zipped_divider8   r$   PersistentTileSchedulerParamsr   get_grid_shape)
rg   r9   r   rj   
ab12_shapegab12num_ctas_mnlcluster_shape_mnlr   r   r)   r)   r*   r     s   
z'PersistentDenseGemmKernel._compute_gridr   rK   c                 C   s2   |  |dd }| t||}t|}|S )a  
        Compute the number of tensor memory allocation columns.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler: The shape (M, N, K) of the MMA tile.
        :type mma_tiler: tuple[int, int, int]
        :param num_acc_stage: The stage of the accumulator tensor.
        :type num_acc_stage: int

        :return: The number of tensor memory allocation columns.
        :rtype: int
        Nr   )r   r   r3   r   r$   get_num_tmem_alloc_cols)rX   r   rK   r]  r^  rW   r)   r)   r*   rV     s   
z6PersistentDenseGemmKernel._compute_num_tmem_alloc_cols))rz   r{   r|   __doc__r   r   Numericboolr   intr+   r[   r3   jitTensorr   	ConstexprcudaCUstreamr   r   TiledMmaCopyAtomr   LayoutComposedLayoutr   Tiler$   r  r   r  	TiledCopyr  r   r!  staticmethodr   rI   r   rV   r)   r)   r)   r*   r	   ^   sl   2


:w
	 A	
      	
@	
0	

G	
j
 r	   c                   @   s   e Zd ZdZdeej dedee	e	f dee	e	f fddZ
ejdd	 fd
ejdejee	e	e	f  dejee	e	e	f  dejdejee	e	e	f  dejee	e	e	f  dejdejee	e	e	f  dejee	e	e	f  dejdejdejdejdejfddZdS )!PersistentDenseGemmKernelNoDlpackzWrapper around PersistentDenseGemmKernel that avoids DLPack.

    This wrapper constructs cute.Tensors directly from cute.Pointer, shapes, and
    explicit layout orders for operands A, B, AB12 and C.
    r
   r   r   r   c                 C   s   t ||||d| _d S )N)r
   r   r   r   )r	   r   r'   r)   r)   r*   r+     s   z*PersistentDenseGemmKernelNoDlpack.__init__c                 C   r\   r]   r^   ra   r)   r)   r*   rc   !  rd   z*PersistentDenseGemmKernelNoDlpack.<lambda>a_ptra_shapea_orderb_ptrb_shapeb_orderab12_ptrr  
ab12_orderc_cuteri   rj   rk   rl   c              
   C   sd   t j|t j||dd}t j|t j||dd}t j|t j||	dd}| ||||
|||| d S )N)order)r  )r3   r  make_ordered_layoutr   )r(   r  r  r  r  r  r  r  r  r  r  ri   rj   rk   rl   a_cuteb_cute	ab12_cuter)   r)   r*   r     s   z*PersistentDenseGemmKernelNoDlpack.__call__N)rz   r{   r|   r  r   r   r  r  r   r  r+   r3   r  Pointerr  r  r   r  r  r   r)   r)   r)   r*   r    sT    


	
r  ) typingr   r   r   r   cuda.bindings.driverbindingsdriverr  r   cutlass.cuter3   cutlass.cute.nvgpur   r   cutlass.utilsr$   cutlass.pipeliner   cutlass.cute.testingtestingcutlass.utils.blackwell_helpersblackwell_helpersr.   cutlass.cute.runtimer   cutlass.cute.mathr_   inspectr(  r	   r  r)   r)   r)   r*   <module>   s6   /           )