o
    c۷i4                     @   s  d dl mZmZmZmZmZ d dlmZ d dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZmZmZ d dlmZ d dlmZ d dlZd dlmZ d d	lmZ d d
lmZ dZedddddde
jde
j de
j dee
j  de!ddfddZ"edddde
j de
j fddZ#edddde
jde
j de
j e
j$B de
j fddZ%edddde
j&de
j$de
j de
j fddZ'e	ddddd eej( d!e)d"e!de
j*fd#d$Z+eddddd%de
j de
j dee
j  d"e!ddf
d&d'Z,	dd eej( d)e)d!e)d"e!de
jf
d*d+Z-	(	dd eej( d,e)d)e)d!e)d"e!de
jfd-d.Z.e
j/d/e
j d0ede
j fd1d2Z0d3Z1d4Z2d5e1 Z3e	 	ddddd6e
j d7e)d8e!de
j fd9d:Z4e	 	ddddd6e
j d;ed<ed7e)d8e!de
j fd=d>Z5d?ed@e)dAe)dBe)def
dCdDZ6dEe
j7fdFdGZ8dHe
j de
j fdIdJZ9de
j:j&dHe
j de
j fdKdLZ;de
j:j&dHe
j de
j fdMdNZ<eddddOej=j>dPeej( de
j*fdQdRZ?		ddSej@e) dTee
j( dUe!dVee) de
j*f
dWdXZA		ddSej@e) dTee
j( dUe!dVee) de
j*f
dYdZZB			dd[e
jCd\e
j d]edSe)dUe!dVee) deee
je
j f fd^d_ZD		dd[e
jCd\e
j d]edSe)dUe!deee
je
j f fd`daZE	dd[e
jCdbe
j$dUe!de
jfdcddZF		dd[e
jCdbe
j$d\ee
j  d]edSe)dUe!deee
je
j e
j f fdedfZG	dd[e
jCdge
j d]edSe)deee
je
j f f
dhdiZH		dd[e
jCdge
j d]edSe)dje!deee
je
j f fdkdlZIeddddme
j7dne
j7doe)eB fdpdqZJeddddre
j*de
j7fdsdtZKed(ddddudve
j7dwe
j7dxe
j7dyedzee d{e)ddfd|d}ZL	dd~e
j de
j de!defddZMe		ddddde
j*de
jNde
jOd~e
j de
j de!de!defddZPd'edejQjRfddZSe
j/de
j&de
j dge
j de
j dededefddZTe
j/de
j&de
j dge
j de
j dededefddZUe
j/	(ddre
j*de
j dge
j de
j dede)d{e)defddZVdS )    )OptionalTypeTupleCallableSequence)partialN)Int32Int16Boolean
const_expr)cpasyncwarp	warpgroup)CtaGroup)dsl_user_op)llvm)ir)
cute_nvgpul   } F)predretilelocip
tiled_copysrcdstr   r   returnc          	      K   s   t |jtjr|jtjjksJ t|j|jkr,t	||j}|
| |j |}t|r5| |}tj| ||f|||d| d S )Nr   r   r   )
isinstanceiteratorcutePointermemspaceAddressSpacermemr   element_typemake_rmem_tensor_likestoreloadtor   copy)	r   r   r   r   r   r   r   kwargssrc_cvt r,   F/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/copy_utils.pycvt_copy   s    
"r.   r   r   c                C   s*   t j| | j||d}t j| |||d |S Nr/   )r   r%   r$   autovec_copy)r   r   r   r   r,   r,   r-   load_s2r,   s   r2   	dst_shapec                C   sJ   t t|tj rtj||j||d}n|}tj| || |||d |S r0   )r   r   r   Tensormake_rmem_tensorr$   r)   r   )r   r   r3   r   r   r   r,   r,   r-   load_s2r_retile3   s
   
r6   thr_copyshapec                C   s>   t |}t j| |j|j||d}t j| ||||d |S r0   )r   make_identity_tensorr5   partition_Dr8   r$   r)   )r7   r8   r   r   r   cDstr   r,   r,   r-   load_t2rE   s   
r<   dtypenum_copy_elemsis_asyncc                C   s:   t td|| j }|rt ntj }tj|| |dS )N   num_bits_per_copy)	r   minwidthr   	CopyG2SOpr   nvgpuCopyUniversalOpmake_copy_atom)r=   r>   r?   r   r   num_copy_bitscopy_opr,   r,   r-   get_copy_atomO   s   rK   )r   r?   r   r   c          	      K   s>   | j d d }t| j||}tj|| |f|||d| d S )Nr   r   )r8   rK   r$   r   r)   )	r   r   r   r?   r   r   r*   r>   	copy_atomr,   r,   r-   r)   X   s   "r)      num_threadsc           	      C   sR   || j  }|rt ntj }tj|| |d}t|}t|}t|||S )NrA   )	rD   r   rE   r   rF   rG   rH   make_layoutmake_tiled_copy_tv)	r=   rN   r>   r?   rI   rJ   rL   
thr_layout
val_layoutr,   r,   r-   tiled_copy_1dh   s   


rS   threads_per_rowc           
      C   sr   || j  }|rt ntj }tj|| |d}|| dks J tj|| |fdd}td|f}	t	|||	S )NrA   r   )rM   r   )orderrM   )
rD   r   rE   r   rF   rG   rH   make_ordered_layoutrO   rP   )
r=   rT   rN   r>   r?   rI   rJ   rL   rQ   rR   r,   r,   r-   tiled_copy_2ds   s   

rW   tAcAlimitc              	   C   s   t t jt j| ddgdt j| dgdt j| dgdft j| dgdddfdt}t|jd D ]!}t|jd D ]}t | d|fd|f d |||d|f< q<q2|S )Nr   rM   mode   stride)	r   r5   rO   sizer
   cutlassrange_constexprr8   	elem_less)rX   rY   tApArest_vrest_kr,   r,   r-   predicate_k   s   .*rf   i   @il            T
ragged_dim	ptr_shiftc                C   sB  t | }|dk r||7 }|rX|dksJ d| jd | tf | j|d d   tf }| j| j| f }d| t f d|| d   }t || j}	t |	t j	||dS |dks`J d| j| }
| jd | tf | j|d d   ttf }| jd | |
f | j|d d   t
|
 |
f }t | jt j	||dS )	Nr      z8ptr_shift ragged tensor only supports up to 4 dimensionsrM   Nr]      z<non-ptr_shift ragged tensor only supports up to 3 dimensions)r   rankr8   BIG_INTMAX_INTr^   domain_offsetr   make_tensorrO   BIG_INT_INV)rg   rh   ri   r   r   rm   	new_shape
new_stride
ptr_offsetnew_ptrstride_rr,   r,   r-   create_ragged_tensor_for_tma   s.   
	, 
,
rx   offsetlengthc                C   s   t | }|dk r||7 }t j| |gd}|| }	|r>||d ks#J d| |	f d|| d   }
d|d  || f }n#||d ksFJ d| |	f d|| d   }
d|d  ||| f }t |
| | S )Nr   rZ   r\   rk   rM   rl   )r   rm   r_   rp   )rg   ry   rz   rh   ri   r   r   rm   big_int
offset_valoffset_tupleindex_tupler,   r,   r-   offset_ragged_tensor   s   
r   ptr_intbmsc                 C   s(   d|> d }||| > }| | |@ |? A S )NrM   r,   )r   r   r   r   bit_mskyyy_mskr,   r,   r-   swizzle_int  s   r   ptrc                 C   s8   | j j}t|  |j|j|j}tj| j	|| j
| jdS )N)assumed_align)typeswizzle_typer   tointnum_bitsnum_base	num_shiftr   make_ptrr=   r!   	alignment)r   swzr   r,   r,   r-   swizzle_ptr  s   r   tensorc                 C   sh   | j }| jj}| jjj}t|j|j	|j
}t|dt|dtd||}ttj| j| jd|S )N   r   )r=   )layoutr$   rD   r   r   r   r   make_swizzler   r   r   recast_layoutmake_composed_layoutrq   
recast_ptr)r   outerrD   r   inner
new_layoutr,   r,   r-   &as_position_independent_swizzle_tensor  s   
r   c                 C   $   t t| |j| t|jS rk   )r   rq   r   r:   r   r   r   r7   r   r,   r,   r-    partition_D_position_independent)     r   c                 C   r   rk   )r   rq   r   partition_Sr   r   r   r   r,   r,   r-    partition_S_position_independent2  r   r   layout_c	elem_ty_cc                C   s`   t |tjjstd| |  }|jdkr$tjt	
|d|||dS tjtj |||dS )a  
    Selects the largest vectorized smem load atom available subject to constraint of gmem layout.

    Parameters:
    -----------
    layout_c : LayoutEnum
        The layout enum of the output tensor D.

    elem_ty_c : Type[Numeric]
        The element type for output tensor D.

    Returns:
    --------
    Either SmemLoadMatrix or SimtSyncCopy, based on the input parameters.
    z%elem_ty_c must be a Numeric, but got    rj   r/   )r   r`   cutlass_dslNumericMeta	TypeErroris_m_major_crD   r   rH   r   LdMatrix8x8x16bOprF   rG   )r   r   r   r   
is_m_majorr,   r,   r-   sm90_get_smem_load_op;  s   
r   archr$   	transposemajor_mode_sizec                 C   |   t | dk p	|jdkrtjtj ||sdnd|j dS |d u s'|d dkr)dn	|d dkr1dnd}ttj||d	|S 
NZ   r   r\   rM   rA   r   rj   r   )r   num_matrices)r   rD   r   rH   rF   rG   r   StMatrix8x8x16bOpr   r$   r   r   r   r,   r,   r-   get_smem_store_atom\     r   c                 C   r   r   )r   rD   r   rH   rF   rG   r   r   r   r,   r,   r-   get_smem_load_atomt  r   r   	tiled_mmasCtidxc                    sv   |j }t||||d}t|| |}	t| r!|	| nt|	| ddtjdt	t
 f fdd}
|
|	 fS )N)r   r   dst_idxc                    s<   t |d u r n d d d |f }t| |fddi| d S Nr   T)r   r.   )r   r   
new_kwargs
dst_tensortRS_sCr   r,   r-   copy_fn  s    z!get_smem_store_C.<locals>.copy_fnrk   )r$   r   r   make_tiled_copy_C	get_slicer   r:   r   r4   r   r   )r   r   r   r   r   position_independentr   r=   rL   r7   r   r,   r   r-   get_smem_store_C  s   	


 
r   c                    s   |j }t|||}t|| |}t| r||nt||t|||}	t|	| |}
|
t	|j
d d j
 ddtt f fdd}||fS )Nr\   src_idxc                    s6   t | d u rnd d d | f }t|fd i|S Nr3   )r   r6   )r   r   
src_tensor	tRS_shapetSR_sCr   r,   r-   r     s    z get_smem_load_C.<locals>.copy_fnrk   )r$   r   r   r   r   r   r   r   r   r9   r8   r   r   )r   r   r   r   r   r   r=   rL   r7   copy_atom_RSthr_copy_RSr   r,   r   r-   get_smem_load_C  s   



r   epi_tilec                 C   s<   t tj||d d dkrdnddtj}t || }|S )NrM   r   r   rj   r\   )r   )r   rH   r   r   r`   Float16make_tiled_copy_C_atom)r   r   r   copy_atom_Ctiled_copy_C_atomr,   r,   r-   epilog_smem_copy_atom  s    r   c                    s   t |d ur	|jntj}t| |}t|||}	t|	||}
d  t |d ur:t | r5|
	| nt
|
| |d urE|jd d n|}|
t|j}t|| jj}dtjdtf fdd}t |d urm|nd |
 |fS )Nr\   r   r   c                    s$   t |  d d d |f fi | d S rk   r.   r   r   r   r   r,   r-   r     s   $z#get_smem_store_epi.<locals>.copy_fn)r   r$   r`   r   r   r   r   make_tiled_copy_Sr   r:   r   r8   r   r9   r5   op	acc_dtyper4   r   )r   r   r   r   r   r   r   r=   r   rL   r7   sC_shapetRS_rC_shapetRS_rCr   r,   r   r-   get_smem_store_epi  s   	



r   sAc           
         s|   |j }| jjtjjk}t|||}t|| 	|}t
| r'|| nt|| dtjdtf fdd}	|	| fS )Nr   r   c                    s(   t |  d d d |f fddi| d S r   r   r   tRS_sAr   r,   r-   r     s   (z!get_smem_store_A.<locals>.copy_fn)r$   r   a_major_moder   OperandMajorModeMNr   r   make_tiled_copy_Ar   r   r:   r   r4   r   )
r   r   r   r   r   r=   r   rL   r7   r   r,   r   r-   get_smem_store_A  s   



r   with_dst_tensorc                    s   |j }| jjtjjk}t|||}t|| 	|}	t
| r'|	|nt|	|| |jd d  dtf fdd}
dtdtjffdd}|sU|
|	fS ||	fS )Nr\   r   c                    s"   t d d d | f fd i|S r   r6   )r   r   r   tSR_sAr   r,   r-   r     s   z get_smem_load_A.<locals>.copy_fnr   c                    s    t  d d d | f |fi |S rk   r   )r   r   r   )r   r   r,   r-   copy_fn_w_dst_tensor  s    z-get_smem_load_A.<locals>.copy_fn_w_dst_tensor)r$   r   r   r   r   r   r   r   r   r   r   r   r   partition_shape_Ar8   r   r4   )r   r   r   r   r   r   r=   r   rL   r7   r   r   r,   r   r-   get_smem_load_A  s   


r   smem_ptrgmem_ptrstore_bytesc             	   C   sB   | j ||d }tjd |j|t| gddddtjjd d S )Nr/   zJcp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [$0], [$1], $2;zl,r,rTF)has_side_effectsis_align_stackasm_dialect)r   ir_valuer   
inline_asmllvm_ptrr   
AsmDialectAD_ATT)r   r   r   r   r   smem_ptr_i32r,   r,   r-   cpasync_reduce_bulk_add_f32  s   	
r   tma_atomc                C   s2   t j| jj||d}tjd}t j||||dS )aw  
    Get the address of the TMA descriptor embedded in a TMA Copy Atom.

    Extracts the constant memory address of the TMA descriptor for use with
    custom PTX instructions.

    :param tma_atom: TMA Copy Atom from make_tiled_tma_atom
    :return: Pointer to TMA descriptor in constant memory

    Example:
        >>> desc_ptr = get_tma_descriptor_address(tma_atom)
    r/   z@!cute.ptr<!cute_nvgpu.tma_descriptor_tiled, generic, align<128>>)_cute_nvgpu_iratom_make_exec_tma_traitvaluer   r   parseget_tma_desc_addr)r   r   r   	exec_atomtma_desc_ptr_typer,   r,   r-   r  4  s
   r  )num_ctamulticast_maskr   r   tma_desc_ptrdst_smem_ptrmbarrier_ptrcol_idxrow_indicesr  c                C   s   t |dkrtdt | t| }	dd |D }
| j||d }|j||d }|j||d}|dkr=|t@ }| }d}|durMt| }|du sUJ dd	| d
}tjd|||	|
d |
d |
d |
d |g|dddtj	j
||d	 dS )a!  
    Perform TMA gather4 load from global memory to shared memory.

    Issues PTX instruction:
    cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes
        [dstMem], [tensorMap, {col_idx, row0, row1, row2, row3}], [smem_bar];

    This loads 4 rows (specified by row_indices) from a 2D tensor at the given
    column index into shared memory, using the TMA descriptor.

    :param tma_desc_ptr: Pointer to TMA descriptor in constant memory (128-byte aligned)
    :type tma_desc_ptr:  Pointer
    :param dst_smem_ptr: Destination address in shared memory
    :type dst_smem_ptr:  Pointer
    :param mbarrier_ptr: Pointer to mbarrier in shared memory for completion tracking
    :type mbarrier_ptr:  Pointer
    :param col_idx:      Column index
    :type col_idx:       Int32
    :param row_indices:  Sequence of exactly 4 row indices
    :type row_indices:   Sequence[Int32]
    :param num_cta:      Number of CTAs participating (default: 1)
    :type num_cta:       int
    :param multicast_mask: Optional multicast mask
    :type multicast_mask: Int16

    Requirements:
        - row_indices must contain exactly 4 elements
        - Compute capability >= SM_100 (Blackwell)
        - TMA descriptor must be properly initialized for 2D tensor

    Example:
        >>> from cutlass.cute.nvgpu import cpasync
        >>> from cutlass.cute import core
        >>>
        >>> # Create TMA descriptor
        >>> tma_atom, tma_tensor = cpasync.make_tiled_tma_atom(...)
        >>> tma_desc_ptr = get_tma_descriptor_address(tma_atom)
        >>>
        >>> # Compute indices (typically from kernel logic)
        >>> col_idx = core.get(...) or 5  # Int32 value
        >>> row_indices = [core.get(...) for _ in range(4)]  # 4 Int32 values
        >>>
        >>> # Gather 4 rows at computed column
        >>> tma_gather4_load(
        ...     tma_desc_ptr=tma_desc_ptr,
        ...     dst_smem_ptr=smem_ptr,
        ...     mbarrier_ptr=barrier_ptr,
        ...     col_idx=col_idx,
        ...     row_indices=row_indices
        ... )
    rj   z,gather4 requires exactly 4 row indices, got c                 S   s   g | ]}t | qS r,   )r   r   ).0row_idxr,   r,   r-   
<listcomp>      z$tma_gather4_load.<locals>.<listcomp>r/   rM   Nzmulticast is not supported yetzacp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::z( [$0], [$1, {$2, $3, $4, $5, $6}], [$7];r   r\   rl   zr,l,r,r,r,r,r,rTF)r   r   r   r   r   )len
ValueErrorr   r   r   Sm100MmaPeerBitMaskr	   r   r   r   r   )r  r  r	  r
  r  r  r  r   r   col_valrow_vals	desc_addrdst_addr	mbar_addrmulticast_mask_valptxr,   r,   r-   tma_gather4_loadI  sF   @


r  r   r   single_stagec                    s   t t| |s	dnd }t t||sdnd }t| d|t|d| dtjf fdd}dtjf fdd}t | rG|S |S )NrM   r   tma_bar_ptrc                    sp   t t j}t j   t j|d | f  d |f fd|i| W d    d S 1 s1w   Y  d S Nmbar_ptrr   rH   r   CopyBulkG2SOpr$   r   	elect_oner)   )r   r   r  r   atomr   r*   r   r,   r-   	copy_bulk  s   

"z+cpasync_bulk_get_copy_fn.<locals>.copy_bulkc                    s`   t t j}t j  t j| fd| i| W d    d S 1 s)w   Y  d S r  r  )r  r   r"  r#  r,   r-   copy_bulk_single_stage  s    "z8cpasync_bulk_get_copy_fn.<locals>.copy_bulk_single_stage)r   r   rm   group_modesr    )r   r   r  r*   group_rank_srcgroup_rank_dstr$  r%  r,   r#  r-   cpasync_bulk_get_copy_fn  s   r)  r"  	cta_coord
cta_layoutfilter_zerosc             
      s*  t t|jtjo|jtjjk}
|
r||fn||f\}}t t||s%dnd }t t||s2dnd }t	j
 ||t|d|t|d|||d\}}t |r\t|}t|}|
rb||fn||f\td d d fdd
}td d d fdd
}t | r|||fS |||fS )NrM   r   r/   c                   s8   t j d | f d |f fi |||d d S r0   r   r)   )r   r   r   r   r   r"  r   r*   r   r,   r-   copy_tma  s   z!tma_get_copy_fn.<locals>.copy_tmac                    s(   t j fi || |d d S r0   r-  )r   r   r   r.  r,   r-   copy_tma_single_stage  s   (z.tma_get_copy_fn.<locals>.copy_tma_single_stage)r   r   r   r   r    r!   r"   smemrm   r   tma_partitionr&  r,  r   )r"  r*  r+  r   r   r,  r  r   r   r*   src_is_smemsmem_tensorgmem_tensorgroup_rank_smemgroup_rank_gmemr   gr/  r0  r,   r.  r-   tma_get_copy_fn  s2   
	

r9  pipelinec                    s   dt jjf fdd}|S )Nproducer_statec                    s"    d| |j |d| d S )N)r   r   r  r,   )indexproducer_get_barrier)r   r;  r   r)   r:  r,   r-   r     s   
z%tma_producer_copy_fn.<locals>.copy_fn)r`   r:  PipelineState)r)   r:  r   r,   r>  r-   tma_producer_copy_fn
  s   r@  
thr_copy_AmAgsAIdxlimit_mlimit_kc                    s  t j|dgdt j|dgdf
|		jd dksJ t t 	ddd	d 
jd j dkt rAt|d }t 	jd d t 	}

|
d
||d d  }d d  tt jjdgd}tt jjdgd t |ttj|ddD ]}d|df d |k |< qt |ttj|ddD ]}d|df d }	| r||	 |< qd|< qt |d d fdd	tf 	
fd
d}
|
S )Nr   rZ   rM   r\   )NNr   NTunroll_fullFr   c           
         s
  d }t |r+t t}| d   }tj ddD ]}dd|f d |k ||< qd d | ff }tjd D ]G}ttj	|| d f dddfd }t sZ| rtj
jdgddksgJ d	 d  }	tj
|d |	f 	d |f|f |d
 q;d S )NrM   TrF  r   r\   
up_to_rankNNr   rZ   )r   r   r   r   )r   r   r5   r
   r`   rangera   r8   tiled_divideappend_onesr_   r)   )
r   r   r   tApA_klimit_k_curkmA_curr   mA_rowkicols_per_threadelems_per_loadis_even_m_smemrE  mA_km_idxt0AcArX   tApA_mtAsArA  tile_shape_mkr,   r-   r   A  s&   &z%gather_m_get_copy_fn.<locals>.copy_fnF)r   r_   r:   r8   r&  slice_tiler_mnr   rC   r9   r   r   r5   r
   r`   rL  r   logical_dividebool)rA  rB  r   rC  rD  rE  cArows_per_threadr   r  r   r,   rU  r-   gather_m_get_copy_fn  s6    	




*rf  c                    sJ  d\t |jtjjkr|n|jtjjksJ |tj|dgdtj|dgdf
	|tdd
d 	j	d j
 dk}t | rOt|
d }tj
d d }t
}	|	d||d d  }d d  t tjj
dgd}	t tjj
dgd t|	ttj|	ddD ]}
d|
df d |k |
< qt 	j	d j
| }	j}tt||f||fd || d fd f dd
tdttjtjf f 
fdd}		dd
tdttjtjf f 
fdd}		ddttjtjf d
tf	fdd}|t d ur"|fS |fS )N)NNr   rZ   rM   rl   r\   TrF  Fr   r   c                    s   d }t |r+t t}| d   }tj ddD ]}dd|f d |k ||< qd | f }t t}t D ]&}dd|f d }t | rS|| ||< q<|| r^|| ||< q<d||< q<||fS )NrM   TrF  r   )r   r   r5   r
   r`   rL  r   )r   r   rO  rP  rQ  	gAIdx_curk_idxr
  )rV  gAIdxrE  r[  rX   r^  r,   r-   prefetch_from_gmem_fn  s    

z3gather_k_get_copy_fn.<locals>.prefetch_from_gmem_fnc                    s   d }t |r+t t}|d   }tj ddD ]}dd|f d |k ||< q| | d |f }t t}	t D ]}dd|f d }
||
 |	|< qAtj	  tj
  | | W d    |	|fS 1 spw   Y  |	|fS )NrM   TrF  r   )r   r   r5   r
   r`   rL  consumer_waitr   r   	sync_warpr!  consumer_release)a_prefetch_pipeliner   r   a_prefetch_consumer_stater   rO  rP  rQ  	sAIdx_curri  r
  )rV  rE  sAIdxr[  rX   r^  r,   r-   prefetch_from_smem_fn  s&   


z3gather_k_get_copy_fn.<locals>.prefetch_from_smem_fnk_idx_tApA_kc           	   	      s   |\}}d }t |rtj|dd}t jd D ]4}t jd D ])}| rLtjd ||| f d ||f|f t |d u rDd n|d |f d q#qd S )Nr\   rH  rM   rK  )r   r   prepend_onesr`   ra   r8   r)   )	r   r   rt  r   ri  rO  tApA_k_predrQ  r   )rX   tAmAr\  r]  rA  r,   r-   r     s    z%gather_k_get_copy_fn.<locals>.copy_fnr_  )r   r!   r   r"   gmemr1  r_   r:   r&  ra  r8   rC   r9   r   r   r5   r
   r`   rL  thr_idxrb  flat_dividerc  r   r4   )rA  rB  r   rC  rD  rE  rX  rW  rd  re  r   threads_per_colr   rk  rs  r   r,   )rV  rj  rE  rr  r[  rX   rw  r\  r]  rA  r^  r-   gather_k_get_copy_fnZ  s^   	 



.r|  rr  warp_idx	num_warpsc                    s   t j|dgd}t |d | |d dksJ |}t t jt j tddt |t d}	|	|}
|
	|}|
	|t
| t| }tt||ddt jf fd	d
}|S )Nr   rZ   rJ  rj   r@   rA   )r  r  c              	      s   |  }t jtjdgdddD ]3  fddtdD }d  d |f j}tj  |||| W d    n1 s?w   Y  qd S )NrM   rZ   TrF  c                    s   g | ]}| f qS r,   r,   )r  v)r   	tSR_rAIdxr,   r-   r    r  z=gather_m_get_tma_copy_fn.<locals>.copy_fn.<locals>.<listcomp>rj   )r`   rL  r   r_   r   r   r!  )r   r   r  r
  r  r   r  r   tile_Ktma_gather4_load_fn)r   r-   r     s   z)gather_m_get_tma_copy_fn.<locals>.copy_fn)r   r_   rP   rH   rF   rG   r   rO   r   r   r2   r  r   r  r    )r   rB  r   rr  r}  r~  r  tile_M	cta_groupcopy_AIdx_s2rwarp_copy_AIdx_s2r	tSR_sAIdxr  r   r,   r  r-   gather_m_get_tma_copy_fn  s"   



r  r_  )rM   F)r   F)FN)FFN)FF)rM   )Wtypingr   r   r   r   r   	functoolsr   r`   cutlass.cuter   r   r	   r
   r   cutlass.cute.nvgpur   r   r   cutlass.cute.nvgpu.tcgen05.mmar   cutlass.cutlass_dslr   cutlass.pipelinecutlass._mlir.dialectsr   cutlass._mlirr   r   r   r  	TiledCopyr4   rc  r.   r2   Shaper6   ThrCopyr<   NumericintCopyAtomrK   r)   rS   rW   jitrf   rn   ro   rr   rx   r   r   r    r   r   corer   r   utils
LayoutEnumr   	Constexprr   r   TiledMmar   r   r   r   r   r   r   r  r  r)  CoordLayoutr9  r:  PipelineAsyncr@  rf  r|  r  r,   r,   r,   r-   <module>   s0  
 
		

D!	
	
	#





 

 r
 	0Co