o
    پiII                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlZedddddejdejd	ejd
eej ddf
ddZeddddejdejfddZeddddejdejdejejB dejfddZe	dYddddeej dededejfddZeddddddejd	ejd
eej deddf
ddZ	dZdeej dedededejf
d d!Z		dZdeej d"ededededejfd#d$Zejd%ejd&e	dejfd'd(Z d)ej!deeeef fd*d+Z"d,e	d-ed.ed/ede	f
d0d1Z#d)ej!fd2d3Z$d4ejdejfd5d6Z%d7ej&j'd4ejdejfd8d9Z(d7ej&j'd4ejdejfd:d;Z)edddd<ej*j+d=eej dejfd>d?Z,	dYd@ej-e dAeej dBedejfdCdDZ.	dYdejdEej/dFej0dGejdHejdIedefdJdKZ1dedLej2j3fdMdNZ4ejdOej'dPejdQejdRejdSe	dTe	defdUdVZ5ejdOej'dPejdQejdRejdSe	dTe	defdWdXZ6dS )[    N)OptionalTypeTupleCallable)Int32Boolean
const_expr)cpasync)dsl_user_oppredlocipatomsrcdstr   returnc                K   sz   t |jtjr|jtjjksJ t|j|jkr,t	||j}|
| |j |}tj| ||f|||d| d S )Nr   )
isinstanceiteratorcutePointermemspaceAddressSpacermemr   element_typemake_fragment_likestoreloadtocopy)r   r   r   r   r   r   kwargssrc_cvt r"   D/home/ubuntu/.local/lib/python3.10/site-packages/quack/copy_utils.pycvt_copy   s    "r$   r   r   c                C   s*   t j| | j||d}t j| |||d |S Nr%   )r   r   r   autovec_copy)r   r   r   r   r"   r"   r#   load_s2r"   s   r(   
tiled_copy	dst_shapec                C   sJ   t t|tj rtj||j||d}n|}tj| || |||d |S r&   )r   r   r   Tensormake_fragmentr   r   retile)r)   r   r*   r   r   r   r"   r"   r#   load_s2r_retile)   s
   
r.   Fdtypenum_copy_elemsis_asyncc                C   s:   t td|| j }|rt ntj }tj|| |dS )N   num_bits_per_copy)	r   minwidthr	   	CopyG2SOpr   nvgpuCopyUniversalOpmake_copy_atom)r/   r0   r1   r   r   num_copy_bitscopy_opr"   r"   r#   get_copy_atom;   s   r=   )r   r1   r   r   c          	      K   s>   | j d d }t| j||}tj|| |f|||d| d S )Nr   r   )shaper=   r   r   r   )	r   r   r   r1   r   r   r    r0   	copy_atomr"   r"   r#   r   D   s   "r      num_threadsc           	      C   sR   || j  }|rt ntj }tj|| |d}t|}t|}t|||S )Nr3   )	r6   r	   r7   r   r8   r9   r:   make_layoutmake_tiled_copy_tv)	r/   rA   r0   r1   r;   r<   r?   
thr_layout
val_layoutr"   r"   r#   tiled_copy_1dT   s   


rF   threads_per_rowc           
      C   sr   || j  }|rt ntj }tj|| |d}|| dks J tj|| |fdd}td|f}	t	|||	S )Nr3   r   )r@   r   )orderr@   )
r6   r	   r7   r   r8   r9   r:   make_ordered_layoutrB   rC   )
r/   rG   rA   r0   r1   r;   r<   r?   rD   rE   r"   r"   r#   tiled_copy_2d_   s   

rJ   tAcAlimitc              	   C   s   t t jt j| ddgdt j| dgdt j| dgdft j| dgdddfdt}t|jd D ]!}t|jd D ]}t | d|fd|f d |||d|f< q<q2|S )Nr   r@   mode   )stride)	r   r,   rB   sizer   cutlassrange_constexprr>   	elem_less)rK   rL   tApArest_vrest_kr"   r"   r#   predicate_kr   s   .*rX   ptrc                 C   sb   t | jj}td|}|r*t|dt|dt|d}}}|||fS td| )ai  Extract swizzle parameters from a pointer's swizzle_type.

    The swizzle_type string has the form '!cute.swizzle<"S<b,m,s>">' where
    b, m, s are the swizzle parameters (bits, base, shift).

    Returns:
        A cute.Swizzle object constructed from the extracted parameters

    Raises:
        ValueError: If the swizzle_type string cannot be parsed
    zS<(\d+),(\d+),(\d+)>r@   rO      zCould not parse swizzle_type: )strtypeswizzle_typeresearchintgroup
ValueError)rY   swizzle_strmatchbmsr"   r"   r#   parse_swizzle_from_pointer   s   .
rh   ptr_intre   rf   rg   c                 C   s(   d|> d }||| > }| | |@ |? A S )Nr@   r"   )ri   re   rf   rg   bit_mskyyy_mskr"   r"   r#   swizzle_int   s   rl   c                 C   s8   t | \}}}t|  |||}tj| j|| j| jdS )N)assumed_align)rh   rl   tointr   make_ptrr/   r   	alignment)rY   re   rf   rg   ri   r"   r"   r#   swizzle_ptr   s   rq   tensorc                 C   sZ   | j }| jj}tjt| j }t|dt|dtd||}t	tj
| j| jd|S )N   r   )r/   )layoutr   r6   r   make_swizzlerh   r   recast_layoutmake_composed_layoutmake_tensor
recast_ptr)rr   outerr6   inner
new_layoutr"   r"   r#   &as_position_independent_swizzle_tensor   s   r}   thr_copyc                 C   $   t t| |j| t|jS N)r   rx   rq   partition_Dr   r}   rt   r~   rr   r"   r"   r#    partition_D_position_independent      r   c                 C   r   r   )r   rx   rq   partition_Sr   r}   rt   r   r"   r"   r#    partition_S_position_independent   r   r   layout_c	elem_ty_cc                C   sd   t |tjjstd| |  }|jdkr&tjtj	j
|d|||dS tjtj	 |||dS )a  
    Selects the largest vectorized smem load atom available subject to constraint of gmem layout.

    Parameters:
    -----------
    layout_c : LayoutEnum
        The layout enum of the output tensor D.

    elem_ty_c : Type[Numeric]
        The element type for output tensor D.

    Returns:
    --------
    Either SmemLoadMatrix or SimtSyncCopy, based on the input parameters.
    z%elem_ty_c must be a Numeric, but got       r%   )r   rR   cutlass_dslNumericMeta	TypeErroris_m_major_cr6   r   r:   r8   warpLdMatrix8x8x16bOpr9   )r   r   r   r   
is_m_majorr"   r"   r#   sm90_get_smem_load_op   s   
r   archr   	transposec                 C   sT   t | dk p	|jdkrtjtj ||sdnd|j dS ttjjj|dd|S )NZ   r   rO   r@   r3   r   )r   num_matrices)r   r6   r   r:   r8   r9   r   StMatrix8x8x16bOp)r   r   r   r"   r"   r#   get_smem_store_atom   s   r   	cta_coord
cta_layout
src_tensor
dst_tensorfilter_zerosc                    s   t t|jtjo|jtjjk}|r||fn||f\}}	t	 ||t
|dt|d t
|	dt|	d \}
}t |rIt|
}
t|}|rO|
|fn||
f\ fdd}||
|fS )Nr   r@   c                    s.   t j d | f d |f fi | d S r   )r   r   )src_idxdst_idx
new_kwargsr   r   r    r   r"   r#   copy_tma$  s   .z!tma_get_copy_fn.<locals>.copy_tma)r   r   r   r   r   r   r   smemr	   tma_partitiongroup_modesrankr   )r   r   r   r   r   r   r    src_is_smemsmem_tensorgmem_tensorrg   gr   r"   r   r#   tma_get_copy_fn	  s$   	


r   pipelinec                    s   dt jjf fdd}|S )Nproducer_statec                    s"    d| |j |d| d S )N)r   r   tma_bar_ptrr"   )indexproducer_get_barrier)r   r   r   r   r   r"   r#   copy_fn+  s   
z%tma_producer_copy_fn.<locals>.copy_fn)rR   r   PipelineState)r   r   r   r"   r   r#   tma_producer_copy_fn*  s   r   
thr_copy_AmAsAgsAIdxlimit_mlimit_kc                    s  t j|dgdt j|dgdf
|		jd dksJ t t 	ddd	d 
jd j dkt rAt|d }t 	jd d t 	}

|
d
||d d  }d d  tt jjdgd}tt jjdgd t |ttj|ddD ]}d|df d |k |< qt |ttj|ddD ]}d|df d }	| r||	 |< qd|< qt |d d fdd	tf 	
fd
d}
|
S )Nr   rM   r@   rO   )NNr   NTunroll_fullFr   c           
         s
  d }t |r+t t}| d   }tj ddD ]}dd|f d |k ||< qd d | ff }tjd D ]G}ttj	|| d f dddfd }t sZ| rtj
jdgddksgJ d	 d  }	tj
|d |	f 	d |f|f |d
 q;d S )Nr@   Tr   r   rO   
up_to_rank)NNr   rM   )r   r   r   r   )r   r   r,   r   rR   rangerS   r>   tiled_divideappend_onesrQ   r   )
r   r   r   tApA_klimit_k_curkmA_currf   mA_rowkicols_per_threadelems_per_loadis_even_m_smemr   mA_km_idxt0AcArK   tApA_mtAsAr   tile_shape_mkr"   r#   r   a  s&   &z%gather_m_get_copy_fn.<locals>.copy_fnF)r   rQ   r   r>   r   slice_tiler_mnr   r5   make_identity_tensorr   	get_slicer,   r   rR   r   r   logical_dividebool)r   r   r   r   r   r   cArows_per_threadrf   row_idxr   r"   r   r#   gather_m_get_copy_fn6  s6    	




*r   c                    sJ  d\t |jtjjkr|n|jtjjksJ |tj|dgdtj|dgdf
	|tdd
d 	j	d j
 dk}t | rOt|
d }tj
d d }t
}	|	d||d d  }d d  t tjj
dgd}	t tjj
dgd t|	ttj|	ddD ]}
d|
df d |k |
< qt 	j	d j
| }	j}tt||f||fd || d fd f dd
tdttjtjf f 
fdd}		dd
tdttjtjf f 
fdd}		ddttjtjf d
tf	fdd}|t d ur"|fS |fS )N)NNr   rM   r@   rZ   rO   Tr   Fr   r   c                    s   d }t |r+t t}| d   }tj ddD ]}dd|f d |k ||< qd | f }t t}t D ]&}dd|f d }t | rS|| ||< q<|| r^|| ||< q<d||< q<||fS )Nr@   Tr   r   )r   r   r,   r   rR   r   r   )r   r   r   r   r   	gAIdx_curk_idxcol_idx)r   gAIdxr   r   rK   r   r"   r#   prefetch_from_gmem_fn  s    

z3gather_k_get_copy_fn.<locals>.prefetch_from_gmem_fnc                    s   d }t |r+t t}|d   }tj ddD ]}dd|f d |k ||< q| | d |f }t t}	t D ]}dd|f d }
||
 |	|< qAtj	  tj
  | | W d    |	|fS 1 spw   Y  |	|fS )Nr@   Tr   r   )r   r   r,   r   rR   r   consumer_waitr   r   	sync_warp	elect_oneconsumer_release)a_prefetch_pipeliner   r   a_prefetch_consumer_stater   r   r   r   	sAIdx_curr   r   )r   r   sAIdxr   rK   r   r"   r#   prefetch_from_smem_fn  s&   


z3gather_k_get_copy_fn.<locals>.prefetch_from_smem_fnk_idx_tApA_kc           	   	      s   |\}}d }t |rtj|dd}t jd D ]4}t jd D ])}| rLtjd ||| f d ||f|f t |d u rDd n|d |f d q#qd S )NrO   r   r@   r   )r   r   prepend_onesrR   rS   r>   r   )	r   r   r   r   r   r   tApA_k_predr   rf   )rK   tAmAr   r   r   r"   r#   r     s    z%gather_k_get_copy_fn.<locals>.copy_fnr   )r   r   r   r   gmemr   rQ   r   r   r   r>   r5   r   r   r   r,   r   rR   r   thr_idxr   flat_divider   r   r+   )r   r   r   r   r   r   r   r   r   r   rf   threads_per_coltidxr   r   r   r"   )r   r   r   r   r   rK   r   r   r   r   r   r#   gather_k_get_copy_fnz  s^   	 



.r   r   )r@   F)7r^   typingr   r   r   r   rR   cutlass.cuter   r   r   r   cutlass.cute.nvgpur	   cutlass.cutlass_dslr
   cutlass.pipelineCopyAtomr+   r$   r(   	TiledCopyShaper.   Numericr`   r   r=   r   rF   rJ   jitrX   r   rh   rl   rq   r}   coreThrCopyr   r   utils
LayoutEnumr   	Constexprr   CoordLayoutr   r   PipelineAsyncr   r   r   r"   r"   r"   r#   <module>   sl  	 
	

 
	
	#

!C