o
    پi                     @   s  d dl mZmZmZ d dlZd dlmZ d dlm  m	Z
 d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZ e	d(ddddee d	ed
ejdee deejejf f
ddZeZeddddejdejdejdededejfddZej 		 	d)dej!dejdejdejdej"e dej"e dej"e ddfddZ#			 	d*dej!d!ej$dejdejd"ee d#ee dededejfd$d%Z%			 	d*dej!dejdejdejded"ee d#ee dededdfd&d'Z&dS )+    )TypeUnionOptionalN)	warpgroup)Numericdsl_user_op)Float32Int32Boolean
const_expr)
LayoutEnumlocipdtypelayouttilestagereturnc          
      C   s   t jt j|||d||d}| r|d n|d }tt|| || }t j|t	|d ur4t 
||n|| r;dndd}	|	S )Nr      r   )r   r      )r   r   r   )order)cuteproduct_eachshapeis_n_major_cr   make_smem_layout_atomsm90_utils_ogget_smem_layout_atomtile_to_shaper   appendis_m_major_c)
r   r   r   r   r   r   r   major_mode_sizesmem_layout_atomsmem_layout_staged r%   D/home/ubuntu/.local/lib/python3.10/site-packages/quack/sm90_utils.pymake_smem_layout   s   
r'   cTepi_tile
tiled_copytidxreference_srcc          	      C   s>   | |}t| |}t|r|j|||dS |j|||dS )Nr   )	get_slicer   flat_divider   partition_Spartition_D)	r(   r)   r*   r+   r,   r   r   thr_copycT_epir%   r%   r&   partition_for_epilogue*   s
   
r3   F	tiled_mmaacctCrAtCrB	zero_initwg_waitswap_ABc           	   
   C   s   t |rt| |||||dd d S t  t| j}|tjj	|  t
t|jd D ]}t|||d d |f |d d |f | |tjj	d q/t  t |dkr^t| d S d S )NF)r8   r9   r:   r   Tr   )r   gemmr   fencer   make_mma_atomopsetField
ACCUMULATEcutlassrange_constexprsizer   commit_group
wait_group)	r4   r5   r6   r7   r8   r9   r:   mma_atomkr%   r%   r&   r;   >   s   &r;   r   A_idxB_idxc              
   C   s   t |rt| |d d d |||||ddS t| |t}t |d u r&|n|d d d |f }	t |d u r6|n|d d d |f }
t| ||	|
d|d |S )NrI   Fr:   Tr8   r9   )r   gemm_zero_initr   make_fragmentpartition_shape_Cr   r;   )r4   r   r6   r7   rJ   rK   r9   r:   r5   rArBr%   r%   r&   rN   Y   s   
  rN   c	                 C   s~   t |rt| |||||||dd	 d S t |d u r|n|d d d |f }	t |d u r+|n|d d d |f }
t| ||	|
||d d S )NFrL   rM   )r   
gemm_w_idxr;   )r4   r5   r6   r7   r8   rJ   rK   r9   r:   rQ   rR   r%   r%   r&   rS   o   s
     rS   )N)Fr   F)NNrI   F)'typingr   r   r   rB   cutlass.cuter   cutlass.utils.hopper_helpersutilshopper_helpersr   cutlass.cute.nvgpur   cutlass.cutlass_dslr   r   r   r	   r
   r   cutlass.utilsr   TileintLayoutComposedLayoutr'   make_smem_layout_epiTensor	TiledCopyboolr3   jitTiledMma	Constexprr;   ShaperN   rS   r%   r%   r%   r&   <module>   s   			
	
