o
    پi.                     @   s   d dl mZmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dlm
Z
mZmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d dlmZ d dlmZ d d
lmZ G dd dZG dd deeZG dd deeZdS )    )OptionalTuple)partial)	dataclassN)Int32Float32Boolean
const_expr)ArgumentsBase
ParamsBase)GemmSm90)	GemmSm100)partition_for_epilogue)VarlenManagerc                   @   sV  e Zd ZU dZeed< eG dd deZeG dd de	Z
dddd	ed
e
fddZejde
deejdf dejdeej dejdejdedejjdefddZde
dejfddZej	d(de
deejdf dejdeej d
eej f
dd Zed	ee d!eeeef dejd
efd"d#Zde
fd$d%Zde
d
eejdf fd&d'ZdS ))GemmDefaultEpiMixinr   num_epi_tensormapsc                   @   sj   e Zd ZU dZeeejB  ed< dZ	eeejB  ed< dZ
eej ed< dZeej ed< dZeed< dS )z%GemmDefaultEpiMixin.EpilogueArgumentsNalphabetamRowVecBroadcastmColVecBroadcastFadd_to_output)__name__
__module____qualname__r   r   r   cuteTensor__annotations__r   r   r   r   bool r   r   J/home/ubuntu/.local/lib/python3.10/site-packages/quack/gemm_default_epi.pyEpilogueArguments   s   
 r    c                   @   s^   e Zd ZU dZeeejB  ed< dZ	eeejB  ed< dZ
eej ed< dZeej ed< dS )z"GemmDefaultEpiMixin.EpilogueParamsNr   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   EpilogueParams   s
   
 r!   N)locipargsreturnc                   s<   dd   fdd|j |jfD \}}| j|j|j||dS )Nc                    s   t  fdd jD S )Nc                 3   s4    | ]}t |st j|d  jj dn|V  qdS )    )divbyN)r   	is_staticassumeelement_typewidth).0str   r   	<genexpr>*   s
    "
zTGemmDefaultEpiMixin.epi_to_underlying_arguments.<locals>.<lambda>.<locals>.<genexpr>)tuplestrider.   r   r.   r   <lambda>*   s    zAGemmDefaultEpiMixin.epi_to_underlying_arguments.<locals>.<lambda>c              	      s6   g | ]}|d urt |jt j|j |dnd qS )Nr2   )r   make_tensoriteratormake_layoutshape)r,   r/   
new_strider   r   
<listcomp>.   s    zCGemmDefaultEpiMixin.epi_to_underlying_arguments.<locals>.<listcomp>)r   r   r   r   )r   r   r!   r   r   )selfr$   r"   r#   r   r   r   r9   r   epi_to_underlying_arguments&   s   


z/GemmDefaultEpiMixin.epi_to_underlying_argumentsparamsepi_smem_tensors.epi_tiletiled_copy_t2rtiled_copy_r2stile_coord_mnklvarlen_managerepilogue_barriertidxc
           *      C   s  d\}
}t t|do|jd urt|j}
t t|do |jd ur(t|j}|^}}}| jd | jd }}|d }| jtj	j
 }tt||d urL|n||	|d u d}d }t |jd ur|jj}t td|j|j }tj|jj||d	d
|	}|j|d f }t||f|d f}||}||}|t|}t|jd |d |  |}tdt|jd ft}tjt|jd d	dD ]}|d|f |k |d|f< qtj||||d |t |j!tj"||fdd}t |d ur|#|}d } t |j$d ur|j$j}!t td|!j|!j }tj|j$j||d	d
|	}"t |j% r0|j$|d f }#nt&|j'j(| f|j$}#t|#|f|d f}$|"|$}%|"|}&|"t|}'t|)||d |  |}(tdt|&jd ft})tjt|&jd d	dD ]}|'d|f |(k |)d|f< qtj|"|%|&|)d |t |j!tj"||fdd} t |d ur|#| } t |jd up|j$d urtj	*  tj	+d |,  |
||| fS )N)NNr   r   r         )r@   
tiled_copyrF   reference_srcr&   T)is_asyncunroll_full)pred)r   rG   r4   )rG   r   )-r	   hasattrr   utilsload_scalar_or_pointerr   cta_tile_shape_mnknum_epi_warpsr   arch	WARP_SIZEr   r   r   r*   maxr+   
copy_utilstiled_copy_1d	get_slice
local_tilepartition_Spartition_Dmake_identity_tensorminr8   make_fragmentsizer   cutlassrangecopyr5   r6   r7   retiler   varlen_mdomain_offsetr>   cu_seqlens_mlen_mcp_async_commit_groupcp_async_wait_grouparrive_and_wait)*r<   r>   r?   r@   rA   rB   rC   rD   rE   rF   r   r   sRowVecsColVecresttile_Mtile_N	batch_idxnum_epi_threadspartition_for_epilogue_fn	tDsRowVecrowvec_dtypenum_copy_elemsthr_copy_RVmRowVecgRowVectRVgRVtRVsRVtRVcRVlimit_ntRVpRVm	tDsColVeccolvec_dtypethr_copy_CVmColVecgColVectCVgCVtCVsCVtCVcCVlimit_mtCVpCVr   r   r   	epi_begin;   s   







zGemmDefaultEpiMixin.epi_begin	epi_coordc                 C   s  |\}}}}d }t |d urDt|dt|d d d |f }	t|	j|	j}
tt|	t|
 t	|
| j
}||
 | j
 d }t |d urt|dt|d d d |f }t|j|j}tt|t| t	|| j
}|| | j
 ||||fS )NrH   )r	   r   group_modesrankr_   layoutr*   autovec_copyfilter_zerosmake_fragment_like	acc_dtypestoreloadto)r<   r>   epi_tensorsr   r   r   rt   r   tDrRowVec_cvttDsRowVec_cur	tDrRowVectDrColVec_cvttDsColVec_cur	tDrColVecr   r   r   epi_begin_loop   s(   

z"GemmDefaultEpiMixin.epi_begin_loopepi_loop_tensorstRS_rDtRS_rCc                 C   s"  |\}}}}|  }	tt|do|jd ur t|j}|	|9 }	t|d urPtt|d p1|jd u r>|	|  |j7 }	nt|j}|	||  |j 7 }	|	|	 t|d urrt
jt|ddD ]}
||
  ||
 7  < qet|d urt
jt|ddD ]}
||
  ||
 7  < qd S )Nr   r   TrL   )r   r	   rO   r   rP   rQ   r   r   r*   r   ra   rb   r   r`   )r<   r>   r   r   r   r   r   r   r   rDir   r   r   epi_visit_subtile   s$   
z%GemmDefaultEpiMixin.epi_visit_subtilerR   c                 C   sp   | j d u rdn|d }| jd u rdn|d }| j d ur| j jnt}| jd ur*| jjnt}||j ||j  d S )Nr   rG      )r   r   r*   r   r+   )r$   rR   r@   row_vec_smem_sizecol_vec_smem_sizerow_vec_dtypecol_vec_dtyper   r   r   epi_smem_bytes_per_stage   s   z,GemmDefaultEpiMixin.epi_smem_bytes_per_stagec                    s~   |j d u rdn| jd |jd u rdn| jd |j d ur!|j jnt|jd ur,|jjnt tjG  fddd}|S )Nr   rG   c                       sN   e Zd ZU ejjejjf df ed< ejjejj f df ed< dS )zAGemmDefaultEpiMixin.epi_get_smem_struct.<locals>.EpiSharedStorage   rl   rm   N)r   r   r   r   structAlignMemRanger   r   r   r   r   r   r   r   EpiSharedStorage   s   
  $r   )r   rR   r   r*   r   r   r   )r<   r>   r   r   r   r   epi_get_smem_struct   s   z'GemmDefaultEpiMixin.epi_get_smem_structc                 C   s`   d }t |jd ur|jjt| jd }d }t |jd ur,|jj	t| jd }||fS )NrG   r   )
r	   r   epirl   
get_tensorr   r7   rR   r   rm   )r<   r>   storagerl   rm   r   r   r   epi_get_smem_tensors   s   z(GemmDefaultEpiMixin.epi_get_smem_tensors)N) r   r   r   r   intr   r   r
   r    r   r!   r=   r   jitr   r   Tiler   	TiledCopyCoordr   ra   pipelineNamedBarrierr   r   r   r   staticmethodr   r   r   r   r   r   r   r      st   
 
	
] r   c                   @      e Zd ZdS )GemmDefaultSm90Nr   r   r   r   r   r   r   r          r   c                   @   r   )GemmDefaultSm100Nr   r   r   r   r   r     r   r   ) typingr   r   	functoolsr   dataclassesr   ra   cutlass.cuter   r   r   r   r	   quack.cute_dsl_utilsr
   r   quack.gemm_sm90r   quack.gemm_sm100r   quack.sm90_utilsr   quack.utilsrP   quack.copy_utilsrW   quack.varlen_utilsr   r   r   r   r   r   r   r   <module>   s"    k