o
    پi%4                     @   s  d dl mZmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlZd dlmZ d dlmZ d dl m!Z! d dlm"Z"m#Z#m$Z$m%Z% d dl&m'  m(Z) d dl*m'  m+Z, d dl-m.Z. G dd de	eZ/G dd de/eZ0G dd de/eZ1					d*dededee dee dee de2de2d e2d!e2d"e3d#e3d$e2d%e4eB d&e4eB d'dfd(d)Z5i e_6dS )+    )TupleOptionalCallable)partial)Tensor)GemmActMixin
act_fn_mapgemm_act)GemmSm90)	GemmSm100TriangularTileScheduler)GemmWrapperBase)get_device_capacityget_max_active_clusters)VarlenManagerN)make_ptr)Int32Float32Boolean
const_expr)if_generatec                5   @   s   e Zd Zd"defddZejdejde	ej
df deeej  d	ejjd
ejjdejjdejjdejdedej
deej
 deej dejdej
deej deej
 deej
 dee dee dejdedejjdedede	ejjejjf f2dd Zd!S )#GemmSymmetricMixinFvarlen_mc                 C   s   t S Nr   )selfr    r   H/home/ubuntu/.local/lib/python3.10/site-packages/quack/gemm_symmetric.pyget_scheduler_class   s   z&GemmSymmetricMixin.get_scheduler_classparamsepi_smem_tensors.tma_desc_epi_ptrsepi_pipelineepi_store_pipelineepi_read_stateepi_producer_stateepi_tileload_acc_subtiletRS_rDtRS_rCtiled_copy_t2rtiled_copy_r2stRS_sDtiled_copy_s2rtSR_rCtSR_sCcopy_Dcopy_Ctile_coord_mnklvarlen_managerepilogue_barriertidxis_tma_warpreturnc           6         s  t |d u}t  d u|j}|j}|\}}}jdkr#ttj|dntj} | j	j
j}!t|!|}"|"||}#|\}$|d }%j||||%j|j|||$d\}&}&ttjd d |jd }'tj|'|'d dfd}(t|'})|j|) }*||||||||	}+t |d urtjt|)jddD ]},|(|,}-r| | ||-|d	 |!| |"  q fd
d}.d}/d\}0}1t#|)D ]},|(|,}2|	|
|, $||+|2}3t |r/|%| t&||d d d |j'f | tjj(tjj)j*tjj+j,d tj-  tj.  |/| W d    n	1 s&w   Y  |"  t |d uo;|,j |)k r]|(|,j }-rY| | ||-|d	 |!| |"  0||3|
|}4|*|, j1 }5t |/rt |,dkr|.|0|1|d |5|2}0}1t rt23||
|d d d |5f  t&|"|"4|4|#d d d |5f  t |/ r|.|5|2|d qt |/r|.|0|1|d 5||+|||||| ||fS )Nd   )tiled_tmem_load   )tma_desc_ptr      )stride)unroll)src_idxproducer_statec                    s   |d }|d }t jjt jjjt jjjd   r>|jd  }|jd  }t	r4 | |d ||kr>| |d t
fdd t
fdd   d S )Nr   r=   space)r@   dst_idxc                            S r   )producer_commitr   r#   r   r   <lambda>       zCGemmSymmetricMixin.epilogue.<locals>.tma_store_fn.<locals>.<lambda>c                      rE   r   )producer_acquirer   rG   r   r   rH      rI   )cutearchfence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctaarrive_and_waitcluster_shape_mnkr   r   )r@   rD   r2   pid_mpid_nsquare_tile_msquare_tile_nr0   copy_postactr#   r4   has_Dr6   r   r   r   tma_store_fnr   s    z1GemmSymmetricMixin.epilogue.<locals>.tma_store_fnT)NNrB   r   )r@   rD   r2   )6r   tma_atom_postactmPostAct_mnlrL   r   sm100_utilsget_smem_store_opsm90_utils_ogsm90_get_smem_store_oppostact_layoutpostact_dtype	acc_dtyperK   make_tiled_copy_S	get_slicepartition_Depilog_gmem_copy_and_partitionoffset_batch_epicta_tile_shape_postact_mnepi_tile_postactzipped_dividemake_layoutcta_tile_shape_mnkshapesizenum_tiles_executed	epi_begincutlassrangeminepi_c_stageget_hier_coordrJ   rF   advancerange_constexprepi_begin_loopconsumer_waitcopyindexrM   rN   rO   rP   rQ   	sync_warp	elect_oneconsumer_releaseepi_visit_subtile	epi_stage
copy_utilscvt_copyretileepi_end)6r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   tile_schedulerr5   r6   has_Cr\   r]   sRowVecsColVecsPostActr_   copy_atom_postact_r2stiled_copy_postact_r2stRS_sPostActtma_desc_postact_ptr	batch_idx_epi_tile_shapeepi_tile_layoutepi_tile_numnum_prev_subtilesepi_tensorsepi_idxgmem_coord_Cr[   delay_tma_storesrc_idx_prevdst_idx_prev
gmem_coordepi_loop_tensorstRS_rPostAct
epi_bufferr   rX   r   epilogue   s   



















zGemmSymmetricMixin.epilogueN)F)__name__
__module____qualname__boolr   rK   jitr   EpilogueParamsr   r   listr   Pointerrs   pipelinePipelineAsyncPipelineStateTiler   	TiledCopyCoordr   NamedBarrierr   r   r   r   r   r   r   r      sl    	
r   c                   @      e Zd ZdS )GemmSymmetricSm90Nr   r   r   r   r   r   r   r          r   c                   @   r   )GemmSymmetricSm100Nr   r   r   r   r   r      r   r   FT         ?ABDCtile_count_semaphoretile_Mtile_N	cluster_M	cluster_Npingpong
persistentmax_swizzle_sizealphabetar7   c           %      C   sx  |j }tj| |||d|id\}}}}}||ksJ dt| t| dddddd}t|| t| j}|d d	v sCJ d
|d dkrKtnt	}t
}||f}||df}||d j|d j||d j|d j|d jsutd|
r}t|| nd}tdd | D | dttB fdd}d }t| }||d j|||||}tj|||d}d } t }!tj|||||	|
|d u||t|trdn|dkrdndt|trdn|dkrdnddd}"tj}#|"|#vr!|d dkrt||	|
d}|||d j||dd}$t |$|d j|d j|d j|d j||| |!	|#|"< |#|" |d j|d j|d j|d j||| |! d S )NPostAct)additional_tensorszFM and N must be the same; symmetric gemm only supports square matrices)mkl)nr   r   )r   r   r   )r   r   r   r   r   r   )	   
   z!Only SM90 and SM100 are supportedr   r=   r   r   r   z;Skipping due to unsupported combination of types and majorsc                 S   s   i | ]\}}||qS r   r   ).0r   vr   r   r   
<dictcomp>  s    z"gemm_symmetric.<locals>.<dictcomp>scalarc                 S   sD   t | tr| dkrt| S d S t | tsJ tt|  tjjddS )Nr      )assumed_align)	
isinstancefloatr   r   r   data_ptrrK   AddressSpacegmem)r   r   r   r   
scalar_arg  s   
z"gemm_symmetric.<locals>.scalar_arg)r   r<   r   )r   r   r   r   r   )key_tensor_names)r   is_persistentF)gather_Ar   )!mTr   validate_and_prepare_tensorspermute_tensorsextract_dtypesdetermine_major_ordersr   devicer   r   r   is_valid_dtypesdtypemajor	TypeErrorr   create_cute_tensorsitemsr   r   r   EpilogueArgumentscute_tensorcreate_scheduler_argscutlass_torchcurrent_streamget_compile_keyr   r	   compile_cacher   rK   compile)%r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   LMKNtensor_infosmajor_configsdevice_capacityGemmClsrd   tile_shape_mnrS   max_active_clustersr   
activationact_fnepi_argsscheduler_argsvarlen_argsr   compile_keycachegemm_objr   r   r   gemm_symmetric   s   




r  )FTr   r   r   )7typingr   r   r   	functoolsr   torchr   quack.gemm_actr   r   r	   quack.gemm_sm90r
   quack.gemm_sm100r   quack.tile_schedulerr   quack.gemm_wrapper_utilsr   quack.cute_dsl_utilsr   r   quack.varlen_utilsr   quack.copy_utilsr   rs   cutlass.cuterK   cutlass.torchr   cutlass.cute.runtimer   r   r   r   r   cutlass.utils.hopper_helpersutilshopper_helpersr`   cutlass.utils.blackwell_helpersblackwell_helpersr^   cutlass.cutlass_dslr   r   r   r   intr   r   r  r   r   r   r   r   <module>   sx     5	


x