o
    پi                  -   @   s2  d dl mZ d dlmZ d dlmZ d dlmZ d dlmZ	 d dl
mZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZmZ 	
											
d&dedededee dee dedededededededee dee deeB deeB dee dee d ee d!ee d"ed#df,d$d%Zi e_dS )'    )Optional)partial)TensorN)Float32)from_dlpackmake_ptr)get_device_capacityget_max_active_clusters)GemmWrapperBase)GemmDefaultSm90GemmDefaultSm100FT         ?ABDCtile_count_semaphoretile_Mtile_N	cluster_M	cluster_Npingpong
persistentmax_swizzle_sizerowvec_biascolvec_biasalphabetacu_seqlens_mcu_seqlens_kA_idxbatch_idx_permuteadd_to_outputreturnc           +      C   s  |d up|d u}|d ur|d urJ d|d u}|r(|s J d|dks(J d|r0|
s0J d|r:|d u s:J d|d urT|  ddksIJ d| ddksTJ d	|d urn|  d
dkscJ d| d
dksnJ dtj| ||||||d\}}}}}tj||d u|d ud t| ddddd}t|| t| j}|d dv sJ d|d dkrtnt	}t
}||f} ||df}!||d j|d j||d j|d j|d jstd|
rt|| nd}"t|| dttB fdd}#|j|#||#||d urt| ddjdd nd |d ur)t| ddj|d u r%dndd nd |d!}$t|"|||}%t||||"|!||j|	}&t }'tj|d | |!|	|
|d u|||d ur\|jnd |d ure|jnd t|trnd"n|d#krudndt|tr~d"n|d#krdnd||d u|d u||d udd$}(tj})|(|)vr|d dkrt ||	|
d%}|||d j| |!|d&}*t!"|*|d j#|d j#|d j#|d' j#|$|%|&|'	|)|(< |)|( |d j#|d j#|d j#|d' j#|$|%|&|' d S )(Nz:Only one of cu_seqlens_m and cu_seqlens_k can be specifiedzIgather_A requires varlen (cu_seqlens_m or cu_seqlens_k must be specified)   zgather_A requires cluster_N=1zvarlen requires persistent=Truez)Add to output not supported with varlen_mz!varlen_m requires A to be k-majorz!varlen_m requires D to be n-majorz!varlen_k requires A to be m-majorz!varlen_k requires B to be n-major)r   r    r!   )varlen_mvarlen_k)mkl)nr+   r,   )r*   r-   r,   )r   r   r   r   r   )	   
   z!Only SM90 and SM100 are supportedr.   r   r   r   z;Skipping due to unsupported combination of types and majorsscalarc                 S   sD   t | tr| dkrt| S d S t | tsJ tt|  tjjddS )Nr      assumed_align)	
isinstancefloatr   r   r   data_ptrcuteAddressSpacegmem)r0    r:   >/home/ubuntu/.local/lib/python3.10/site-packages/quack/gemm.py
scalar_arg^   s   
zgemm.<locals>.scalar_argr1   r2   )leading_dim)mRowVecBroadcastmColVecBroadcastr#      r   )key_tensor_names)r   is_persistent)gather_Ar   )$strider
   validate_and_prepare_tensorspermute_tensorsextract_dtypesdetermine_major_ordersr   devicer   r   r   is_valid_dtypesdtypemajor	TypeErrorr	   create_cute_tensorsr5   r   EpilogueArgumentsr   detachmark_layout_dynamiccreate_scheduler_argscreate_varlen_argsnum_epi_tensormapscutlass_torchcurrent_streamget_compile_keyr4   gemmcompile_cacher   r7   compilecute_tensor)+r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   varlenrC   LMKNtensor_infosmajor_configsdevice_capacityGemmCls	acc_dtypetile_shape_mncluster_shape_mnkmax_active_clustersr<   epi_argsscheduler_argsvarlen_argsrV   compile_keycachegemm_objr:   r:   r;   rX      s  




  
rX   )FTr   NNr   r   NNNNF)typingr   	functoolsr   torchr   cutlass.cuter7   cutlass.torchrU   cutlassr   cutlass.cute.runtimer   r   quack.cute_dsl_utilsr   r	   quack.gemm_wrapper_utilsr
   quack.gemm_default_epir   r   intboolr5   rX   rY   r:   r:   r:   r;   <module>   s    	

 
3