o
    c۷i-                  -   @   sf  d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZ d d	lmZmZmZ d d
lmZmZmZmZmZmZmZmZm Z m!Z! edddd Z"												d*dedededee dee de#de#de#de#de$de$de#dee dee d e%eB d!e%eB d"ee d#ee d$ee d%ee d&e$d'df,d(d)Z&dS )+    )Optional)	lru_cache)TensorN)Float32)make_ptr)make_fake_tensor)get_device_capacityget_max_active_clusterstorch2cute_dtype_map)GemmDefaultEpiMixinGemmDefaultSm90GemmDefaultSm100)

get_majors
get_dtypesperm3dmake_scheduler_argsmake_varlen_argsmake_fake_scheduler_argsmake_fake_varlen_argsmake_fake_gemm_tensorscached_compilecompile_gemm_kernel)maxsizec           !         sL  d dkrt nt t|||||||||d\	}}}}dd }t|||fddd}|d	kr>t|||fddd}n|dkrLt||fddd}nd } j|||||||d
t||||rf|n|rj|nd }t|||d|||||||
||||||||||f } t|  	
fddS )Nr   	   )varlen_mvarlen_kgather_Ac                 S   s0   | dkrd S | dkrt dS tt dtjjddS )Nr            ?   )assumed_align)r   r   cuteAddressSpacegmem)mode r%   @/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/gemm.pyfake_scalarH   s
   z"_compile_gemm.<locals>.fake_scalarr   r   )leading_dimdivisibility   alphabetamRowVecBroadcastmColVecBroadcastadd_to_output)gemmc                      s$   t  
	S )N)r   r%   GemmClsa_dtypecluster_shape_mnkdevice_capacityepi_argsr   mAmBmCmD
persistentpingpongscheduler_argstile_shape_mnvarlen_argsr%   r&   <lambda>~   s"    z_compile_gemm.<locals>.<lambda>)r   r   r   fake_tensorEpilogueArgumentsr   r   r   )!r4   b_dtyped_dtypec_dtypea_majorb_majord_majorc_majorr?   r5   r=   r<   has_semaphorerowvec_dtypecolvec_dtypecolvec_ndim
alpha_mode	beta_moder0   r   r   r   has_batch_idx_permuter6   mnklr'   mRowVecmColVecaidx_lenkeyr%   r2   r&   _compile_gemm   sz   &rZ   FT   r   ABDCtile_count_semaphoretile_Mtile_N	cluster_M	cluster_Nr=   r<   max_swizzle_sizerowvec_biascolvec_biasr,   r-   cu_seqlens_mcu_seqlens_kA_idxbatch_idx_permuter0   returnc           0      C   s  |d u}|d u}|p|}|d u}|r|rJ d|r(|s J d|dks(J d|r0|
s0J d|r8|r8J d|rP|  ddksEJ d| ddksPJ d	|rh|  d
dks]J d| d
dkshJ dt| |||||d\}}}}t||||\}}}} t| |||\}!}"}#}$t| j}%|%d dv sJ dt|trdn|dkrdnd}&t|trdn|dkrdnd}'|d ur|jnd}(t	|!|"|#|$|||| ||f||df|	|
|d u|d urt
|j nd |d urt
|j nd |(|&|'|||||d u|%})ddlm}* |*rd S dd }+|
rt|| nd},tj|+||&|+||'||d d}-t|,|||}.t|||}/|%d dkrD|)|||||-|.|/d d 	 d S |)|||||-|.|/ d S )Nz)Only one of cu_seqlens_m and cu_seqlens_kzgather_A requires varlenr   zgather_A requires cluster_N=1zvarlen requires persistent=Truez)Add to output not supported with varlen_mz!varlen_m requires A to be k-majorz!varlen_m requires D to be n-majorz!varlen_k requires A to be m-majorz!varlen_k requires B to be n-major)r   r   r   )r   
      z)Only SM90, SM100, and SM110 are supportedr*   r   )COMPILE_ONLYc                 S   s$   |dkrd S |dkrt | S |  S )Nr   r   )r   data_ptr)scalarr$   r%   r%   r&   
scalar_arg   s
   zgemm.<locals>.scalar_argr+   r   )strider   r   r   r   device
isinstancer   ndimrZ   r
   dtypequack.cache_utilsrq   r	   r   rC   r   r   )0r\   r]   r^   r_   r`   ra   rb   rc   rd   r=   r<   re   rf   rg   r,   r-   rh   ri   rj   rk   r0   r   r   varlenr   A_pB_pD_pC_prG   rH   rI   rJ   r4   rD   rE   rF   r6   rO   rP   rN   compiled_fnrq   rt   max_active_clustersr7   r>   r@   r%   r%   r&   r1      s   
r1   )FTr[   NNr   r   NNNNF)'typingr   	functoolsr   torchr   cutlass.cuter!   cutlassr   cutlass.cute.runtimer   quack.compile_utilsr   rB   quack.cute_dsl_utilsr   r	   r
   quack.gemm_default_epir   r   r   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r   r   r   rZ   intboolfloatr1   r%   r%   r%   r&   <module>   s   0
~	
