o
    c۷i4                     @   s  d dl mZmZmZ d dlmZmZ d dlmZ d dl	Z	d dl
mZ d dl	mZmZmZmZ d dlmZ d dlm  mZ d dlm  mZ d dlmZ d dlmZmZmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 G dd de#Z7G dd de7e%Z8G dd de7e'Z9edddd Z:					d.dededee d ee d!ee d"e;d#e;d$e;d%e;d&e<d'e<d(e;d)e=eB d*e=eB d+dfd,d-Z>dS )/    )TupleOptionalCallable)	lru_cachepartial)TensorN)Int32Float32Boolean
const_expr)make_ptr)make_fake_tensor)get_device_capacityget_max_active_clusterstorch2cute_dtype_map)
act_fn_map)GemmActMixin)GemmSm90)	GemmSm100)div_for_dtypeperm3d
get_majors
get_dtypesmake_scheduler_argsmake_fake_scheduler_argscached_compilecompile_gemm_kernelTriangularTileScheduler)VarlenManagerc                3   @   s   e Zd Zd!defddZejdejde	ej
df dejjd	ejjd
ejjdejjdejdedej
deej
 deej dejdej
deej deej
 deej
 dee dee dejdedejjdedede	ejjejjf f0ddZd S )"GemmSymmetricMixinFvarlen_mc                 C   s   t S )Nr   )selfr!    r#   J/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/gemm_symmetric.pyget_scheduler_class#   s   z&GemmSymmetricMixin.get_scheduler_classparamsepi_smem_tensors.epi_pipelineepi_store_pipelineepi_read_stateepi_producer_stateepi_tileload_acc_subtiletRS_rDtRS_rCtiled_copy_t2rtiled_copy_r2stRS_sDtiled_copy_s2rtSR_rCtSR_sCcopy_Dcopy_Ctile_coord_mnklvarlen_managerepilogue_barriertidxis_tma_warpreturnc           6      C   s  t |
d u}t |d u}|j}|j}|\}}}| jdkr#ttj|dntj} | | j	| j
| j}!t|!|}"|"||}#|d }$| ||||$| j|j||\}%}&}&tt| jd d |jd }'tj|'|'d dfd}(t|'})|j|) }*| |||||||||	}+t |d urtjt|)| jddD ]},|(|,}-|r| | ||-|d |!| |"  qt#|)D ]},|(|,}.||	|, | $||+|.}/t |r|%| t&||d d d |j'f | tj(  tj)  tj*  |+| W d    n	1 sw   Y  |"  t |d uo|,| j |)k r=|(|,| j }-|r9| | ||-|d |!| |"  | ,||/|	|
}0|rL|   |-  |*|, | j. }1t |rit/0||	|d d d |1f  t&|"|"1|0|#d d d |1f  |d	 }2|d }3tj(  |-  |r|2| j2d	  }4|3| j2d  }5t |r||1|.d
 |4|5kr|%|1|.d
 |!  q| 3||+|||||| ||fS )Nd   )tiled_tmem_load         )stride)unroll)src_idxproducer_stater   )rE   dst_idx)4r   tma_atom_postactmPostAct_mnlarchr   sm100_utilsget_smem_store_opsm90_utils_ogsm90_get_smem_store_oppostact_layoutpostact_dtype	acc_dtypecutemake_tiled_copy_S	get_slicepartition_Depilog_gmem_copy_and_partitionoffset_batch_epicta_tile_shape_postact_mnepi_tile_postactzipped_dividemake_layoutcta_tile_shape_mnkshapesizenum_tiles_executed	epi_begincutlassrangeminepi_c_stageget_hier_coordproducer_acquireproducer_commitadvancerange_constexprepi_begin_loopconsumer_waitcopyindexfence_view_async_shared	sync_warp	elect_oneconsumer_releaseepi_visit_subtilearrive_and_wait	epi_stage
copy_utilscvt_copyretilecluster_shape_mnkepi_end)6r"   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   tile_schedulerr;   r<   has_Chas_DrH   rI   sRowVecsColVecsPostActrL   copy_atom_postact_r2stiled_copy_postact_r2stRS_sPostAct	batch_idxcopy_postact_epi_tile_shapeepi_tile_layoutepi_tile_numnum_prev_subtilesepi_tensorsepi_idxgmem_coord_C
gmem_coordepi_loop_tensorstRS_rPostAct
epi_bufferpid_mpid_nsquare_tile_msquare_tile_nr#   r#   r$   epilogue&   s   






















zGemmSymmetricMixin.epilogueN)F)__name__
__module____qualname__boolr%   rR   jitr   EpilogueParamsr   r   ra   pipelinePipelineAsyncPipelineStateTiler   r   	TiledCopyCoordr   NamedBarrierr   r
   r   r#   r#   r#   r$   r    "   sh    	
r    c                   @      e Zd ZdS )GemmSymmetricSm90Nr   r   r   r#   r#   r#   r$   r          r   c                   @   r   )GemmSymmetricSm100Nr   r#   r#   r#   r$   r      r   r   )maxsizec           $         s  d dkrt nt t t t }}}|dkrdnd}|dkr&dnd}|dkr.dnd}|dkr6dnd}tt|}}t||rJt|nd}}t|||f||dt||||f||dt||||f||dt||||f||dt|}|	dkrdnd}t||||f||d}dd } d }!t|! }" j||"| || |d	t|d
|d d|||||||||	
	|||f}#t	|# 	
fddS )Nr   	   krB   n)leading_dimdivisibilityc                 S   s0   | dkrd S | dkrt dS tt dtjjddS )Nr   rB         ?   )assumed_align)r	   r   rR   AddressSpacegmem)moder#   r#   r$   fake_scalar   s
   z,_compile_gemm_symmetric.<locals>.fake_scalaralphabetaFgemm_symmetricc                      s$   t  
	dS )NF)r   r#   GemmClsa_dtyperx   device_capacityepi_argsmAmBmCmD
persistentpingpongscheduler_argstile_shape_mnvarlen_argsr#   r$   <lambda>  s"    z)_compile_gemm_symmetric.<locals>.<lambda>)
r   r   rR   sym_intr   fake_tensorr   EpilogueArgumentsr   r   )$r   b_dtyped_dtypec_dtypec_majorrP   a_majorb_majord_majorpostact_majorr   rx   r   r   has_semaphore
alpha_mode	beta_moder   mr   l	a_leading	b_leading	d_leading	c_leadingdiv_adiv_bdiv_ddiv_cdiv_papostact_leadingmPostActr   
activationact_fnkeyr#   r   r$   _compile_gemm_symmetric   sh   $r   FT   r   ABDCtile_count_semaphoretile_Mtile_N	cluster_M	cluster_Nr   r   max_swizzle_sizer   r   r=   c           *      C   s  |j }t| |||\}}}}|jdkr|dddn|}t||||\}}}}t| |||\}}}}t|j }|ddkr@dnd}t	| j
}|d dv sQJ d||f}||df} t|tradn|d	krgdnd}!t|trpdn|d	krvdnd}"t|||||||||||| |	|
|d u|!|"|}#dd
lm}$ |$rd S |
rt|| nd}%dd }&tj|d |&||!|&||"d}'t|%||}(d })|d dkr|#|||||'|(|)d d 	 d S |#|||||'|(|) d S )Nr@   rB   rA   r   r   r   )r   
      z)Only SM90, SM100, and SM110 are supportedr   )COMPILE_ONLYc                 S   s$   |dkrd S |dkrt | S |  S )Nr   rB   )r	   data_ptr)scalarr   r#   r#   r$   
scalar_argf  s
   z"gemm_symmetric.<locals>.scalar_argr   r   )mTr   ndimpermuter   r   r   dtyperC   r   device
isinstancer   r   quack.cache_utilsr   r   r   r   r   )*r   r   r   r   r   r   r   r   r   r   r   r   r   r   PostActA_pB_pD_pC_p	PostAct_pr   r   r   r   r   r   r   r   rP   r   r   r   rx   r   r   compiled_fnr   max_active_clustersr   r   r   r   r#   r#   r$   r   '  sj   


r   )FTr   r   r   )?typingr   r   r   	functoolsr   r   torchr   ra   cutlass.cuterR   r   r	   r
   r   cutlass.cute.runtimer   cutlass.utils.hopper_helpersutilshopper_helpersrM   cutlass.utils.blackwell_helpersblackwell_helpersrK   quack.compile_utilsr   r   quack.cute_dsl_utilsr   r   r   quack.activationr   quack.gemm_actr   quack.gemm_sm90r   quack.gemm_sm100r   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r   quack.tile_schedulerr   quack.varlen_utilsr   quack.copy_utilsru   r    r   r   r   intr   floatr   r#   r#   r#   r$   <module>   sz    (
 
o	
