o
    c۷id                  '   @   sT  d dl mZmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlZd dlmZ d dlm  mZ d dlm  mZ d dlmZmZmZmZ d dlmZ d dlmZmZm Z m!Z!m"Z" d d	l#m$Z$ d d
l%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z> G dd de*Z?G dd de?e&Z@G dd de?e(ZAG dd de?ZBG dd deBe&ZCG dd deBe(ZDedddd ZE		 	!				d7d"ed#ed$ee d%ee d&ed'ee d(eeF d)eGd*eGd+eGd,eGd-eHd.eHd/eGd0ee d1ee d2ee d3ee d4df&d5d6ZIeIZJdS )8    )
NamedTupleTupleOptionalCallable)	lru_cachepartial)	dataclass)TensorN)Int32Float32Boolean
const_expr)make_fake_tensor)
ParamsBasemlir_namedtupleget_device_capacityget_max_active_clusterstorch2cute_dtype_map)VarlenManager)GemmSm90)	GemmSm100)GemmDefaultEpiMixin)
	get_majorperm3d_singlemake_scheduler_argsmake_varlen_argsmake_fake_scheduler_argsmake_fake_varlen_argsdiv_for_dtypemake_fake_gemm_tensorscached_compilecompile_gemm_kernel)permute_gated_Cregs_b16)
act_fn_mapgate_fn_mapc                3       s  e Zd ZeG dd deZeG dd deZddddedefd	d
Z	ddddede
ej fddZededeeeef dejdefddZdefddZdedeejdf f fddZejdedeejdf dejjdejjdejjdejjdejdedejdeej deej d ejd!ejd"eej d#eej d$eej d%ee d&ee d'ejd(ed)ejj d*e!d+e"deejjejjf f0d,d-Z#ej	d1ded.eejdf dejdeej deej f
d/d0Z$  Z%S )2GemmActMixinc                   @   s~   e Zd ZU ejed< dZeje	e
  ed< dZe	eejB  ed< dZe	eejB  ed< dZe	ej ed< dZe	ej ed< dS )zGemmActMixin.EpilogueArgumentsmPostActNact_fnalphabetamRowVecBroadcastmColVecBroadcast)__name__
__module____qualname__cuter	   __annotations__r'   cutlass	Constexprr   r   r(   r   r)   r*   r+    r3   r3   D/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/gemm_act.pyEpilogueArguments-   s   
 
r5   c                   @   s   e Zd ZU ejed< ejed< ejed< ejed< dZ	e
jee  ed< dZeeejB  ed< dZeeejB  ed< dZeej ed	< dZeej ed
< dS )zGemmActMixin.EpilogueParamstma_atom_postactmPostAct_mnlepi_postact_smem_layout_stagedepi_tile_postactNr'   r(   r)   r*   r+   )r,   r-   r.   r/   CopyAtomr0   r	   ComposedLayoutTiler'   r1   r2   r   r   r(   r   r)   r*   r+   r3   r3   r3   r4   EpilogueParams6   s   
 



r=   Nlocipargsreturnc                   s   |j j| _tjj|j | _| jd d | _	| j
}| jdkr tnt}|| j| j|| j}| jt|j dkr@tj|j dddn|j ||dd\}}dd	   fd
d|j|jfD \}	}
| j|||||j|j|j|	|
d	S )N   d   r   T
ragged_dim	ptr_shiftstoreop_typec                       t  fdd jD S )Nc                 3   4    | ]}t |st j|d  jj dn|V  qdS     )divbyNr/   	is_staticassumeelement_typewidth.0str3   r4   	<genexpr>W   
    "
zMGemmActMixin.epi_to_underlying_arguments.<locals>.<lambda>.<locals>.<genexpr>tuplestriderX   r3   rX   r4   <lambda>W       z:GemmActMixin.epi_to_underlying_arguments.<locals>.<lambda>c              	      6   g | ]}|d urt |jt j|j |dnd qS Nr^   r/   make_tensoriteratormake_layoutshaperV   rY   
new_strider3   r4   
<listcomp>[       z<GemmActMixin.epi_to_underlying_arguments.<locals>.<listcomp>r(   r)   r*   r+   )r&   rS   postact_dtyper1   utils
LayoutEnumfrom_tensorpostact_layoutcta_tile_shape_mnkcta_tile_shape_postact_mnepi_tilearchsm100_utils
sm90_utilsmake_smem_layout_epi	epi_stage_make_tma_epi_atoms_and_tensorsr/   rank
copy_utilscreate_ragged_tensor_for_tmar*   r+   r=   r'   r(   r)   )selfrA   r?   r@   r9   	utils_clsr8   r6   tma_tensor_postactr*   r+   r3   rj   r4   epi_to_underlying_argumentsB   s>   

	


z(GemmActMixin.epi_to_underlying_argumentsparamsc                C   s   |j gS N)r6   )r   r   r?   r@   r3   r3   r4   epi_get_tma_atomsm   s   zGemmActMixin.epi_get_tma_atomsrt   rv   c                 C   s8   | j j}tt||jd  }t| ||}|| S )N   r&   rS   r/   sizerh   rT   r   epi_smem_bytes_per_stagerA   rt   rv   ro   postact_bytes_per_stagerowvec_colvec_bytesr3   r3   r4   r   r   s   z%GemmActMixin.epi_smem_bytes_per_stagec                    s   j d u rdnjd jd u rdnjd j d ur!j jntjd ur,jjnt tjG  fddd}|S )Nr      c                       sz   e Zd ZU ejjejjf df ed< ejjejj f df ed< ejjejjje	j
f jf ed< dS )z:GemmActMixin.epi_get_smem_struct.<locals>.EpiSharedStorage   sRowVecsColVecsPostActN)r,   r-   r.   r/   structAlignMemRanger0   ro   cosizer8   buffer_align_bytesr3   col_vec_dtypecol_vec_smem_sizer   row_vec_dtyperow_vec_smem_sizer   r3   r4   EpiSharedStorage   s   
   r   )r*   rt   r+   rS   r   r/   r   )r   r   r   r3   r   r4   epi_get_smem_struct}   s   
z GemmActMixin.epi_get_smem_struct.c                    s6   t  ||\}}|jjj|jj|jjd}|||fS )N)swizzle)superepi_get_smem_tensorsepir   
get_tensorr8   outerinner)r   r   storager   r   r   	__class__r3   r4   r      s   
z!GemmActMixin.epi_get_smem_tensorsepi_smem_tensorsepi_pipelineepi_store_pipelineepi_read_stateepi_producer_stateload_acc_subtiletRS_rDtRS_rCtiled_copy_t2rtiled_copy_r2stRS_sDtiled_copy_s2rtSR_rCtSR_sCcopy_Dcopy_Ctile_coord_mnklvarlen_managerepilogue_barriertidxis_tma_warpc           2      C   sV  t |
d u}t |d u}|j}|j}|\}}}| jdkr#ttj|dntj} | | j	| j
| j}!t|!|}"|"||}#|d }$| ||||$| j|j||\}%}&}&tt| jd d |jd }'tj|'|'d dfd}(t|'})|j|) }*| |||||||||	}+t |d urtjt|)| jddD ]},|(|,}-|r| | ||-|d |!| |"  qt#|)D ]},|(|,}.||	|, | $||+|.}/t |r|%| t&||d d d |j'f | tj(  tj)  tj*  |+| W d    n	1 sw   Y  |"  t |d uo|,| j |)k r=|(|,| j }-|r9| | ||-|d |!| |"  | ,||/|	|
}0|rL|   |-  |*|, | j. }1t |rit/0||	|d d d |1f  t&|"|"1|0|#d d d |1f  tj(  |-  |rt |r||1|.d	 |%|1|.d	 |!  q| 2||+|||||| ||fS )
NrD   )tiled_tmem_load   rC   r   rc   )unroll)src_idxproducer_state)r   dst_idx)3r   r6   r7   rw   r   rx   get_smem_store_opsm90_utils_ogsm90_get_smem_store_oprs   ro   	acc_dtyper/   make_tiled_copy_S	get_slicepartition_Depilog_gmem_copy_and_partitionoffset_batch_epiru   r9   zipped_dividerg   rt   rh   r   num_tiles_executed	epi_beginr1   rangeminepi_c_stageget_hier_coordproducer_acquireproducer_commitadvancerange_constexprepi_begin_loopconsumer_waitcopyindexfence_view_async_shared	sync_warp	elect_oneconsumer_releaseepi_visit_subtilearrive_and_waitr{   r~   cvt_copyretileepi_end)2r   r   r   r   r   r   r   rv   r   r   r   r   r   r   r   r   r   r   r   r   r   r   tile_schedulerr   r   has_Chas_Dr6   r7   r   r   r   r   copy_atom_postact_r2stiled_copy_postact_r2stRS_sPostAct	batch_idxcopy_postact_epi_tile_shapeepi_tile_layoutepi_tile_numnum_prev_subtilesepi_tensorsepi_idxgmem_coord_C
gmem_coordepi_loop_tensorstRS_rPostAct
epi_bufferr3   r3   r4   epilogue   s   





















zGemmActMixin.epiloguer   c                 C   s   t | |||| t|jd urdt|jj| j}t| j	dk r7t
jt|ddD ]}||| ||< q*n/t
jt|d ddD ]}||d|  |d| d  f\|d| < |d| d < qCn|}t|| j}|| | j |S )NrD   Tunroll_fullrC   r   )r   r   r   r'   r/   make_rmem_tensorlayoutrh   r   rw   r1   r   r   make_fragment_likero   rH   loadto)r   r   r   r   r   r   itRS_rPostAct_outr3   r3   r4   r   +  s    zGemmActMixin.epi_visit_subtiler   )&r,   r-   r.   r   r   r5   r   r   r=   r   listr/   r:   r   staticmethodr   intr<   r   r   r	   r   jitr1   pipelinePipelineAsyncPipelineStater   r   	TiledCopyCoordr   NamedBarrierr
   r   r   r   __classcell__r3   r3   r   r4   r%   ,   s    
,

 	
 r%   c                   @      e Zd ZdS )GemmActSm90Nr,   r-   r.   r3   r3   r3   r4   r  H      r  c                   @   r  )GemmActSm100Nr  r3   r3   r3   r4   r  L  r  r  c                   @   s   e Zd ZddddejdejfddZedejdee	e	e	f de
jde	fd	d
Ze
j	ddejdee
jdf de
jdee
j dee
j f
ddZdS )GemmGatedMixinNr>   rA   rB   c                   s  |j j| _tjj|j | _| jjdksJ d| j	d u s$| j	
 s$J | j
 s+J | jdkr=| jd d dks=J d| jd | jd d f| _t| jd tjr^tdd| jd }n| jd d }| jd |f}| jd	krstnt}|| j| j|| j}| jt|j dkrtj|j dd
dn|j ||dd\}}	dd   fdd|j|jfD \}
}| j||	|||j|j|j |
|d	S )Nr   z-GemmGated only supports 16bit postact for nowZ   r   rN   r   z2GemmGatedSm90 requires tileN to be divisible by 32rC   rD   TrE   rH   rI   c                    rK   )Nc                 3   rL   rM   rP   rU   rX   r3   r4   rZ   t  r[   zOGemmGatedMixin.epi_to_underlying_arguments.<locals>.<lambda>.<locals>.<genexpr>r\   rX   r3   rX   r4   r_   t  r`   z<GemmGatedMixin.epi_to_underlying_arguments.<locals>.<lambda>c              	      ra   rb   rd   ri   rj   r3   r4   rl   x  rm   z>GemmGatedMixin.epi_to_underlying_arguments.<locals>.<listcomp>rn   )!r&   rS   ro   r1   rp   rq   rr   rs   rT   d_layoutis_n_major_crw   rt   ru   
isinstancerv   r/   Layoutrecast_layoutrx   ry   rz   r{   r|   r}   r~   r   r*   r+   r=   r'   r(   r)   )r   rA   r?   r@   epi_tile_postact_1r9   r   r8   r6   r   r*   r+   r3   rj   r4   r   Q  sV   


	


z*GemmGatedMixin.epi_to_underlying_argumentsrt   rv   c                 C   s<   | j j}tt|d |jd  }t| ||}|| S )NrC   r   r   r   r3   r3   r4   r     s   z'GemmGatedMixin.epi_smem_bytes_per_stager   r   .r   r   c           	      C   s4  t | |||| tdd|j}t|j| j}t| j	dk r@t
jt|ddD ]}||d|  |d| d  ||< q*n;t
jt|d ddD ].}||d|  |d| d  f|d| d  |d| d  f\|d| < |d| d < qLt|| j}|| | j t| j	dkrt| |S )	NrC   r   rD   Tr      r   r  )r   r   r/   r  r   r   rh   r   r   rw   r1   r   r   r'   make_rmem_tensor_likero   rH   r   r   r"   )	r   r   r   r   r   tRS_rPostAct_layoutr   r  r  r3   r3   r4   r     s    &8z GemmGatedMixin.epi_visit_subtiler   )r,   r-   r.   r%   r5   r=   r   r  r   r  r/   r<   r   r  r	   r   r   r3   r3   r3   r4   r  P  s<    
9r  c                   @   r  )GemmGatedSm90Nr  r3   r3   r3   r4   r    r  r  c                   @   r  )GemmGatedSm100Nr  r3   r3   r3   r4   r    r  r  )maxsizec           %         s  d dkrt td| nttd|  |	dkrdnd}t||||||||d
\	}}}}t|}|dkr>t n|}|dkrFdn|}|rN||fn|||f}t||||d} t|||fdd	d}!|d
krtt|||fdd	d}"n|dkrt||fdd	d}"nd }"|dkrt	| nt
| }# j| |#|!|"dt|d|t|d|r|nd d||||||||||	
||||||f}$t|$ 	
fddS )Nr   	   )actgatednr   )varlen_mgather_Ar#  )leading_dimdivisibilityr  rC   r"  r*   r+   Fgemm_actc                      s$   t  
	S r   )r!   r3   GemmClsa_dtypecluster_shape_mnkdevice_capacityepi_argsr&  mAmBmCmD
persistentpingpongscheduler_argstile_shape_mnvarlen_argsr3   r4   r_     s"    z#_compile_gemm_act.<locals>.<lambda>)r  r  r  r  r   r   r/   sym_intfake_tensorr#   r$   r5   r   r   r    )%r-  b_dtyped_dtypec_dtypero   a_majorb_majord_majorc_majorpostact_majorr8  r.  r6  r5  has_semaphore
activationrowvec_dtypecolvec_dtypecolvec_ndimr%  r&  r/  gemm_cls_name
pa_leadingmr$  kldiv_papa_npa_leading_dimpa_shaper&   mRowVecmColVecr'   keyr3   r+  r4   _compile_gemm_act  s   &rU  FTr   ABDCPostActtile_count_semaphorerE  tile_Mtile_N	cluster_M	cluster_Nr6  r5  max_swizzle_sizerowvec_biascolvec_biascu_seqlens_mA_idxrB   c           ,      C   s  |t v rd}n|tv sJ d| d}|d u}|d u}|rI|s$J d| ddks/J d|d ur>|ddks>J d|ddksIJ d	|r[|d usSJ d
|
dks[J dt| |}t|}t||}t||}t||}t|dd}t|dd}|d urt|ddnd }|d urt|ddnd }t|dd}t| j }t|j } |d urt|j nd }!|d urt|j nd }"t|j }#|d ur|jnd}$t| j	}%|%d dv sJ dt
|| |!|"|#|||||||f|	|
df|||d u||d urt|j nd |d urt|j nd |$|||%|}&ddlm}' |'rd S |r"t|	|
 nd}(tj|d ||d})t|(||}*t|d |}+|%d dkrN|&|||||)|*|+d d 	 d S |&|||||)|*|+ d S )Nr#  zUnsupported activation r"  z!varlen_m requires persistent=Truer   z!varlen_m requires A to be k-majorz!varlen_m requires D to be n-majorz'varlen_m requires PostAct to be n-majorzgather_A requires varlenzgather_A requires cluster_N=1rK  rL  r$  r   )r!  
      z)Only SM90, SM100, and SM110 are supported)COMPILE_ONLYr)  r!  )r$   r#   r^   r   r   r   dtypendimr   devicerU  quack.cache_utilsrh  r   r%   r5   r   r   ),rV  rW  rX  rY  rZ  r[  rE  r\  r]  r^  r_  r6  r5  r`  ra  rb  rc  rd  rI  r%  r&  A_pB_pD_pC_p	PostAct_pr?  r@  rA  rB  rC  r-  r<  r=  r>  ro   rH  r/  compiled_fnrh  max_active_clustersr0  r7  r9  r3   r3   r4   r*  1  s   







r*  )FTr   NNNN)Ktypingr   r   r   r   	functoolsr   r   dataclassesr   torchr	   r1   cutlass.cuter/   cutlass.utils.hopper_helpersrp   hopper_helpersr   cutlass.utils.blackwell_helpersblackwell_helpersrx   r
   r   r   r   quack.compile_utilsr   r;  quack.cute_dsl_utilsr   r   r   r   r   quack.varlen_utilsr   quack.gemm_sm90r   quack.gemm_sm100r   quack.gemm_default_epir   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r   r    r!   quack.layout_utilsr"   quack.sm90_utilsry   quack.copy_utilsr~   quack.activationr#   r$   r%   r  r  r  r  r  rU  strr  boolr*  
gemm_gatedr3   r3   r3   r4   <module>   s   0  f
~	

o