o
    c۷ii                  '   @   sb  d dl mZmZmZmZmZ d dlmZmZ d dl	m
Z
 d dlZd dlZd dlmZ d dlZd dlmZ d dlmZmZmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z>m?Z? G dd de"Z@G dd de@eZAG dd de@eZBG dd de"ZCG dd deCeZDG dd deCeZEedddd  ZF	!	!	"				d8d#ed$ed%ed&ed'ed(ee d)eeG d*eHd+eHd,eHd-eHd.eId/eId0eHd1ee d2ee d3ee d4ee d5df&d6d7ZJeJZKdS )9    )
NamedTupleOptionalTupleCallableType)	lru_cachepartial)	dataclassN)Tensor)Int32Float32
const_expr)partition_for_epilogue)GemmSm90)	GemmSm100)GemmDefaultEpiMixin)GemmActMixin)make_fake_tensor)
ParamsBasemlir_namedtupletorch2cute_dtype_mapget_device_capacityget_max_active_clusters)
	get_majorperm3d_singlemake_scheduler_argsmake_varlen_argsmake_fake_scheduler_argsmake_fake_varlen_argsdiv_for_dtypemake_fake_gemm_tensorscached_compilecompile_gemm_kernel)VarlenManager)
copy_utils)dact_fn_mapdgate_fn_mapc                   @   sX   e Zd ZejZejZej	d
dedeej	df dej	de
ej	 de
ej	 f
dd	ZdS )GemmDActMixinNparamsepi_loop_tensors.tRS_rDtRS_rCreturnc           	      C   sj  |d usJ t j| |||d d t|| j}|| | j t|j	d urt
|jj| j}t| jdk rWtjt|ddD ]}|	|| || \||< ||< qCnJtjt|d ddD ]:}|	|d|  |d| d  f|d|  |d| d  f\\|d| < |d| d < \|d| < |d| d < qcn|}t|| j}|| | j |S )N)r+   d   Tunroll_full      )r   epi_visit_subtilecutemake_fragment_like	acc_dtypestoreloadtor   act_fnmake_rmem_tensorlayoutshapearchcutlassrangesizepostact_dtype)	selfr(   r)   r*   r+   
tRS_rC_acctRS_rPostActitRS_rPostAct_out rG   E/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/gemm_dact.pyr2   5   s.   "	zGemmDActMixin.epi_visit_subtileN)__name__
__module____qualname__r   EpilogueArgumentsEpilogueParamsr3   jitr   r
   r   r2   rG   rG   rG   rH   r'   /   s     r'   c                   @      e Zd ZdS )GemmDActSm90NrJ   rK   rL   rG   rG   rG   rH   rQ   Y       rQ   c                   @   rP   )GemmDActSm100NrR   rG   rG   rG   rH   rT   ]   rS   rT   c                       sP  e Zd ZeG dd deZeG dd deZddddedefd	d
Z	e
jdedee
jdf de
jdee
j de
jde
jdedejjdedee
jdf fddZdede
jf fddZe
j	d"dedee
jdf de
jdee
j dee
j f
ddZe
jdedee
jdf de
jdee
j de
jde
jdededdfd d!Z  ZS )#GemmDGatedMixinc                   @   s   e Zd ZU ejed< dZeje	 ed< dZ
eeejB  ed< dZeeejB  ed< dZeej ed< dZeej ed< dZeej ed< dS )	z!GemmDGatedMixin.EpilogueArgumentsmPostActN
act_bwd_fnalphabetamRowVecBroadcastmColVecBroadcastmColVecReduce)rJ   rK   rL   r3   r
   __annotations__rW   r>   	Constexprr   rX   r   r   rY   rZ   r[   r\   rG   rG   rG   rH   rM   d   s   
 
rM   c                   @   s   e Zd ZU ejed< ejed< ejed< ejed< e	j
e ed< ee	j ed< dZeeejB  ed< dZeeejB  ed	< dZeej ed
< dZeej ed< dZeej ed< dS )zGemmDGatedMixin.EpilogueParamstma_atom_postactmPostAct_mnlepi_postact_smem_layout_stagedepi_tile_postactrW   implicit_dtypeNrX   rY   rZ   r[   r\   )rJ   rK   rL   r3   CopyAtomr]   r
   ComposedLayoutTiler>   r^   r   r   NumericrX   r   r   rY   rZ   r[   r\   rG   rG   rG   rH   rN   n   s   
 



rN   N)locipargsr,   c                   s$  |j j| _tjj|j | _| jj	dksJ d| j
j	dks"J d| jj	dks,J d| jd d | _| j}| jdkr>tnt}|| j| j|| j}| jt|j dkr^tj|j dd	d
n|j ||dd\}}dd   fdd|j|j|jfD \}	}
}| j|||||j| j|j|j|	|
|dS )N   z&GemmDGated only supports 16bit for now    zD storage type must be 32 bitzC storage type must be 32 bitr0   r-   r   T)
ragged_dim	ptr_shiftr6   )op_typec                    s   t  fdd jD S )Nc                 3   s4    | ]}t |st j|d  jj dn|V  qdS )rl   )divbyN)r3   	is_staticassumeelement_typewidth).0strG   rH   	<genexpr>   s
    "
zPGemmDGatedMixin.epi_to_underlying_arguments.<locals>.<lambda>.<locals>.<genexpr>)tuplestriderw   rG   rw   rH   <lambda>   s    z=GemmDGatedMixin.epi_to_underlying_arguments.<locals>.<lambda>c              	      s6   g | ]}|d urt |jt j|j |dnd qS )Nr{   )r3   make_tensoriteratormake_layoutr<   )ru   rx   
new_striderG   rH   
<listcomp>   s    z?GemmDGatedMixin.epi_to_underlying_arguments.<locals>.<listcomp>)rX   rY   rZ   r[   r\   ) rV   rs   rA   r>   utils
LayoutEnumfrom_tensorpostact_layoutrc   rt   d_dtypec_dtypecta_tile_shape_mnkcta_tile_shape_postact_mnepi_tiler=   sm100_utils
sm90_utilsmake_smem_layout_epi	epi_stage_make_tma_epi_atoms_and_tensorsr3   rankr$   create_ragged_tensor_for_tmarZ   r[   r\   rN   rW   rX   rY   )rB   rj   rh   ri   rb   	utils_clsra   r_   tma_tensor_postactrZ   r[   r\   rG   r   rH   epi_to_underlying_arguments|   sH   

	
z+GemmDGatedMixin.epi_to_underlying_argumentsr(   epi_smem_tensors.r   tiled_copy_t2rtiled_copy_r2stile_coord_mnklvarlen_managerepilogue_barriertidxc
                 C   s   t | |||||||||	
}
tt||d ur|n||	|d u d}d }t|jd urKtj| jd d dd}|t	|t
j}t	|t
}t|d g |
|R S )Nr   
tiled_copyr   reference_srcr0   )r1   r   r}   g        )r   	epi_beginr   r   r   r\   r3   r   r   r:   r   r;   filter_zerosfill)rB   r(   r   r   r   r   r   r   r   r   epi_tensorspartition_for_epilogue_fntDrColVecReducecolvec_mma_layouttDrColVec_layoutrG   rG   rH   r      s<   
zGemmDGatedMixin.epi_begin	epi_coordc                    sd   |d d |d }}t  |||}d }t|d ur+t|dt|d d d |f }g ||R S )N   )superepi_begin_loopr   r3   group_modesr   )rB   r(   r   r   r   r)   tDrColVecReduce_cur	__class__rG   rH   r      s   
zGemmDGatedMixin.epi_begin_loopr)   r*   r+   c              
   C   s  |\}}}}}	|d u r|d u r|d u sJ |d usJ |j }
|
jdks'J dt||
}t|jt}|| 	t t
|t}t
|t}t
|}t|d urt| jdk rl|| | 	|j  npt||j}t||j}t||j}tjtj|dgdddD ]E}tjtj|dgdd	 ddD ]3}tj||d	| f ||d	| d f f||df ||df f\||d	| f< ||d	| d f< qqn||  t| jdk rtt|D ]$}||d	|  |d	| d  || \|d	| < |d	| d < ||< qndtt|d	 D ]Y}||d
|  |d
| d	  f|d
| d  |d
| d  f|d	|  |d	| d  f\\|d
| < |d
| d	 < \|d
| d < |d
| d < \|d	| < |d	| d < qt|	d ur/t| jdk rtjt|	ddD ]}|	|  || ||  7  < qnt|	|	j}t||	j}t||	j}tjtj|dgdddD ]j}tj||df ||df f||df ||df f}tjdtj|dgdd	 ddD ]+}tj||d	| f ||d	| d f f||d	| f ||d	| d f f|}q||df  |d |d  7  < qt|d urt| jdk rN|| | 	|j  nct||j}t||j}tjtj|dgdddD ]G}tjtj|dgdd	 ddD ]4}tj||d	| f ||d	| d f f||df ||df f\||d	| f< ||d	| d f< qzqit|j|
}|| 	|
 |t|t  t|| j}|| 	| j |S )Nrk   z+GemmDGatedMixin only supports 16bit for nowr-   r   modeTr.   r1   r0      r   )rc   rt   r3   recast_tensorr:   r;   r   r6   r7   r8   make_rmem_tensor_liker   r=   rs   layout_utilsconvert_layout_zero_strider>   r?   r@   mul_packed_f32x2rW   fma_packed_f32x2r4   rA   )rB   r(   r)   r*   r+   rX   rY   	tDrRowVec	tDrColVecr   rc   tRS_rXY_f16x2tRS_rXY_f32x2tRS_rdXY_f32x2tRS_rOuttRS_rD_scaledtDrColVec_mn	tRS_rD_mntRS_rD_scaled_mnmnrE   tDrColVecReduce_mntRS_rOut_mnrow_sumtRS_rdXY_f16x2tRS_rOut_cvtrG   rG   rH   r2      s   
 
"
	 ,
""$ 
"&
z!GemmDGatedMixin.epi_visit_subtiler   c	                 C   s  t t||d ur	|n|||d u d}	|d }
| jd d \}}t|jd urt|
}t| jdk rMtj	t
|ddD ]}tjj|| tjdd||< q;n	| j sVJ d	|d
 }|jsc|jjd n|jjd }|d |k rt|j r|j|d |d f }nt|jj| f|jd |d f }t||f|d f}t|||d |  |}|	t||f}t|
|
jd }t||
jd }|d d dkrt	tj
|dgdD ]}|| d }||k r|| ||< qd S d S d S d S )Nr   r   r0   r-   Tr.   r   )threads_in_groupz/GemmDGated only supports n-major output for nowr   r1   r   )Nr   r   )r   r   r   r   r\   r3   r   r=   r>   r?   r@   warp_reductionoperatoraddd_layoutis_n_major_cvarlen_mr<   domain_offsetr(   cu_seqlens_m
local_tileminlen_mmake_identity_tensorr   r   r;   )rB   r(   r   r   r   r   r   r   r   r   r   tile_Mtile_N
tDrCVR_fltrE   	batch_idxlimit_nmColVecgColVeclimit_mtDcCVtDrColVecReduce_mtDcCV_mr   row_idxrG   rG   rH   epi_endR  sj   

zGemmDGatedMixin.epi_endrI   )rJ   rK   rL   r   r   rM   r	   r   rN   r   r3   rO   r   r
   rf   r   	TiledCopyCoordr#   r>   pipelineNamedBarrierr   r   r   r2   r   __classcell__rG   rG   r   rH   rU   a   s    	
2	
)
o	
rU   c                   @   rP   )GemmDGatedSm90NrR   rG   rG   rG   rH   r     rS   r   c                   @   rP   )GemmDGatedSm100NrR   rG   rG   rG   rH   r     rS   r   )maxsizec           (         s  |dk}|rd dkrt nt n
d dkrtnt t|||||||	|d
\
	}}}}t|}|
dkr;dnd}|rC||fn|||f} t|| ||d}!|rt| }"d }#|dkrgt|||fdd	d}#n|dkrtt||fdd	d}#d }$t	 }%|d
krt||||%fddd}$n|dkrt|||%fddd}$ j
|!|"|#|$dfdd}&|&nt| }" 
|!|"d t|d|t|d|r|nd d|||||||||	|
|||||||f}'t|' 	
fddS )Ndgatedr   	   )r   gather_Ar   r1   )leading_dimdivisibilityr0   r   r   r[   r\   c                    s
    | _ d S rI   rc   )gemm_objr   rG   rH   _set_implicit_dtype  s   
z/_compile_gemm_dact.<locals>._set_implicit_dtypeF	gemm_dactc                      s(   t  
	dS )N)	post_init)r"   rG   )GemmClsa_dtypecluster_shape_mnkdevice_capacityepi_argsr   mAmBmCmD
persistentpingpongr   scheduler_argstile_shape_mnvarlen_argsrG   rH   r|     s$    z$_compile_gemm_dact.<locals>.<lambda>)r   r   rT   rQ   r    r   fake_tensorr&   r3   sym_intrM   r%   r   r   r!   )(r   b_dtyper   r   rA   rc   a_majorb_majord_majorc_majorpostact_majorr  r   r  r  has_semaphore
activationcolvec_scale_dtypecolvec_scale_ndimcolvec_reduce_dtypecolvec_reduce_ndimr   r   r   gemm_cls_name	is_dgatedr   r   kldiv_pa
pa_leadingpa_shaperV   r9   r   r\   n_tilesr   keyrG   )r   r   r   r   r   r   rc   r   r   r   r  r  r  r   r  r  r  rH   _compile_gemm_dact  s   (r  T   ABOutPreActPostActtile_count_semaphorer  r   r   	cluster_M	cluster_Nr  r  max_swizzle_sizecolvec_scalecolvec_reducer   A_idxr,   c           .      C   sl  |t v }|s!|tv sJ d| |d u sJ d|d u s!J d|r%dnd}|d u}|d u}|rc|s7J d| ddksBJ d	|ddksMJ d
|ddksXJ d|ddkscJ d|ru|d usmJ d|
dksuJ dd }|r|ddk}t|j }| dksJ d| dksJ d|s|s|tj}|tj}n|j	tjj	}|j	tjj	}t
| |}t
|}t
||}t
||}t
||}t|dd}t|dd}t|dd}t|dd} t|dd}!t| j }"t|j }#t|j }$t|j }%t|j }&t| j}'|'d dv sJ dt|"|#|$|%|&||||| |!||f|	|
df|||d u||d ur<t|j nd |d urE|jnd|d urPt|j nd |d urY|jnd|||'|}(ddlm}) |)rkd S |rtt|	|
 nd}*|rtj|d ||d}+nt|d }+t|*||},t|d |}-|'d dkr|(|||||+|,|-d d 	 d S |(|||||+|,|- d S )NzUnsupported activation z4colvec_scale is only supported for gated activationsz5colvec_reduce is only supported for gated activationsr   dactz!varlen_m requires persistent=Truer   r1   z!varlen_m requires A to be k-majorz#varlen_m requires Out to be n-majorz&varlen_m requires PreAct to be n-majorz'varlen_m requires PostAct to be n-majorzgather_A requires varlenzgather_A requires cluster_N=1r0   zOut dtype must be fp16 or bf16z!Preact dtype must be fp16 or bf16r   r  r   r   )r   
      z)Only SM90, SM100, and SM110 are supported)COMPILE_ONLYr   r   )r&   r%   r{   r   dtypeelement_sizeviewtorchfloat32mTr   r   r   devicer  ndimquack.cache_utilsr/  r   rU   rM   r'   r   r   ).r   r!  r"  r#  r$  r%  r  r   r   r&  r'  r  r  r(  r)  r*  r   r+  r  r  r   r   rc   
AB_swappedA_pB_pOut_pPreAct_p	PostAct_pr
  r  r  r  r  r   r	  r   r   rA   r   compiled_fnr/  max_active_clustersr   r  r  rG   rG   rH   r   (  s   










r   )TTr  NNNN)Ltypingr   r   r   r   r   	functoolsr   r   dataclassesr	   r   r3  r
   r>   cutlass.cuter3   r   r   r   cutlass.utils.blackwell_helpersr   blackwell_helpersr   quack.sm90_utilsr   r   quack.gemm_sm90r   quack.gemm_sm100r   quack.gemm_default_epir   quack.gemm_actr   quack.compile_utilsr   r  quack.cute_dsl_utilsr   r   r   r   r   quack.gemm_tvm_ffi_utilsr   r   r   r   r   r   r   r    r!   r"   quack.varlen_utilsr#   quackr$   quack.layout_utilsr   quack.activationr%   r&   r'   rQ   rT   rU   r   r   r  strintboolr   gemm_dgatedrG   rG   rG   rH   <module>   s   0*  2
 	

 