o
    پiWQ                  '   @   s  d dl mZmZmZ d dlmZ d dlmZ d dlm	Z	 d dl
Z
d dlmZ d dlm  mZ d dlm  mZ d dl
mZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZm Z  d d
l!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dlm)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1Z2G dd de(Z3G dd de3e$Z4G dd de3e&Z5de2j6j7e2j6j8e2j6j9dZ:							d/de	de	dee	 dee	 de	dee	 d ee; d!e<d"e<d#e<d$e<d%e=d&e=d'e<d(ee	 d)ee	 d*ee	 d+ee	 d,df&d-d.Z>i e>_?dS )0    )TupleOptionalCallable)partial)	dataclass)TensorN)Int32Float32Boolean
const_expr)if_generate)from_dlpack)ArgumentsBase
ParamsBase)VarlenManager)GemmSm90)	GemmSm100)GemmDefaultEpiMixin)get_device_capacityget_max_active_clusters)GemmWrapperBasec                5       s,  e Zd ZU dZeed< eG dd deZeG dd de	Z
dddd	ed
e
fddZdddde
d
eej fddZdddde
deej ded
eee ee f fddZed	edeeeef dejd
efddZde
fddZde
d
eejdf f fddZejde
deejdf deeej  dejjd ejjd!ejj d"ejj dejd#e!d$ejd%eej d&eej" d'ej"d(ejd)eej" d*eej d+eej d,ee! d-ee! d.ej#d/e$d0ejj%d1ed2e&d
eejj ejj f f2d3d4Z'ej	d8de
d5eejdf d$ejd%eej d
eej f
d6d7Z(  Z)S )9GemmActMixin   num_epi_tensormapsc                   @   s~   e Zd ZU ejed< dZeje	e
  ed< dZe	eejB  ed< dZe	eejB  ed< dZe	ej ed< dZe	ej ed< dS )zGemmActMixin.EpilogueArgumentsmPostActNact_fnalphabetamRowVecBroadcastmColVecBroadcast)__name__
__module____qualname__cuter   __annotations__r   cutlass	Constexprr   r   r   r	   r   r   r    r'   r'   B/home/ubuntu/.local/lib/python3.10/site-packages/quack/gemm_act.pyEpilogueArguments    s   
 
r)   c                   @   s   e Zd ZU ejed< ejed< ejed< ejed< dZ	e
jee  ed< dZeeejB  ed< dZeeejB  ed< dZeej ed	< dZeej ed
< dS )zGemmActMixin.EpilogueParamstma_atom_postactmPostAct_mnlepi_postact_smem_layout_stagedepi_tile_postactNr   r   r   r   r   )r    r!   r"   r#   CopyAtomr$   r   ComposedLayoutTiler   r%   r&   r   r   r   r	   r   r   r   r'   r'   r'   r(   EpilogueParams)   s   
 



r1   N)locipargsreturnc                   s   |j j| _tjj|j | _| jd d | _	| j
}| jdkr tnt}|| j| j|| j}| j|j ||dd\}}dd   fdd|j|jfD \}	}
| j|||||j|j|j|	|
d		S )
N   d   store)op_typec                    s   t  fdd jD S )Nc                 3   s4    | ]}t |st j|d  jj dn|V  qdS )    )divbyN)r#   	is_staticassumeelement_typewidth).0str'   r(   	<genexpr>H   s
    "
zMGemmActMixin.epi_to_underlying_arguments.<locals>.<lambda>.<locals>.<genexpr>)tuplestriderB   r'   rB   r(   <lambda>H   s    z:GemmActMixin.epi_to_underlying_arguments.<locals>.<lambda>c              	      s6   g | ]}|d urt |jt j|j |dnd qS )NrF   )r#   make_tensoriteratormake_layoutshape)r@   rC   
new_strider'   r(   
<listcomp>L   s    z<GemmActMixin.epi_to_underlying_arguments.<locals>.<listcomp>)r   r   r   r   )r   r>   postact_dtyper%   utils
LayoutEnumfrom_tensorpostact_layoutcta_tile_shape_mnkcta_tile_shape_postact_mnepi_tilearchsm100_utils
sm90_utilsmake_smem_layout_epi	epi_stage_make_tma_epi_atoms_and_tensorsr   r   r1   r   r   r   )selfr4   r2   r3   r-   	utils_clsr,   r*   tma_tensor_postactr   r   r'   rM   r(   epi_to_underlying_arguments5   s:   




z(GemmActMixin.epi_to_underlying_argumentsparamsc                C   s   |j gS N)r*   )r^   rb   r2   r3   r'   r'   r(   epi_get_tma_atoms^   s   zGemmActMixin.epi_get_tma_atomscu_seqlens_m	batch_idxc                C   s:   |d ur
||d  nd g}t | j rdndg}||fS )Nr   r   )r   rT   is_m_major_c)r^   rb   re   rf   r2   r3   shapesordersr'   r'   r(   &epi_get_tensormap_update_shapes_ordersc   s   	z3GemmActMixin.epi_get_tensormap_update_shapes_ordersrU   rW   c                 C   s8   | j j}tt||jd  }t| ||}|| S )N   )r   r>   r#   sizerL   r?   r   epi_smem_bytes_per_stage)r4   rU   rW   rP   postact_bytes_per_stagerowvec_colvec_bytesr'   r'   r(   rm   p   s   z%GemmActMixin.epi_smem_bytes_per_stagec                    s   j d u rdnjd jd u rdnjd j d ur!j jntjd ur,jjnt tjG  fddd}|S )Nr   r   c                       sz   e Zd ZU ejjejjf df ed< ejjejj f df ed< ejjejjje	j
f jf ed< dS )z:GemmActMixin.epi_get_smem_struct.<locals>.EpiSharedStorage   sRowVecsColVecsPostActN)r    r!   r"   r#   structAlignMemRanger$   rP   cosizer,   buffer_align_bytesr'   col_vec_dtypecol_vec_smem_sizerb   row_vec_dtyperow_vec_smem_sizer^   r'   r(   EpiSharedStorage   s   
   r~   )r   rU   r   r>   r	   r#   rt   )r^   rb   r~   r'   ry   r(   epi_get_smem_struct{   s   
z GemmActMixin.epi_get_smem_struct.c                    s6   t  ||\}}|jjj|jj|jjd}|||fS )N)swizzle)superepi_get_smem_tensorsepirs   
get_tensorr,   outerinner)r^   rb   storagerq   rr   rs   	__class__r'   r(   r      s   
z!GemmActMixin.epi_get_smem_tensorsepi_smem_tensorstma_desc_epi_ptrsepi_pipelineepi_store_pipelineepi_read_stateepi_producer_stateload_acc_subtiletRS_rDtRS_rCtiled_copy_t2rtiled_copy_r2stRS_sDtiled_copy_s2rtSR_rCtSR_sCcopy_Dcopy_Ctile_coord_mnklvarlen_managerepilogue_barriertidxis_tma_warpc           6         s  t |d u}t  d u|j}|j}|\}}}| jdkr#ttj|dntj} | | j	| j
| j}!t|!|}"|"||}#|\}$|d }%| j||||%| j|j|||$d\}&}&tt| jd d |jd }'tj|'|'d dfd}(t|'})|j|) }*| ||||||||	}+t |d urtjt|)| jddD ]},|(|,}-r| | ||-|d	 |!| |"  q fd
d}.d}/d\}0}1t#|)D ]},|(|,}2|	|
|, | $||+|2}3t |r.|%| t&||d d d |j'f | tjj(tjj)j*tjj+j,d tj-  tj.  |/| W d    n	1 s%w   Y  |"  t |d uo:|,| j |)k r\|(|,| j }-rX| | ||-|d	 |!| |"  | 0||3|
|}4|*|, | j1 }5t |/rt |,dkr}|.|0|1d |5|2}0}1t rt23||
|d d d |5f  t&|"|"4|4|#d d d |5f  t |/ r|.|5|2d qt |/r|.|0|1d | 5||+|||||| ||fS )Nr7   )tiled_tmem_load   )tma_desc_ptrr6   r   rH   )unroll)src_idxproducer_statec                    sx   t jjt jjjt jjjd   r$tr | |d | |d t	fdd t	fdd   d S )Nspacer   dst_idxc                            S rc   )producer_commitr'   r   r'   r(   rG          z=GemmActMixin.epilogue.<locals>.tma_store_fn.<locals>.<lambda>c                      r   rc   )producer_acquirer'   r   r'   r(   rG      r   )
r#   rX   fence_proxy	ProxyKindasync_sharedSharedSpace
shared_ctaarrive_and_waitr   r   r   r   copy_postactr   r   has_Dr   r'   r(   tma_store_fn   s   z+GemmActMixin.epilogue.<locals>.tma_store_fnT)NNr   r   r   )6r   r*   r+   rX   r   rY   get_smem_store_opsm90_utils_ogsm90_get_smem_store_oprT   rP   	acc_dtyper#   make_tiled_copy_S	get_slicepartition_Depilog_gmem_copy_and_partitionoffset_batch_epirV   r-   zipped_dividerK   rU   rL   rl   num_tiles_executed	epi_beginr%   rangeminepi_c_stageget_hier_coordr   r   advancerange_constexprepi_begin_loopconsumer_waitcopyindexr   r   r   r   r   	sync_warp	elect_oneconsumer_releaseepi_visit_subtiler\   
copy_utilscvt_copyretileepi_end)6r^   rb   r   r   r   r   r   r   rW   r   r   r   r   r   r   r   r   r   r   r   r   r   r   tile_schedulerr   r   has_Cr*   r+   rq   rr   rs   r   copy_atom_postact_r2stiled_copy_postact_r2stRS_sPostActtma_desc_postact_ptrrf   _epi_tile_shapeepi_tile_layoutepi_tile_numnum_prev_subtilesepi_tensorsepi_idxgmem_coord_Cr   delay_tma_storesrc_idx_prevdst_idx_prev
gmem_coordepi_loop_tensorstRS_rPostAct
epi_bufferr'   r   r(   epilogue   s   



















zGemmActMixin.epiloguer   c                 C   s   t | |||| t|jd urdt|jj| j}t| j	dk r7t
jt|ddD ]}||| ||< q*n/t
jt|d ddD ]}||d|  |d| d  f\|d| < |d| d < qCn|}t|| j}|| | j |S )Nr7   T)unroll_fullr6   r   )r   r   r   r   r#   make_fragmentlayoutrL   r   rX   r%   r   rl   make_fragment_likerP   r8   loadto)r^   rb   r   r   r   r   itRS_rPostAct_outr'   r'   r(   r   >  s    zGemmActMixin.epi_visit_subtilerc   )*r    r!   r"   r   intr$   r   r   r)   r   r1   ra   listr#   r.   rd   r   r   r   rE   rj   staticmethodr   r0   rm   r   r   jitPointerr%   pipelinePipelineAsyncPipelineStater   	TiledCopyCoordr   NamedBarrierr
   r   r   __classcell__r'   r'   r   r(   r      s   
 
*


 	
 $r   c                   @      e Zd ZdS )GemmActSm90Nr    r!   r"   r'   r'   r'   r(   r  [      r  c                   @   r  )GemmActSm100Nr  r'   r'   r'   r(   r  _  r  r  )Nrelurelu_sqgelu_tanh_approxFTrk   ABDCPostActtile_count_semaphore
activationtile_Mtile_N	cluster_M	cluster_Npingpong
persistentmax_swizzle_sizerowvec_biascolvec_biasre   A_idxr5   c           '      C   sB  |d ur/|s
J d|  ddksJ d|d ur$| ddks$J d| ddks/J d|d u}|rE|d us=J d|
dksEJ d|tv sPJ d	| tj| |||d
|i||d\}}}}}tj||d ud t| dddddd}t|| t| j}|d dv sJ d|d dkrt	nt
}t}||f}|	|
df}||d j|d j||d j|d j|d jstd|rt|	|
 nd}t|| t| }|j|d
 j||d urt| ddjddnd |d urt| ddj|d u rdnddnd d} tj|||d}!t|d |||||j|}"t }#tj|||||||d u|||d ur5|jnd |d ur>|jnd |d u|d udd}$tj}%|$|%vr|d dkr_t|||d }|||d j|||d!}&t !|&|d j|d j|d j|d" j| |!|"|#	|%|$< |%|$ |d j|d j|d j|d" j| |!|"|# d S )#Nz!varlen_m requires persistent=Truer   z!varlen_m requires A to be k-majorz!varlen_m requires D to be n-majorz'varlen_m requires PostAct to be n-majorz9gather_A requires varlen (cu_seqlens_m must be specified)zgather_A requires cluster_N=1zUnsupported activation r  )additional_tensorsre   r  )varlen_m)mkl)nr  r  )r  r   r  )r	  r
  r  r  r  r   )	   
   z!Only SM90 and SM100 are supportedr!  r	  r
  r  z;Skipping due to unsupported combination of types and majors   )assumed_align)leading_dim)r   r   )r  )r	  r
  r  r  r  )key_tensor_names)r  is_persistent)gather_Ar  )"rF   
act_fn_mapr   validate_and_prepare_tensorspermute_tensorsextract_dtypesdetermine_major_ordersr   devicer  r  r	   is_valid_dtypesdtypemajor	TypeErrorr   create_cute_tensorsr)   cute_tensorr   detachmark_layout_dynamiccreate_scheduler_argscreate_varlen_argsr   cutlass_torchcurrent_streamget_compile_keygemm_actcompile_cacher   r#   compile)'r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  re   r  r(  LMKNtensor_infosmajor_configsdevice_capacityGemmClsr   tile_shape_mncluster_shape_mnkmax_active_clustersr   epi_argsscheduler_argsvarlen_argsr:  compile_keycachegemm_objr'   r'   r(   r<  k  s   




r<  )FTrk   NNNN)@typingr   r   r   	functoolsr   dataclassesr   torchr   r%   cutlass.cuter#   cutlass.utils.hopper_helpersrQ   hopper_helpersr   cutlass.utils.blackwell_helpersblackwell_helpersrY   r   r	   r
   r   cutlass.cutlass_dslr   cutlass.torchr9  cutlass.cute.runtimer   quack.cute_dsl_utilsr   r   quack.varlen_utilsr   quack.gemm_sm90r   quack.gemm_sm100r   quack.gemm_default_epir   r   r   quack.gemm_wrapper_utilsr   quack.sm90_utilsrZ   quack.copy_utilsr   quack.activationquackr   r  r  r  r  r  r  r)  strr   boolr<  r=  r'   r'   r'   r(   <module>   s     @	

 
