o
    c۷i.                     @   s   d dl mZmZmZ d dlmZ d dlmZ d dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d dlmZ d dlmZ d d
lmZ G dd dZG dd deeZG dd deeZ dS )    )
NamedTupleOptionalTuple)partial)	dataclassN)Int32Float32Boolean
const_expr)
ParamsBasemlir_namedtuple)GemmSm90)	GemmSm100)partition_for_epilogue)VarlenManagerc                   @   sH  e Zd ZeG dd deZeG dd deZddddedefd	d
Z	e
jdedee
jdf de
jdee
j de
jde
jdedejjdefddZdede
jfddZe
j	d&dedee
jdf de
jdee
j dee
j f
ddZedee deeeef de
jdefd d!Zdefd"d#Zdedee
jdf fd$d%ZdS )'GemmDefaultEpiMixinc                   @   sp   e Zd ZU dZeeejB  ed< dZ	eeejB  ed< dZ
eej ed< dZeej ed< dZeje ed< dS )z%GemmDefaultEpiMixin.EpilogueArgumentsNalphabetamRowVecBroadcastmColVecBroadcastFadd_to_output)__name__
__module____qualname__r   r   r   cuteTensor__annotations__r   r   r   r   cutlass	Constexprbool r    r    L/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/gemm_default_epi.pyEpilogueArguments   s   
 r"   c                   @   s^   e Zd ZU dZeeejB  ed< dZ	eeejB  ed< dZ
eej ed< dZeej ed< dS )z"GemmDefaultEpiMixin.EpilogueParamsNr   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r    r    r    r!   EpilogueParams   s
   
 r#   N)locipargsreturnc                   s<   dd   fdd|j |jfD \}}| j|j|j||dS )Nc                    s   t  fdd jD S )Nc                 3   s4    | ]}t |st j|d  jj dn|V  qdS )    )divbyN)r   	is_staticassumeelement_typewidth).0str    r!   	<genexpr>(   s
    "
zTGemmDefaultEpiMixin.epi_to_underlying_arguments.<locals>.<lambda>.<locals>.<genexpr>)tuplestrider0   r    r0   r!   <lambda>(   s    zAGemmDefaultEpiMixin.epi_to_underlying_arguments.<locals>.<lambda>c              	      s6   g | ]}|d urt |jt j|j |dnd qS )Nr4   )r   make_tensoriteratormake_layoutshape)r.   r1   
new_strider    r!   
<listcomp>,   s    zCGemmDefaultEpiMixin.epi_to_underlying_arguments.<locals>.<listcomp>)r   r   r   r   )r   r   r#   r   r   )selfr&   r$   r%   r   r   r    r;   r!   epi_to_underlying_arguments$   s   


z/GemmDefaultEpiMixin.epi_to_underlying_argumentsparamsepi_smem_tensors.epi_tiletiled_copy_t2rtiled_copy_r2stile_coord_mnklvarlen_managerepilogue_barriertidxc
           *      C   s  d\}
}t t|do|jd urt|j}
t t|do |jd ur(t|j}|^}}}| jd | jd }}|d }| jtj	j
 }tt||d urL|n||	|d u d}d }t |jd ur|jj}t td|j|j }tj|jj||d	d
|	}|j|d f }t||f|d f}||}||}|t|}t|jd |d |  |}tdt|jd ft}tjt|jd d	dD ]}|d|f |k |d|f< qtj||||d |t |j!tj"||fdd}t |d ur|#|}d } t |j$d ur|j$j}!t td|!j|!j }tj|j$j||d	d
|	}"t |j% r0|j$|d f }#nt&|j'j(| f|j$}#t|#|f|d f}$|"|$}%|"|}&|"t|}'t|)||d |  |}(tdt|&jd ft})tjt|&jd d	dD ]}|'d|f |(k |)d|f< qtj|"|%|&|)d |t |j!tj"||fdd} t |d ur|#| } t |jd up|j$d urtj	*  tj	+d |,  |
||| fS )N)NNr   r   r         )rB   
tiled_copyrH   reference_srcr(   T)is_asyncunroll_full)pred)r   rI   r6   )rI   r   )-r
   hasattrr   utilsload_scalar_or_pointerr   cta_tile_shape_mnknum_epi_warpsr   arch	WARP_SIZEr   r   r   r,   maxr-   
copy_utilstiled_copy_1d	get_slice
local_tilepartition_Spartition_Dmake_identity_tensorminr:   make_rmem_tensorsizer	   r   rangecopyr7   r8   r9   retiler   varlen_mdomain_offsetr@   cu_seqlens_mlen_mcp_async_commit_groupcp_async_wait_grouparrive_and_wait)*r>   r@   rA   rB   rC   rD   rE   rF   rG   rH   r   r   sRowVecsColVecresttile_Mtile_N	batch_idxnum_epi_threadspartition_for_epilogue_fn	tDsRowVecrowvec_dtypenum_copy_elemsthr_copy_RVmRowVecgRowVectRVgRVtRVsRVtRVcRVlimit_ntRVpRVm	tDsColVeccolvec_dtypethr_copy_CVmColVecgColVectCVgCVtCVsCVtCVcCVlimit_mtCVpCVr    r    r!   	epi_begin9   s   







zGemmDefaultEpiMixin.epi_begin	epi_coordc                 C   s  |\}}}}d }t |d urDt|dt|d d d |f }	t|	j|	j}
tt|	t|
 t	|
| j
}||
 | j
 d }t |d urt|dt|d d d |f }t|j|j}tt|t| t	|| j
}|| | j
 ||||fS )NrJ   )r
   r   group_modesrankra   layoutr,   autovec_copyfilter_zerosmake_fragment_like	acc_dtypestoreloadto)r>   r@   epi_tensorsr   r   r   ru   r   tDrRowVec_cvttDsRowVec_cur	tDrRowVectDrColVec_cvttDsColVec_cur	tDrColVecr    r    r!   epi_begin_loop   s(   

z"GemmDefaultEpiMixin.epi_begin_loopepi_loop_tensorstRS_rDtRS_rCc                 C   s"  |\}}}}|  }	tt|do|jd ur t|j}|	|9 }	t|d urPtt|d p1|jd u r>|	|  |j7 }	nt|j}|	||  |j 7 }	|	|	 t|d urrt
jt|ddD ]}
||
  ||
 7  < qet|d urt
jt|ddD ]}
||
  ||
 7  < qd S )Nr   r   TrN   )r   r
   rQ   r   rR   rS   r   r   r,   r   r   rc   r   rb   )r>   r@   r   r   r   r   r   r   r   rDir    r    r!   epi_visit_subtile   s$   
z%GemmDefaultEpiMixin.epi_visit_subtilerT   c                 C   sp   | j d u rdn|d }| jd u rdn|d }| j d ur| j jnt}| jd ur*| jjnt}||j ||j  d S )Nr   rI      )r   r   r,   r   r-   )r&   rT   rB   row_vec_smem_sizecol_vec_smem_sizerow_vec_dtypecol_vec_dtyper    r    r!   epi_smem_bytes_per_stage   s   z,GemmDefaultEpiMixin.epi_smem_bytes_per_stagec                    s~   |j d u rdn| jd |jd u rdn| jd |j d ur!|j jnt|jd ur,|jjnt tjG  fddd}|S )Nr   rI   c                       sN   e Zd ZU ejjejjf df ed< ejjejj f df ed< dS )zAGemmDefaultEpiMixin.epi_get_smem_struct.<locals>.EpiSharedStorage   rm   rn   N)r   r   r   r   structAlignMemRanger   r    r   r   r   r   r    r!   EpiSharedStorage   s   
  $r   )r   rT   r   r,   r   r   r   )r>   r@   r   r    r   r!   epi_get_smem_struct   s   z'GemmDefaultEpiMixin.epi_get_smem_structc                 C   s`   d }t |jd ur|jjt| jd }d }t |jd ur,|jj	t| jd }||fS )NrI   r   )
r
   r   epirm   
get_tensorr   r9   rT   r   rn   )r>   r@   storagerm   rn   r    r    r!   epi_get_smem_tensors   s   z(GemmDefaultEpiMixin.epi_get_smem_tensors)N)r   r   r   r   r   r"   r   r   r#   r?   r   jitr   r   Tiler   	TiledCopyCoordr   r   pipelineNamedBarrierr   r   r   r   staticmethodintr   r   r   r    r    r    r!   r      sr    
	
] r   c                   @      e Zd ZdS )GemmDefaultSm90Nr   r   r   r    r    r    r!   r          r   c                   @   r   )GemmDefaultSm100Nr   r    r    r    r!   r      r   r   )!typingr   r   r   	functoolsr   dataclassesr   r   cutlass.cuter   r   r   r	   r
   quack.cute_dsl_utilsr   r   quack.gemm_sm90r   quack.gemm_sm100r   quack.sm90_utilsr   quack.utilsrR   quack.copy_utilsrY   quack.varlen_utilsr   r   r   r   r    r    r    r!   <module>   s"    i