o
    c۷i                     @   s   d dl mZ d dlmZ d dlmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d
d Zd%ddZd&ddZdd Zdd Zdd Z	d'ddZdd Zdd Zdd Z			d(dd Zd!d" Z			d)d#d$Z dS )*    )partialN)Int32Float32)make_ptrcompile_and_cache)make_fake_tensor)torch2cute_dtype_map)TileSchedulerOptionsVarlenArgumentsc                 C   s
   d| j  S )zF16-byte alignment: divisibility in elements = 128 // dtype_width_bits.   )width)dtype r   N/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/gemm_tvm_ffi_utils.pydiv_for_dtype      
r   Fc                 C   s(   | dur| j dkr|s| dddS | S )zTPermute a single 3D tensor from (L, *, *) to (*, *, L), skipping for varlen_m or 2D.N         r   ndimpermute)tvarlen_mr   r   r   perm3d_single   s   (r   c                 C   sP   dd }|r| ||||fS |r| |||||fS || ||||||fS )z/Permute 3D tensors from (L, *, *) to (*, *, L).c                 S   s$   | d ur| j dkr| dddS | S )Nr   r   r   r   r   )r   r   r   r   _perm   s   $zperm3d.<locals>._permr   )ABDCr   varlen_kr   r   r   r   perm3d   s   r#   c                 C   s   |  ddkr	|S |S )Nr   )stride)r   dim0dim1r   r   r   	get_major*   s   r'   c                 C   sH   t | dd}t |dd}t |dd}|d urt |ddnd }||||fS )Nmkn)r'   )A_pB_pD_pC_pa_majorb_majord_majorc_majorr   r   r   
get_majors.   s
   r3   c                 C   s@   t | j }t |j }t |j }|d urt |j nd }||||fS N)r	   r   )r   r   r    r!   a_dtypeb_dtyped_dtypec_dtyper   r   r   
get_dtypes6   s
   


r9   c                 C   s(   t | d ||d ur| |dS d |dS )N)max_active_clustersraster_ordermax_swizzle_sizetile_count_semaphorebatch_idx_permute)r
   data_ptrr:   r<   r=   r>   r   r   r   make_scheduler_args>   s   rA   c              
   C   sJ   t tdtd| rttdtjjddnd |r!tt|fddddS d dS )Nr      r      )assumed_alignleading_dimdivisibilityr@   )r
   r   r   cuteAddressSpacegmemfake_tensor)has_semaphorehas_batch_idx_permutel_symr   r   r   make_fake_scheduler_argsL   s   	rO   c                 C   s"   | d u r
|d u r
d S t | ||dS )NmCuSeqlensMmCuSeqlensKmAIdxr   )cu_seqlens_mcu_seqlens_kA_idxr   r   r   make_varlen_args[   s   rW   c              	   C   sh   | s|sd S t  }t| rtt|fdddnd |r"tt|fdddnd |r0tt|fddddS d dS )Nr   rC   rE   rP   )rH   sym_intr   rK   r   )r   r"   gather_Aaidx_lennum_seqlensr   r   r   make_fake_varlen_argse   s   r\   c                 C   s  |dkrdnd}|dkrdnd}|dkrdnd}|dkrdnd}t  t  t  t  f\}}}}t| }t|}|durBt|nd}|durLt|nd}|rt  }|
rZt  n|}t| ||f||d}t||||f||d}t|||f||d}t|||f||d}ne|	rt  }|
rt  n|}t| ||f||d}t|||f||d}t||||f||d}t||||f||d}n,t| |||f||d}t||||f||d}t||||f||d}t||||f||d}||||||||fS )a  Create fake tensors for mA, mB, mD, mC with shared sym_ints.
    Pass dtype=None to get None for that tensor (e.g. optional C).
    Returns (mA, mB, mD, mC, m, n, k, l).
    When varlen_m, m is total_m (flattened M of D/C). When varlen_k, k is total_k.
    r)   r   r   r*   NrE   )rH   rX   r   rK   )r5   r6   r7   r8   r/   r0   r1   r2   r   r"   rY   	a_leading	b_leading	d_leading	c_leadingr(   r*   r)   ldiv_adiv_bdiv_ddiv_ca_mmAmBmDmCa_kr   r   r   make_fake_gemm_tensorsv   s8   $rl   c                 C   s
   t | |S )zWFilesystem caching via compile_and_cache. Use @lru_cache on caller for in-memory cache.r   )key
compile_fnr   r   r   cached_compile   r   ro   c              
   C   s   |d dkrt | ||d} | t||||d}|r|| tjjdd}|d dkr+dn||f}tj|||	|
|||||g	|R dd	iS )
zJBuild GemmCls instance, apply SM90 partial, and cute.compile with TVM-FFI.r   	   )pingpongis_persistent)rY   T)use_tvm_ffi_env_streamr   optionsz--enable-tvm-ffi)r   r   rH   runtimemake_fake_streamcompile)GemmClsr5   tile_shape_mncluster_shape_mnkrq   
persistentrY   device_capacityrg   rh   ri   rj   epi_argsscheduler_argsvarlen_args	post_initmSFAmSFBgemm_objstreamsf_argsr   r   r   compile_gemm_kernel   s,   
r   )F)FFr4   )FFF)NNN)!	functoolsr   cutlass.cuterH   cutlassr   r   cutlass.cute.runtimer   quack.cache_utilsr   quack.compile_utilsr   rK   quack.cute_dsl_utilsr	   quack.tile_schedulerr
   quack.varlen_utilsr   r   r   r#   r'   r3   r9   rA   rO   rW   r\   rl   ro   r   r   r   r   r   <module>   s:   

	


3