o
    پi1                     @  s~   d Z ddlmZ ddlZddlmZ ddlZddlmZm	Z	m
Z
 eeZedEddZedFddZdGd)d*ZdHdCdDZdS )IaU  
Fused metadata copy kernel for NSA backend CUDA graph replay.

This module provides JIT-compiled CUDA kernels for fusing multiple tensor
copy operations into single kernel launches, reducing kernel launch overhead
and improving CUDA graph replay performance.

The kernels are compiled on-demand using TVM FFI and cached for subsequent use.
    )annotationsN)Optional)
cache_onceload_jitmake_cpp_argsforward_modeinthas_real_page_tableboolhas_flashmlac                 C  st   t | ||}ztdg|R dgdd| dfgdW S  ty9 } ztd|  d| d| d	|   d
}~ww )a  Compile JIT module for single-backend fused metadata copy.

    Args:
        forward_mode: 0=DECODE, 1=TARGET_VERIFY, 2=DRAFT_EXTEND
        has_real_page_table: Whether real_page_table tensors are used
        has_flashmla: Whether FlashMLA metadata tensors are used
    fused_metadata_copy#elementwise/fused_metadata_copy.cuhzFusedMetadataCopyKernel<>::run
cuda_filescuda_wrappersz?Failed to compile JIT fused metadata copy kernel (forward_mode=z, has_real_page_table=, has_flashmla=): Nr   r   	Exceptionloggererror)r   r	   r   argse r   Y/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/fused_metadata_copy.py_jit_fused_metadata_copy_module   s8   

r   c                 C  sl   t | |}ztdg|R dgdd| dfgdW S  ty5 } ztd|  d| d|   d	}~ww )
zCompile JIT module for multi-backend fused metadata copy (DECODE mode only).

    Args:
        has_real_page_table: Whether real_page_table tensors are used
        has_flashmla: Whether FlashMLA metadata tensors are used
    fused_metadata_copy_multir   zFusedMetadataCopyMultiKernel<r   r   zLFailed to compile JIT fused metadata copy multi kernel (has_real_page_table=r   r   Nr   )r	   r   r   r   r   r   r   %_jit_fused_metadata_copy_multi_module=   s4   



r   cache_seqlens_srctorch.Tensorcu_seqlens_k_srcpage_indices_srcnsa_cache_seqlens_srcseqlens_expanded_srcOptional[torch.Tensor]nsa_cu_seqlens_k_srcreal_page_table_srcflashmla_num_splits_srcflashmla_metadata_srccache_seqlens_dstcu_seqlens_k_dstpage_table_1_dstnsa_cache_seqlens_dstseqlens_expanded_dstnsa_cu_seqlens_k_dstreal_page_table_dstflashmla_num_splits_dstflashmla_metadata_dstbsmax_lenmax_seqlen_kseqlens_expanded_sizereturnNonec                 C  s   |du}|du}t |||}|  } | }| }| }|dur&| }| }|| |||||||||	|
||||||||||| dS )a  
    Fused metadata copy kernel for NSA backend CUDA graph replay.

    This function fuses multiple tensor copy operations into a single kernel launch,
    reducing kernel launch overhead and improving performance.

    Args:
        cache_seqlens_src: Source cache sequence lengths [bs]
        cu_seqlens_k_src: Source cumulative sequence lengths [bs+1]
        page_indices_src: Source page indices [rows, max_len]
        nsa_cache_seqlens_src: Source NSA cache sequence lengths [size]
        seqlens_expanded_src: Optional source expanded sequence lengths [size] (required for TARGET_VERIFY/DRAFT_EXTEND)
        nsa_cu_seqlens_k_src: Source NSA cumulative sequence lengths [size+1]
        real_page_table_src: Optional source real page table [rows, cols]
        flashmla_num_splits_src: Optional source FlashMLA num_splits [size+1]
        flashmla_metadata_src: Optional source FlashMLA metadata tensor
        cache_seqlens_dst: Destination cache sequence lengths [bs]
        cu_seqlens_k_dst: Destination cumulative sequence lengths [bs+1]
        page_table_1_dst: Destination page table [rows, stride]
        nsa_cache_seqlens_dst: Destination NSA cache sequence lengths [size]
        seqlens_expanded_dst: Optional destination expanded sequence lengths [size] (required for TARGET_VERIFY/DRAFT_EXTEND)
        nsa_cu_seqlens_k_dst: Destination NSA cumulative sequence lengths [size+1]
        real_page_table_dst: Optional destination real page table [rows, cols]
        flashmla_num_splits_dst: Optional destination FlashMLA num_splits [size+1]
        flashmla_metadata_dst: Optional destination FlashMLA metadata tensor
        forward_mode: Forward mode (0=DECODE, 1=TARGET_VERIFY, 2=DRAFT_EXTEND)
        bs: Batch size
        max_len: Maximum length for decode/draft_extend mode
        max_seqlen_k: Maximum sequence length for target_verify mode
        seqlens_expanded_size: Size of expanded sequence lengths
    N)r   
contiguousr   )r   r!   r"   r#   r$   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r   r3   r4   r5   r6   r	   r   moduler   r   r   fused_metadata_copy_cudaa   sH   9r;   cache_seqlens_dst0cu_seqlens_k_dst0page_table_1_dst0nsa_cache_seqlens_dst0nsa_cu_seqlens_k_dst0real_page_table_dst0flashmla_num_splits_dst0flashmla_metadata_dst0cache_seqlens_dst1cu_seqlens_k_dst1page_table_1_dst1nsa_cache_seqlens_dst1nsa_cu_seqlens_k_dst1real_page_table_dst1flashmla_num_splits_dst1flashmla_metadata_dst1cache_seqlens_dst2cu_seqlens_k_dst2page_table_1_dst2nsa_cache_seqlens_dst2nsa_cu_seqlens_k_dst2real_page_table_dst2flashmla_num_splits_dst2flashmla_metadata_dst2c#           &      C  s   |du}#|du}$t |#|$}%|  } | }| }| }| }|%jg | |||||||||	|
|||||||||||||||||||||| |!|"R   dS )a  
    Multi-backend fused metadata copy kernel for NSA backend CUDA graph replay.

    This function copies metadata from one source to THREE destinations in a single
    kernel launch, eliminating the overhead of 3 separate kernel calls. Currently
    only supports DECODE mode, which is the most common case.

    Args:
        cache_seqlens_src: Source cache sequence lengths [bs]
        cu_seqlens_k_src: Source cumulative sequence lengths [bs+1]
        page_indices_src: Source page indices [bs, max_len]
        nsa_cache_seqlens_src: Source NSA cache sequence lengths [bs]
        nsa_cu_seqlens_k_src: Source NSA cumulative sequence lengths [bs+1]
        real_page_table_src: Optional source real page table [bs, cols]
        flashmla_num_splits_src: Optional source FlashMLA num_splits [bs+1]
        flashmla_metadata_src: Optional source FlashMLA metadata tensor
        cache_seqlens_dst0-2: Destination cache sequence lengths for backends 0-2
        cu_seqlens_k_dst0-2: Destination cumulative sequence lengths for backends 0-2
        page_table_1_dst0-2: Destination page tables for backends 0-2
        nsa_cache_seqlens_dst0-2: Destination NSA cache sequence lengths for backends 0-2
        nsa_cu_seqlens_k_dst0-2: Destination NSA cumulative sequence lengths for backends 0-2
        real_page_table_dst0-2: Optional destination real page tables for backends 0-2
        flashmla_num_splits_dst0-2: Optional destination FlashMLA num_splits for backends 0-2
        flashmla_metadata_dst0-2: Optional destination FlashMLA metadata tensors for backends 0-2
        bs: Batch size
        max_len: Maximum length for decode mode
        seqlens_expanded_size: Size of expanded sequence lengths
    N)r   r9   r   )&r   r!   r"   r#   r&   r'   r(   r)   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   r3   r4   r6   r	   r   r:   r   r   r   fused_metadata_copy_multi_cuda   s   B
	
 !"#rT   )r   r   r	   r
   r   r
   )r	   r
   r   r
   )0r   r    r!   r    r"   r    r#   r    r$   r%   r&   r    r'   r%   r(   r%   r)   r%   r*   r    r+   r    r,   r    r-   r    r.   r%   r/   r    r0   r%   r1   r%   r2   r%   r   r   r3   r   r4   r   r5   r   r6   r   r7   r8   )Hr   r    r!   r    r"   r    r#   r    r&   r    r'   r%   r(   r%   r)   r%   r<   r    r=   r    r>   r    r?   r    r@   r    rA   r%   rB   r%   rC   r%   rD   r    rE   r    rF   r    rG   r    rH   r    rI   r%   rJ   r%   rK   r%   rL   r    rM   r    rN   r    rO   r    rP   r    rQ   r%   rR   r%   rS   r%   r3   r   r4   r   r6   r   r7   r8   )__doc__
__future__r   loggingtypingr   torchsglang.jit_kernel.utilsr   r   r   	getLogger__name__r   r   r   r;   rT   r   r   r   r   <module>   s    

 
#f