o
    ½Ù¾i©  ã                   @   s`  d dl mZmZ d dlZzd dlmZ W n ey( Z zeZW Y dZ[ndZ[ww dZe	dƒZ
			d"dejdeded	ee d
edee deejejf fdd„Z						d#dejdejdejdejdedejdejdee dedejdB dejdB d
edeej deejejf fdd„Z	d$dejdejdejdededeejejejf fd d!„ZdS )%é    )ÚOptionalÚTupleN)Úflashmla_opszLFailed to load sgl_kernel.flashmla_ops extension. Ensure CUDA Driver >= 12.4FÚcache_seqlensÚnum_q_tokens_per_head_kÚnum_heads_kÚnum_heads_qÚis_fp8_kvcacheÚtopkÚreturnc                 C   s:   |r|du rt jjj | ||¡S t jjj | |||||¡S )aÕ  
    Arguments:
        cache_seqlens: (batch_size), dtype torch.int32.
        num_q_tokens_per_head_k: Equals to num_q_tokens_per_q_seq * num_heads_q // num_heads_k.
        num_heads_k: The number of k heads.
        num_heads_q: The number of q heads. This argument is optional when sparse attention is not enabled
        is_fp8_kvcache: Whether the k_cache and v_cache are in fp8 format.
        topk: If not None, sparse attention will be enabled, and only tokens in the `indices` array passed to `flash_mla_with_kvcache_sm90` will be attended to.

    Returns:
        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), dtype torch.int32.
        num_splits: (batch_size + 1), dtype torch.int32.
    N)ÚtorchÚopsÚ
sgl_kernelÚ#get_mla_decoding_metadata_dense_fp8ÚdefaultÚget_mla_decoding_metadata)r   r   r   r   r	   r
   © r   úH/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/flash_mla.pyÚget_mla_metadata   s   
ý
úr   ÚqÚk_cacheÚblock_tableÚ
head_dim_vÚtile_scheduler_metadataÚ
num_splitsÚsoftmax_scaleÚcausalÚ	descale_qÚ	descale_kÚindicesc                 C   sº   |du r| j d d }|dur|dksJ dƒ‚|	du |
du ks#J dƒ‚|du rE|  ¡ dkrEtjjj | |||||||||	|
¡\}}||fS tjjj | ||||||||||¡\}}||fS )aÃ  
    Arguments:
        q: (batch_size, seq_len_q, num_heads_q, head_dim).
        k_cache: (num_blocks, page_block_size, num_heads_k, head_dim).
        block_table: (batch_size, max_num_blocks_per_seq), torch.int32.
        cache_seqlens: (batch_size), torch.int32.
        head_dim_v: Head dimension of v.
        tile_scheduler_metadata: (num_sm_parts, TileSchedulerMetaDataSize), torch.int32, returned by get_mla_metadata.
        num_splits: (batch_size + 1), torch.int32, returned by get_mla_metadata.
        softmax_scale: float. The scale of QK^T before applying softmax. Default to 1 / sqrt(head_dim).
        causal: bool. Whether to apply causal attention mask.
        descale_q: (batch_size), torch.float32. Descaling factors for Q, used for fp8 quantization.
        descale_k: (batch_size), torch.float32. Descaling factors for K, used for fp8 quantization.
        is_fp8_kvcache: bool. Whether the k_cache and v_cache are in fp8 format. For the format of FP8 KV cache, please refer to README.md
        indices: (batch_size, seq_len_q, topk), torch.int32. If not None, sparse attention will be enabled, and only tokens in the `indices` array will be attended to. Invalid indices should be set to -1 or numbers >= total_seq_len_kv. For details about how to set up `indices`, please refer to README.md.

    Returns:
        out: (batch_size, seq_len_q, num_heads_q, head_dim_v).
        softmax_lse: (batch_size, num_heads_q, seq_len_q), torch.float32.
    Néÿÿÿÿg      à¿Fz6causal must be `false` if sparse attention is enabled.z<descale_q and descale_k should be both None or both not Noneé   )ÚshapeÚelement_sizer   r   r   Úfwd_kvcache_mla_fp8r   Úfwd_kvcache_mla)r   r   r   r   r   r   r   r   r   r   r   r	   r   ÚoutÚsoftmax_lser   r   r   Úflash_mla_with_kvcache6   sL   #ÿþ
õ
óõr(   é   ÚkvÚsm_scaleÚd_vc                 C   s   t jjj | ||||¡}|S )aK  
    Sparse attention prefill kernel

    Args:
        q: [s_q, h_q, d_qk], bfloat16
        kv: [s_kv, h_kv, d_qk], bfloat16
        indices: [s_q, h_kv, topk], int32. Invalid indices should be set to -1 or numbers >= s_kv
        sm_scale: float
        d_v: The dimension of value vectors. Can only be 512

    Returns:
        (output, max_logits, lse)
        About the definition of output, max_logits and lse, please refer to README.md
        - output: [s_q, h_q, d_v], bfloat16
        - max_logits:  [s_q, h_q], float
        - lse: [s_q, h_q], float, 2-based log-sum-exp
    )r   r   r   Úsparse_prefill_fwdr   )r   r*   r   r+   r,   Úresultsr   r   r   Úflash_mla_sparse_fwd€   s   

ÿr/   )NFN)NFNNFN)r)   )Útypingr   r   r   r   r   Ú	ExceptionÚ_eÚ_flashmla_import_errorÚImportErrorÚ_IMPORT_ERRORÚTensorÚintÚboolr   Úfloatr(   r/   r   r   r   r   Ú<module>   sž    €ÿÿ	úÿþýüûú
ù-óÿþýüûúùø	÷
öõôó
òOûÿþýüûú