o
    پi{                     @   s  d dl mZmZ d dlZd dlmZ d dlZd dlm	Z	 d dl
mZmZ ed ejjdejjdiZeejdr@deejj< neejdrLd	eejj< e Ze Ze	 Zd
Zer[dndZdZdd Zdd Zdd Zejedeeed	fddZ	dKdej de!dee" deej ej f fddZ#ejdgeddLd e!d!e!fd"d#Z$d$ej d%ej d&ej d'ej dej f
d(d)Z%ejd*gejjdejjdidd+ddd,d-d.d/d0d1Z&ejd*gg d2d3d+dd,d4d5e!d6e!d7e!d8e!d9e!d:ee' d;e!fd<d=Z(ejd>d*gejjdejjdidd+ddd,d.d?d@dAZ)ejd*gejjdejjdidd,d.dBdCdDZ*	EdMd$ej dFej dGej d:e'dHe!dej fdIdJZ+dS )N    )OptionalTupleN)is_fp8_fnuz)is_gfx95_supportedis_hipWARNINGTTL_DISABLE_FAST_MATHTL_ENABLE_FAST_MATHFbfloat16float8_e4m3fnuzfloat8_e4m3float32c              	   C   sB   t d| }|d? d@ }|d@ }t d|d t |dkdd S )	Nuint32      i int32   r      )TreinterpretCastif_then_else)xbits_xexp_xman_bits r   c/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/nsa/tilelang_kernel.pyfast_log2_ceil   s   "r   c                 C   s   | d d> }t d|S )Nr   r   r   )r   r   )r   r   r   r   r   	fast_pow2&   s   r   c                 C   s   t t| | S )N)r   r   )amaxfp8_max_invr   r   r   fast_round_scale+   s   r"   )pass_configsc                    s   t d tr	dndtrdndd 
rdndd	d
t jdt j ff dt j f	f dt j t ff f 	
fdd}|S )NMg      lg      |g      l@g      |@r   r             XYSc              	      s  t jt  t dd\}}t f}t f}t f}t f}t f	}	t f	}
t jddD ]}t | | | f | t || t j||dd t D ] }t 	|| d||< 
rt
|| ||< qk||  ||< qkt D ]\}}t |||f ||  |	||f< qt D ]}|| || | |f< qt |	|
 t |
|| | f  qHW d    d S 1 sw   Y  d S )Nr'   threadsr   
num_stagesdimg-C6?)r   Kernelceildivalloc_sharedalloc_fragment	Pipelinedcopyreduce_absmaxParallelmaxr"   clamp)r(   r)   r*   pid_mpid_nx_sharedx_local
amax_locals_localy_localy_shared_ijr$   Nblk_mfp8_maxr!   fp8_min
group_sizein_dtyper.   	out_dtyperound_scalescale_dtyper   r   act_quant_kernel_;   s8   ""z+act_quant_kernel.<locals>.act_quant_kernel_)r   symbolic_is_fp8_fnuz	prim_funcTensorr2   )rG   rL   rM   rO   rN   rP   r   rF   r   act_quant_kernel/   s    
&#rU   r'   r   
block_size	scale_fmtreturnc                 C   s   |   sJ d| d| dksJ d| d| d}tr)tj| tjd}ntj| tjd}| jg |  dd || R dtji}t	||dud	}|| 
d||
d||
d||  ||fS )
av  
    Quantizes the input tensor `x` using block-wise quantization.

    Args:
        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
        scale_fmt (Optional[str], optional): The format of the scale. Default is None.
    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The quantized tensor with dtype `torch.float8_e4m3fn`.
            - A tensor of scaling factors with dtype `torch.float32`.
    zInput tensor must be contiguousr   z@Last dimension size must be divisible by block_size (block_size=))dtypeNr[   )rN   )is_contiguoussizerR   torch
empty_liker   float8_e4m3fn	new_emptyr   rU   view)r   rV   rW   rG   yskernelr   r   r   	act_quantb   s   

,(rf      )out_idxr#   hdc                    s   t d t dt dddt jdt j ftf dt j ftf dt j ftf d	t j ftf d
t j ftf dd f fdd}|S )Nbmn   r'   qq_skk_sorX   c                    s  t  t \}}}t ft}t | ||ddf | t t}	t |||df |	 t j ddD ]}
t ft}t ||| |
  df | t t}t ||| |
  f | t ft}s~t 	|d t j
|||ddd t D ]\}}t |||f d|	|  |||f< qt t}t j||dd t D ]}||  || 9  < qt ||||| |
  f  q=W d    d S 1 sw   Y  d S )	Nr   r%   r-   FT)transpose_Atranspose_Bclear_accumr   r/   )r   r1   r2   r3   FP8r6   r4   FP32r5   fillgemmr8   r9   
reduce_sum)ro   rp   rq   rr   rs   i_bi_mi1_nq_smemq_s_fragi2_nk_smemk_s_fraglogitsi_hi3_n
logits_sumrk   blk_n1blk_n2rv   rj   ri   rl   rm   r   r   fp8_index_kernel_   s<   " 	&$"z+fp8_index_kernel.<locals>.fp8_index_kernel_)r   rQ   rS   rT   rw   rx   )ri   rj   rv   r   r   r   r   fp8_index_kernel   s(   


,r   ro   rp   rq   rr   c                 C   sF   t rt| jd | jd d| |||S t| jd | jd | |||S )a  
    Perform index score using FP8 precision.

    Args:
        q (torch.Tensor): The Q tensor, must be contiguous.
        q_s (torch.Tensor): The scaling factor for Q (float), must be contiguous.
        k (torch.Tensor): The K tensor, must be contiguous.
        k_s (torch.Tensor): The scaling factor for K (e8m0 here), must be contiguous.

        fp8 q @ fp8 k -> fp32 logits
        relu(fp32 logits) * q_s (weights) -> fp32 logits
        fp32 logits -> fp32 logits_sum
        fp32 logits_sum * k_s (e8m0) -> fp32 index_score
    r%      F)_is_hipr   shape)ro   rp   rq   rr   r   r   r   	fp8_index   s   " r   rY   r   @   r%      )kv_groupsm_scale	is_causalblock_Ir.   r,   c                   s  |t j|ksJ d| |t j|ksJ d| |dks&J d|| dks0J dd u r?d||  d d nd td	td
td}
| 	 }| || g}|
	|| g}| |g}	|g}d}dd|}tt j|d|kr	dksJ | t |||||dkr|d dksJ d|d nddkrndtjdt|dt|dt||dt|f 	
fdd}|S )N+haven't check padding correctness yet, dim=Tznon-casual is not supportedr   Cotherwise will load some index=0 thus causing wrong kv to be loaded      ?      ?/ldG?batchseq_len
seq_len_kvr   r
   float   r   r   "head_kv should be a multiple of 64QKVIndicesOutputc                     s  t j 	dE\}}}t g}t g}t  g}	t  g}
t g}t  gd}t g}t  g}t  g}t g}t g}t g}t g}t g}t |d t |d t |d ||}}dkr|n| }|}|}| dkrdn| d  }| }t | ||||d f | t | ||||d f | t j
dD ]'}t  D ]}|||||  | f dk||< qt  D ]\}}|||||||  | f ||f |	||f< qt  D ]\}}|||||||  | f || f |
||f< qt  D ]\}}t || dt 	|j
 |||f< q@t j||	|dt jjd	 t j||
|dt jjd	 t || t j||dd
d t D ]}t || ||   ||< qt  D ]\}}t |||f  ||   |||f< qt j||dd t D ]}|| ||  ||  ||< qt D ]\}}|||f ||  |||f< qt || t j||	|t jjd qt D ]\}}|||f  ||   < q	t D ]}t || ||   ||< q t || t ||||||d d f  W d    d S 1 sUw   Y  d S )Nr+   boolr      r   r   r-   Tru   policyFr0   clearr/   r   )r   r1   r3   r4   ry   r6   r5   r8   r   infinityr[   rz   GemmWarpPolicyFullCol
reduce_maxexp2r{   log2) r   r   r   r   bxbybzQ_sharedQ_tail_shared	KV_sharedK_tail_sharedO_sharedmaskacc_oacc_sS_sharedsumexpsumexp_ialpham_im_i_prevb_ig_is_iq_imax_kv_iH0H1i_ibi_id_ih_iBIDD_tailH_per_blockNIREPLICATE_Haccum_dtyper   r[   r   r.   padded_Hr   r   r,   r   r   main  s   
   """  ""$z,sparse_attention_fwd_kernel_v1.<locals>.main	tilelangmathnext_power_of_2r   rQ   r9   cdivrS   rT   )	num_headsr0   tail_dimtopkr   r   r   r   r.   r,   r   head_kvq_shapekv_shapeo_shapeindices_shapeindices_dtypeHr   r   r   r   sparse_attention_fwd_kernel_v1   sh   







,dr   )
z-O3z-Wno-deprecated-declarationsz-U__CUDA_NO_HALF_OPERATORS__z-U__CUDA_NO_HALF_CONVERSIONS__z-U__CUDA_NO_HALF2_OPERATORS__z"-U__CUDA_NO_BFLOAT16_CONVERSIONS__z--expt-relaxed-constexprz--expt-extended-lambdaz,--ptxas-options=-v,--register-usage-level=10z-DNDEBUG)rh   compile_flags)r   r   r   r   r0   r   r   r   r   r   c                   s  |t j|ksJ d| |t j|ksJ d| || dks(J dd u r7d||  d d nd dtdtd	td
}| || g}|||| g}	| |g}
||g}d	dd| }tt j| d

|kr|dksJ | t ||d dksJ d||| dkr| d dksJ d| d nddkr
ndtjdt|dt|	dt|	dt|
f 	
fdd}|S )Nr   r   r   r   r   r     r   r   	num_pagesr   r
   r   r   r   r%   zNI should be a multiple of 2r   r   r   r   r   r   c           9         s  t j dd\}}}t d g}t d g}t g}	t  d g}
t  d g}t  d g}t  d g}t  g}t  g}|}|}t j gddd}t j gddd}t d g}t d g}t  g}t  g}t g}t g}t g}t jgdd}t g}t g}t g}t dg	}t dg	} t jdd}!t jd	d}"t jd	d}#t jd
d}$t jd
d}%t jd
d}&t jd
d}'t jd	d}(t jd	d})t jd	d}*t jd	d}+||},}-dkr-|n| }.|-
 dkr;dn| d  }/|/ }0t  }1t | |,|.|/|0dd f | t | |,|.|/|0d f | t | |,|.|/|0df |	 t |! |1d	k r)t 	dd t 
|d t 
|d t 
|d t |!d t t dD ]}2t |"d |2d@  t |( t |(d t  D ]\}3}4t ||4 dt |j ||3|4f< qt j||
|ddd t j|||ddd t j|	||ddd t d |2dkrt |' t |'|2d d@ dA  t || t j||ddd t D ]}3t ||3 ||3   ||3< q2t  D ]\}3}4t ||3|4f  ||3   ||3|4f< qKt j||dd t D ]}3||3 ||3  ||3  ||3< qst d D ]\}3}5||3|5f  ||3 9  < qt || t || t ||
| t |& t |$d  t |#d |2d@  t |( t |(d t  D ]\}3}4t ||4 dt |j ||3|4f< qt j|||ddd t j|||ddd t j|	||ddd t d t |' t |'|2d d d@ dA  t || t j||ddd t D ]}3t ||3 ||3   ||3< q;t  D ]\}3}4t ||3|4f  ||3   ||3|4f< qTt j||dd t D ]}3||3 ||3  ||3  ||3< q|t d D ]\}3}5||3|5f  ||3 9  < qt || t || t ||| t |& t |%d  qt D ]	}3||3 ||3< qt |+ t d D ]\}3}5||3|5f  ||3   < qt D ]}3t ||3 ||3   ||3< qt || t |||,|.|/|0dd f  n}|1d	kr|1d
k rt 
|d t t dD ]}2t |& t |&|2d d@  t |) t |)d t d D ]\}3}5||3|5f  ||3 9  < qft ||| t |$d  t |' t |& t |&|2d d d@  t |) t |)d t d D ]\}3}5||3|5f  ||3 9  < qt ||| t |%d  |2t dd krt |' qBt |+d t d D ]\}3}5||3|5f  ||3   < qt || t |||,|.|/|0d f  n|1d
krt 	dd d|d< t t dD ]}2t |$d |2d@ dA  t |* t |*d t dD ]}6||,|.|-|2d   |6d  |1d
 d  f | d< | d dk||6d |1d
 d  < ||6d |1d
 d   r| d |d< t dddz t dD ]l}7t dD ]c}8||,|d |-d|7 |1d
 d d  |8 f |
|6d |1d
 d  d|7 |1d
 d d  |8 f< ||,|d |-d d|7  |1d
 d d  |8 f ||6d |1d
 d  d|7 |1d
 d d  |8 f< qqW d   n	1 sw   Y  t ddd9 t dD ]+}8||,|d |-|1d
 d d  |8 f ||6d |1d
 d  |1d
 d d |8 f< q'W d   n	1 s^w   Y  qRt |"d  t |%d |2d@ dA  t |* t |*d t dD ]}6||,|.|-|2d d   |6d  |1d
 d  f | d< | d dk||6d |1d
 d  < ||6d |1d
 d   r| d |d< t dddz t dD ]l}7t dD ]c}8||,|d |-d|7 |1d
 d d  |8 f ||6d |1d
 d  d|7 |1d
 d d  |8 f< ||,|d |-d d|7  |1d
 d d  |8 f ||6d |1d
 d  d|7 |1d
 d d  |8 f< qܐqW d   n	1 sMw   Y  t ddd9 t dD ]+}8||,|d |-|1d
 d d  |8 f ||6d |1d
 d  |1d
 d d |8 f< q_W d   n	1 sw   Y  qt |#d  q3W d   dS W d   dS W d   dS W d   dS 1 sw   Y  dS )z
        Q: [b, qo_len, H, D + D_tail] (bfloat16)
        KV: [b, num_pages, kv_group, D + D_tail] (bfloat16)
        Indices: [b, qo_len, kv_group, topk] (int32)
        r   r+   r%   r   shared)scoper   )arrive_countr'   r   r   r   N   r   TrY   )ru   wg_waitFr   r/   P   rg   r      defaultasync_scope)r   r1   r3   r4   alloc_localalloc_barrierget_thread_bindingr6   barrier_arriveset_max_nregry   barrier_waitserialr2   r8   r   r   r[   rz   
wait_wgmmar   r   r{   r   attr
vectorizedcp_async_barrier_noinc)9r   r   r   r   r   r   r   
Q_shared_l
Q_shared_rr   KV_shared_0_lKV_shared_0_rKV_shared_1_lKV_shared_1_rK_tail_shared_0K_tail_shared_1
O_shared_l
O_shared_ris_kv_valid_0is_kv_valid_1acc_o_lacc_o_rr   r   r   sum_exp_sharedr   alpha_sharedalpha_localr   r   indices_localindices_tmpbar_qbar_k_0_readybar_k_1_readybar_k_0_freebar_k_1_freebar_sScale_and_sS_readybar_sScale_and_sS_free	bar_0_128	bar_1_128	bar_2_128	bar_finalr   r   r   r   r   txr   r   r   r   ruvr   r   r   r   r   r   r   r   r[   r   r   qo_lenr   r,   r   r   r     sl   
"$$ 







" 





" 

"(





(

& "&
* "&   T  S  X $z,sparse_attention_fwd_kernel_v2.<locals>.mainr   )r   r0   r   r   r   r   r   r   r   r   r   r   r   r   r   r"  r   sparse_attention_fwd_kernel_v2|  sl   







*  Dr$  )r   r   r   r   r,   c                   s|  |dksJ d|dksJ || dksJ 	du r%d||  d d 	n	d 	d}	t d	t d
}
| | }ttj|d}|dkrJ|d nddkrR|nd| || |||	| || g}|	|
||| g}|	||g}|	| |g}|	| g}t j}t jt jt j	dt 
|dt 
|dt 
||dt 
|dt 
|f
 	
fdd}|S )zy
    grid: (seq_len * REPLICATE_H, top_k_blocks).
    Each block does one topk block, writes partial_o, partial_lse.
    Tznon-causal is not supportedr   r   Nr   r   r   r   r   r   r   r   r   r   	Partial_OPartial_Lsec                    s  t j 
d\}}t g}t g}t  g}	t  g}
t  gt j}t g}t  g}t  g}t g}t g}t |d d\}}dkrl|n| }|}|}dkrzdn| d }| }t | ||||d f | t | ||||d f | t  D ]}|||||  | f dk||< qt  D ]\}}|||||||  | f ||f |	||f< qt  D ]\}}|||||||  | f || f |
||f< qt  D ]\}}t || dt 	|j
 |||f< qt j||	|dt jjd t j||
|dt jjd t j||ddd t D ]}t || d	||< qDt  D ]\}}t |||f 	 || 	  |||f< qXt j||dd
 t || t j||	|t jjd t D ]\}}|||f t || dkd||  |||f< qt D ]}t || dkd	t || || 	  ||< qt |||||||d d f  t |||||||f  W d    d S 1 sw   Y  d S )Nr+   r   )r   r   r   r   Tr   r   r   r/   r   g        r   )r   r1   r3   r4   r   ry   r6   r8   r   r   r[   rz   r   r   r   r9   r   r{   r   )r   r   r   r&  r'  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   topk_block_ir   r   r   r   r   r   r   r   r   r   r   r   r   r[   r   r   r,   r   r   r   @  s     ""
"$z+sparse_mla_fwd_decode_partial.<locals>.main)r   dynamicr9   r   r   r   r   r
   r   rS   rT   )headsr0   r   r   r   r   r   r   r,   r   r   r   r   r   r   r   partial_o_shapepartial_lse_shaper   r   r   r)  r   sparse_mla_fwd_decode_partial  sL   






$Zr.  )r   r,   c             
      s   | | dks
J dd}t d|| | |   || g}|| g}|| g}	t j}
t jt jdt ||
dt |dt |	|
f fdd	}|S )
z}
    grid: (seq_len * REPLICATE_H). batch=1, kv_group=1.
    Each block does one tile of heads (e.g. 4 or 8 for decode).
    r   z head_per_block must divide headsr   r   r&  r'  r   c                    sr  t j d$\}t  g}t  g}t  g}t  g}t  g}d}	dkr:|n| }
dkrDdn|   }|  }t D ]}t ||	|
|||f ||d d f  qSt |d t D ]}t  D ]}t || |||f ||< q|qut |d t D ]}t  D ]}|| t 	|||f ||   ||< qqt D ]"}t  D ]}t 	|||f ||  t 
||  |||f< qqt |d t D ]+}t  D ]"\}}|||f |||f | |	|
||| |f   |||f< qqt |||	|
||d d f  W d    d S 1 s2w   Y  d S )Nr+   r   r   r   )r   r1   r3   r4   r   r6   ry   r8   r9   r   r   astype)r&  r'  r   r   
shared_lselse_maxlse_sumscaler   r   r   r   r   rq   r   r   r   r   r   r   r0   r   r,   r   r   r     sR   *
 "$z+sparse_mla_fwd_decode_combine.<locals>.main)r   r*  r
   r   rS   rT   )r+  r0   r   head_per_blockr   r,   r   r,  r-  r   r[   r   r   r4  r   sparse_mla_fwd_decode_combine  s(   



.r6  rn   kvindicesd_vc              
   C   s*  |   dkr|  dkr|  dksJ | jd }| jd }|| }|jd }|dks-J tr}trp| jd dkret|||||ddd	}	t|||d
ddd}
|	| d|d|d\}}|
||}|S t|||||dd}nt|||||dddd}n	t|||||d}|| d|d|dS )Nr   r   r%   rY   i   r   r   r   )r   r   r,   rg   )r5  r   r,   )r   r.   r&   r'   )r   r   r.   r,   )r   )	r0   r   r   _is_gfx95_supportedr.  r6  	unsqueezer   r$  )ro   r7  r8  r   r9  r   r0   r   r   kernel_partialkernel_combine	partial_opartial_lseoutre   r   r   r   tilelang_sparse_fwd  sV   (


	

rA  )r'   N)T)rn   ),typingr   r   r   tilelang.languagelanguager   r^   )sglang.srt.layers.quantization.fp8_kernelr   sglang.srt.utilsr   r   set_log_levelPassConfigKeyTL_DISABLE_WARP_SPECIALIZEDTL_DISABLE_TMA_LOWERr#   hasattrr   r	   r   r:  rR   BF16rw   rx   r   r   r"   jitrU   rT   intstrrf   r   r   r   r   r$  r.  r6  rA  r   r   r   r   <module>   s    

3
8
 #   T