o
    پio                    @  s  U d dl mZ d dlmZ d dlmZmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlZd dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7m8Z8 erd dl9m:Z: d dl;m<Z< d dl=m>Z> e8 Z?e?rzd dl@mAZAmBZBmCZC d dlDmEZEmFZF W n eGy   eHd Y n	w d dlImAZAmJZJ daKejLM oe? ZNejOM ZPeddG dd dZQeddG dd  d ZRG d!d" d"eZSejTd7d8d*d+ZUd7d8d,d-ZVeddG d.d/ d/eZWed0 ZXd1eYd2< G d3d4 d4eeZZG d5d6 d6Z[dS )9    )annotations)	dataclass)IntEnumauto)TYPE_CHECKINGDictListLiteralOptionalTuple	TypeAliasN)get_nsa_index_topkis_deepseek_nsa)envs)AttentionBackend)dequantize_k_cache_paged))NativeSparseAttnBackendMTPPrecomputeMixinPrecomputedMetadatacompute_cu_seqlens)BaseIndexerMetadata)(verify_multi_backend_fused_metadata_copy)verify_single_backend_fused_metadata_copy)quantize_k_cache)!transform_index_page_table_decode"transform_index_page_table_prefill)$can_nsa_prefill_cp_round_robin_splitcompute_nsa_seqlensis_nsa_enable_prefill_cpnsa_cp_round_robin_split_datansa_cp_round_robin_split_q_seqspad_nsa_cache_seqlens)concat_mla_absorb_q_generalmla_quantize_and_rope_for_fp8)get_attention_tp_size)ForwardBatchForwardMode)is_cudais_hip)RadixAttention)ModelRunner)	SpecInput)flash_attn_varlen_funcmha_batch_prefill_funcpaged_attention_ragged)mla_decode_fwdmla_prefill_fwdz]aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device.)r+   flash_attn_with_kvcacheT)frozenc                   @  s4   e Zd ZU dZded< ded< dd Zdd	d
ZdS )NSAFlashMLAMetadataz Metadata only needed by FlashMLAtorch.Tensorflashmla_metadata
num_splitsc                 C  s   t | j| j| dS )Nr4   r5   )r2   r4   r5   )selfsli r9   [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/nsa_backend.pysliceZ   s   zNSAFlashMLAMetadata.sliceother'NSAFlashMLAMetadata'c                 C  s    | j |j  | j|j d S N)r4   copy_r5   )r7   r<   r9   r9   r:   r?   `   s   zNSAFlashMLAMetadata.copy_N)r<   r=   )__name__
__module____qualname____doc____annotations__r;   r?   r9   r9   r9   r:   r2   S   s   
 r2   c                   @  s   e Zd ZU ded< ded< ded< ded< ded< ded< ded	< ded
< ded< ded< ded< ded< ded< dZded< dZded< dZded< dZded< dZded< dZ	ded< dZ
ded< dZded< dZded < dS )!NSAMetadataint	page_sizer3   cache_seqlens_int32max_seq_len_qmax_seq_len_kcu_seqlens_qcu_seqlens_kpage_table_1real_page_tablensa_cache_seqlens_int32nsa_cu_seqlens_qnsa_cu_seqlens_k	List[int]nsa_extend_seq_lens_listnsa_seqlens_expanded   z
Literal[1]nsa_max_seqlen_qNzOptional[NSAFlashMLAMetadata]r4   Optional[torch.Tensor]paged_mqa_schedule_metadatazOptional[int]seq_lens_sumpage_table_1_flattenedtopk_indices_offsetz+Optional[Tuple[torch.Tensor, torch.Tensor]]indexer_k_start_endindexer_seq_lens_cputoken_to_batch_idx)r@   rA   rB   rD   rV   r4   rX   rY   rZ   r[   r\   r]   r^   r9   r9   r9   r:   rE   e   s.   
 rE   c                   @  s   e Zd Ze Ze ZdS )TopkTransformMethodN)r@   rA   rB   r   PAGEDRAGGEDr9   r9   r9   r:   r_      s    
r_   tensorslist[torch.Tensor]dimrF   returnr3   c                 C  s   t j| |dS )Nre   )torchcat)rc   re   r9   r9   r:   _compiled_cat   s   rj   c                 C  s\   t | dksJ | \}}|jdkr|jdksJ tj|d tj|d t||g|dS )z
    Concatenate two tensors along the last dimension.
    Use this function to concatenate q_nope and q_rope or k_nope and k_rope.
          r   rg   )lenndimrh   _dynamomark_dynamicrj   )rc   re   qk_nopeqk_roper9   r9   r:   _cat   s   rs   c                   @  s   e Zd ZU ded< ded< dZded< d(d
dZd(ddZd(ddZd(ddZd(ddZ	d)ddZ
d(ddZd*ddZd(ddZ					d+d,d&d'ZdS )-NSAIndexerMetadatarE   attn_metadatar_   topk_transform_methodNrW   rX   rf   r3   c                 C     | j jS r>   )ru   rH   r7   r9   r9   r:   get_seqlens_int32      z$NSAIndexerMetadata.get_seqlens_int32c                 C  rw   r>   )ru   rN   rx   r9   r9   r:   get_page_table_64   rz   z$NSAIndexerMetadata.get_page_table_64c                 C  rw   r>   )ru   rM   rx   r9   r9   r:   get_page_table_1   rz   z#NSAIndexerMetadata.get_page_table_1c                 C  rw   r>   )ru   rT   rx   r9   r9   r:   get_seqlens_expanded   rz   z'NSAIndexerMetadata.get_seqlens_expandedc                 C  rw   r>   )ru   rL   rx   r9   r9   r:   get_cu_seqlens_k   rz   z#NSAIndexerMetadata.get_cu_seqlens_k!Tuple[torch.Tensor, torch.Tensor]c                 C  rw   r>   )ru   r\   rx   r9   r9   r:   get_indexer_kvcache_range   rz   z,NSAIndexerMetadata.get_indexer_kvcache_rangec                 C  rw   r>   )ru   r]   rx   r9   r9   r:   get_indexer_seq_len_cpu   rz   z*NSAIndexerMetadata.get_indexer_seq_len_cpurR   c                 C  rw   r>   )ru   rS   rx   r9   r9   r:   get_nsa_extend_len_cpu   rz   z)NSAIndexerMetadata.get_nsa_extend_len_cpuc                 C  rw   r>   )ru   r^   rx   r9   r9   r:   get_token_to_batch_idx   rz   z)NSAIndexerMetadata.get_token_to_batch_idxlogitstopkrF   ksrK   	ke_offsetbatch_idx_listtopk_indices_offset_overridec                 C  s  ddl m}m}	m}
 |d ur|}d }n!|d ur,|tj}t|}t|d d |}n| j	j
}| j	j}|d ur;|}n|  }|d urJ| j	j| }n| j	j}tj s[|
||||dS | jtjkrk|||||||dS | jtjkrz|	|||||dS J d| j)	Nr   )fast_topk_transform_fused fast_topk_transform_ragged_fusedfast_topk_v2rb   )
row_starts)scorelengthspage_table_size_1rK   r   r   )r   r   r[   r   r   Fz)Unsupported self.topk_transform_method = )
sgl_kernelr   r   r   torh   int32r   repeat_interleaveru   rK   r[   r}   rM   r   SGLANG_NSA_FUSE_TOPKgetrv   r_   r`   ra   )r7   r   r   r   rK   r   r   r   r   r   r   cu_topk_indices_offsetcu_seqlens_q_topkseq_lens_topkr   r9   r9   r:   topk_transform   sN   


z!NSAIndexerMetadata.topk_transform)rf   r3   )rf   r   )rf   rR   )NNNNN)r   r3   r   rF   r   rW   rK   r3   r   r3   r   rR   r   rW   rf   r3   )r@   rA   rB   rD   rX   ry   r{   r|   r}   r~   r   r   r   r   r   r9   r9   r9   r:   rt      s&   
 








rt   )flashmla_sparseflashmla_kvfa3tilelangtrtllmr   _NSA_IMPL_Tc                      sF  e Zd Z				dsdt fddZduddZdvddZdwddZ	dxdyddZdzddZd{d)d*Z		dxd|d.d/Z
d}d2d3Z	4						d~ddAdBZ	4						d~ddCdDZddPdQZddTdUZddXdYZddZd[Zdd\d]Zdd^d_Z	4						d~dd`daZddbdcZddde ZdxddgdhZddjdkZddndoZddqdrZ  ZS )NativeSparseAttnBackendFr   model_runnerr)   skip_prefillboolc                   s  t    |  |j| _t|jtsJ |j| _|jjrdnd| _	t
|jj| _| js.J d|jj| _t|jj| _|jj| _|jjt  | _|jj| _|jj| _|jj| _|jj| _|jd usbJ |jj| _d| _tj  | _!|jj"| _#|jj$| _%| j#dk| _&t'j(d| jt'j)d| _*t+r|jj,}t'j-|d ft'j)|jd| _.|jj/pd| _0|| _1|jj2| _2|| _3t'j45 | _6| j6d | _7|j8| _8| j7d	ks| j%d
krt9d u rt'j:tj;  t'j<|jda9t9| _=d S d | _=d S )NrU   r   z&NSA backend only supports DeepSeek NSAFflashmla_autoi @  devicedtyper   r   
   r   )>super__init__r   
isinstancerG   rF   real_page_sizeserver_argsenable_deterministic_inferencer5   r   model_config	hf_configuse_nsatoken_to_kv_poolnsa_kv_cache_store_fp8r   nsa_index_topkcontext_lenmax_context_lennum_attention_headsr#   num_q_headskv_cache_dimqk_nope_head_dimkv_lora_rankqk_rope_head_dimreq_to_token_poolreq_to_tokenuse_mhar   SGLANG_NSA_FORCE_MLAr   _force_attn_forward_mlansa_prefill_backendnsa_prefill_implnsa_decode_backendnsa_decode_implenable_auto_select_prefill_implrh   aranger   _arange_buf_is_hipsizezeros	kv_indptrspeculative_eagle_topkr   speculative_num_stepsspeculative_num_draft_tokensspeculative_step_idcudaget_device_capabilitydevice_capabilitydevice_sm_majorkv_cache_dtypeglobal_workspace_bufferempty SGLANG_FLASHINFER_WORKSPACE_SIZEuint8workspace_buffer)r7   r   r   r   r   r   max_bs	__class__r9   r:   r     sd   









z NativeSparseAttnBackend.__init__lrF   rf   r3   c                 C  sB   |t | jkrd|d  > }tj|| jtjd| _| jd | S )NrU   r   )rm   r   
bit_lengthrh   r   r   r   )r7   r   next_pow_of_2r9   r9   r:   get_device_int32_arangei  s   
z/NativeSparseAttnBackend.get_device_int32_arange
page_tablec                 C  sH   | j }|dkr	|S |jd }tjd|||jtjd}|d d |f | S )NrU   r   r   )r   shaperh   r   r   r   )r7   r   rG   max_seqlen_kstrided_indicesr9   r9   r:   _transform_table_1_to_realq  s   
z2NativeSparseAttnBackend._transform_table_1_to_realforward_batchr$   c               	     sZ  |j }|jj |j rj}nd}|j| tj}t	|}|j
dus&J t|j
  | }|jj|jd|f d}d}|  }	d}
|j
}|j redg| }d}|d }|}n|j rd}tjd|j d dtj d}jg| }||_fdd|j
 D }t fddt||dd	D }tjjdd
n|jjddr|jdur|jdur|jdusJ d|j}|jdusJ d}tjd|jd dtj d}t fddt|j|j
 dd	D }|j rtjjdd
n3tj|jdd
n(|j  r3|jdur+|jdur+|jdus/J d|j}|jdus:J |j}t fddt|j|j
 dd	D }t!|rt"|}t#||\}}}
}||
 }|| }t	|}t$|dkrt|  | nd}|d|f t%|js|jt&j'ks|
durt$|dkrt|nd}t	|tj}n|}|}j(o|j)j*tj+k}||_,t%|j}|r"|	t-j.ks|r"tfddt/| D }|j0d t1|ksJ d|j0d dt1||j)j2|j)j3 }|j4|kr"|  }||k s"J d| d| |	t-j.kr2t|dd |}nJ d|j5||
\}}t6|j7d}t8||}t	|}t$|}d}t9 r|j sr|j sr|j rzddl:}|j s|j r|n|}|;|d|< }W n t=t>fy   d}Y nw t?d1i dj@d|d|d|d|d|d|j4d d!|d"jAd#krΈjB|dd$n0dd%|d&|d'|d(|d)|d*|d+Cd,dd-|d.|d/|d0|}|_DdS d%|d&|d'|d(|d)|d*|d+Cd,dd-|d.|d/|d0|}|_DdS )2z%Init the metadata for a forward pass.r   NrU   r   c                      g | ]} j | qS r9   r   .0kv_lenrx   r9   r:   
<listcomp>      zANativeSparseAttnBackend.init_forward_metadata.<locals>.<listcomp>c                   0   g | ]\}}t j|| d  |d  t j dqS rU   r   rh   r   r   r   qo_lenr   r   r9   r:   r         
Tstrictrepeatsre   
include_v2zAll of them must not be Nonec                   r   r   r   r   r   r9   r:   r     r   c                   r   r   r   r   r   r9   r:   r     r   c                   s    g | ]\}} |d |f qS r>   r9   )r   ir   )r   r9   r:   r   8  s    z"page_table_1_flattened.shape[0] = z1 must be the same as sum(indexer_seq_lens_cpu) = zInvalid page table index: max=z, kv_cache_capacity=rb   Fz)Unsupported forward_batch.forward_mode = )original_seq_lensr   @   rG   rH   rI   rJ   rK   rL   rY   rM   rZ   r4   r   cache_seqlens	seq_len_qrX   rO   rP   rQ   rT   rS   rN   rV   r[   r\   r]   r^   r9   )E
batch_sizeseq_lensr   forward_modeis_target_verifyr   r   rh   r   r   seq_lens_cpurF   maxitemr   r   req_pool_indicesset_nsa_prefill_implget_topk_transform_methodis_decode_or_idler   r   extend_seq_lens_cputolistri   zipr   is_draft_extendextend_seq_lensextend_prefix_lens_cpuextend_num_tokensis_draft_extend_v2	is_extendr   r   r   rm   anyr%   DRAFT_EXTENDr   r   r   float8_e4m3fnusing_mha_one_shot_fp8_dequantr_   ra   	enumerater   sumr   rG   rY   _cal_indexer_k_start_endr   r   r    r&   	deep_gemmget_paged_mqa_logits_metadataget_num_smsImportErrorModuleNotFoundErrorrE   r   r   _compute_flashmla_metadatar   forward_metadata) r7   r   r  draft_token_numrH   rL   r   rZ   r[   rv   
bs_idx_cpur]   r  max_seqlen_qrK   seqlens_expandedseqlens_int32_cpur  bs_idxmha_dequantize_neededhas_prefix_sharingkv_cache_capacitymax_idxr\   r^   rO   rQ   rP   rX   r  
seqlens_32metadatar9   )r   r   r7   r:   init_forward_metadata{  s"  






















	



z-NativeSparseAttnBackend.init_forward_metadataNr+  Optional[List[int]]c                 C  s  |j  sdS |jdks|d ur&t|dkr&tjdtj| jd}||f|fS g }g }g }d}d}|jd ur:|j	d us<J t
|jD ]|}	|j|	  }
t|
tsQJ |j	|	 }tj|f|tj| jd}|
}|j  rn|| j7 }tj|| d |d tj| jd}|| }|| || |d ur|	|v r||	n|	}tj|f|tj| jd}|| |d u s|	|v r||7 }||
7 }qAtj|dd}tj|dd}tj|dd}|d urt|sJ t|}t|}t|}||f|fS )N)NNr   r   rU   rg   )r  is_extend_without_speculativer  rm   rh   r   r   r   r  r  ranger
  r   rF   fullr  r   r   appendindexri   r   r   )r7   r   r+  empty_tks_listke_listr^   q_offsetk_offsetr   seq_lenextend_seq_lenr   r   seq_lens_expandedkebitbr9   r9   r:   r    sb   








z0NativeSparseAttnBackend._cal_indexer_k_start_endr   max_num_tokensc              
   C  s   t j|t j| jdt jd|d t j| jdt j|d t j| jdt j|| jt j| jd| jdkr?| jt j|t j| jdddndd| _	dS )a  Initialize CUDA graph state for the attention backend.

        Args:
            max_bs (int): Maximum batch size to support in CUDA graphs

        This creates fixed-size tensors that will be reused during CUDA graph replay
        to avoid memory allocations.
        r   r   rU   r   r  N)r  rK   rL   r   r4   )
rh   onesr   r   r   r   r   r   r$  decode_cuda_graph_metadata)r7   r   rD  r9   r9   r:   init_cuda_graph_state  s0   



z-NativeSparseAttnBackend.init_cuda_graph_statebs
num_tokensr  r  encoder_lensrW   r  r%   	spec_infoOptional[SpecInput]c                   s   j d d 	 | rb|tj}t|}	 jd d |d d f }
d}|
jd } jd d |d  }t| j	d}|}dg| } j
dkr_ jd td|d }| j|dd	 nd }n| sl|jd
dr| j tj}t|}	d} jd d | j d d f }
|
jd }tjd| j d dtj jd} jg| } fdd| D }t fddt||d
dD }t| j	d}dg|  j } j
dkr jd td| j d }| j|dd	 nd }t|} t|} |
}d }t rE| s| s| rEzdd l}| s(| r*|n|}||d| }W n ttfyD   d }Y nw t j |||||	|
||||||||d}| j|< | _!d S )Nr   r   rU   rK   r   r   r4   r   r  Tr   r   c                   r   r9   r   r   rx   r9   r:   r   Y  r   zTNativeSparseAttnBackend.init_forward_metadata_capture_cuda_graph.<locals>.<listcomp>c                   2   g | ]\}}t j|| d  |d  t j jdqS r   rh   r   r   r   r   rx   r9   r:   r   ^      
r   r   )rG   rH   rI   rJ   rK   rL   rM   r4   rX   rO   rP   rQ   rT   rN   rS   )"r  r  r   rh   r   r   rF  r   r   r   r   r;   r?   r$  r  r  r   r   r   r  ri   r  r   rm   r   r&   r  r   r!  r"  r#  rE   r   r%  )r7   rH  rI  r  r  rJ  r  rK  rH   rL   rM   r(  r   rK   rO   r)  rS   r4   r  r*  rQ   rP   rN   rX   r  r0  r1  r9   rx   r:   (init_forward_metadata_capture_cuda_graph  s   












z@NativeSparseAttnBackend.init_forward_metadata_capture_cuda_graphrY   r  out_cache_locc
              	     s  |dusJ  j dd |d| }|d| }|d| } j| }
| rtt|  }|tj}|
j	
| |
jdd 
tj|dtjd  j|d|f }|
jddd|f 
| t| jd}|
j
| |}n| rt|   j }| j tj}|
j	
| |
jdd 
tj|dtjd  j|d|f }tj| jdd}|
jddd|f 
|  jg| } fdd	| D }t fd
d	t||ddD }|
j
| t| j}|
j
| n|jddrt|  }|tj}|
j	
| |
jdd 
tj|dtjd |jd| }| } j|d|f }tj||dd}|
jd|jd d|f 
| t fdd	t|| ddD }|
jd|jd  
| t| j}|
jd|jd  
| t r| s| s| rz.ddl}| s| r|n|
j	}||d| }|
j du r||
_ n|
j 
| W n t!t"fy   d|
_ Y nw |jd }|
jdur|
j#dur jdusJ |
j#dd|  
tj|dtjd  j$|
j%ks
J  j$dkr. &|}|jd }|jd }|
j'd|d|f 
| n	|
j'|
ju s7J  j(dkrR|
j)*t*d|d }|
 j+|dd |
 _,dS )z5Initialize forward metadata for replaying CUDA graph.NrM  rU   r   )re   r   rN  r   c                   r   r9   r   r   rx   r9   r:   r     r   zSNativeSparseAttnBackend.init_forward_metadata_replay_cuda_graph.<locals>.<listcomp>c                   rO  r   rP  r   rx   r9   r:   r     rQ  Tr   r   c                   rO  r   rP  r   rx   r9   r:   r     rQ  r   r   r  )-r  rF  r  rF   r	  r
  r   rh   r   rH   r?   rL   cumsumr   rM   r   r   rO   r  r   r   r  ri   r  rT   r  accept_lengthr   r&   r  r   r!  rX   r"  r#  rQ   r   rG   r   rN   r   r4   r;   r$  r%  )r7   rH  r  r  rY   rJ  r  rK  r  rS  r1  max_lenr  page_indicesnsa_cache_seqlensr)  r   r  r*  r  r  r0  new_scheduleseqlens_expanded_size
real_tablenew_rowsnew_colsr4   r9   rx   r:   'init_forward_metadata_replay_cuda_graph  s  











z?NativeSparseAttnBackend.init_forward_metadata_replay_cuda_graphprecomputedr   c                 C  s0  | j dd | j| }d}trzddlm} | rd}n| r$d}n| r+d}ntd| d}d}	d}
d}|j	durO|j	j
}|j	j
}	|j	j	}
|j	j	}||j|j|j|j|j|j|j||
|j|j|j|j|j|j|jdurt|jnd|	||||j|j|j d	}trt||||||
|	|d
 W n& ty   td Y n ty } ztd| d W Y d}~nd}~ww |s|j|j |jdd |jdd  | r|jddd|jf |j |j|j nZ| r|jddd|jf |j |j|j |j|j n6| rH|jjd }|j}|jd|d|f |j |j}|jd| |j |jd| |j |j}|jdd|  |jdd|   |jdury|jj\}}|jd|d|f |j |j	dur|j}|j	 t d|d }||j	 || _!dS )a  Fast path: copy precomputed metadata to this backend's metadata.

        This function only performs copy operations, no computation.

        Args:
            bs: Batch size
            precomputed: Precomputed metadata to copy from
            forward_mode: Forward mode
        NrM  Fr   )fused_metadata_copy_cudarU   rk   zUnsupported forward_mode: T)r1  r_  r  rH  flashmla_num_splits_srcflashmla_metadata_srcflashmla_num_splits_dstflashmla_metadata_dstzUWarning: Fused metadata copy kernel not available, falling back to individual copies.z7Warning: Fused metadata copy kernel failed with error: z$, falling back to individual copies.)"r  rF  _USE_FUSED_METADATA_COPY%sglang.jit_kernel.fused_metadata_copyr`  r  r  r  
ValueErrorr4   r5   r  rL   rW  rX  r)  rQ   rN   rH   rM   rO   rT   rV  r   rZ  _VERIFY_FUSED_METADATA_COPYr   r"  print	Exceptionr?   r   r;   r%  )r7   rH  r_  r  r1  fused_kernel_succeededr`  mode_intra  rc  rb  rd  erowscolsr   r4   r9   r9   r:   8init_forward_metadata_replay_cuda_graph_from_precomputedh  s   


"




zPNativeSparseAttnBackend.init_forward_metadata_replay_cuda_graph_from_precomputedTqkvlayerr(   q_ropek_ropetopk_indicescos_sin_cacheis_neoxOptional[bool]llama_4_scalingc                 C  s  |j  }| j}|sJ d|j s|jjddr| jn| j}|dkr8| js8| ||||||j	||||	|
||S |d urV|d usBJ |rV|j sJ|j
n|j}|j|||| | jr|d ura|d uscJ |d u skJ d|j|j  krzdksJ d J d| j||||||dS |d usJ |j|j}|d ur|d	|j|j}|d	|j|j|j }n'| d	|j|j}|d d d d d |jf }|d d d d |jd f }|	d ur| |	|jd
 }	|  }tj r|	}nC|tjkr|j}|d usJ |	d	k}|j dkr|!dn|}t"#||	| |	}	n|tj$kr4|j%d us*J t&|j'|	|j%dd}|dkrO|d urCt(||}| j)||||j*|jdS |dkr|d ur^t(||}|tjkrt+|j,r{| jj-}|d usuJ t.||}nt/||gd	d}|	}| j0||||j*|jdS |dkr|d urt(||}| j1|||j*|j|||dS |dkr| j2|||j|||j	|j3|j4|j5|j*|j6ddS t7d|)NNSA is causal onlyTr   r   z(MHA_ONE_SHOT path should not pass q_roperU   z-MHA_ONE_SHOT requires dense multi-head config)rq  rr  rs  rt  r   r1  rb   r   )r   rw  extend_lens_cpurG   r   q_allkv_cacherM   sm_scale
v_head_dimr   rg   r   r  r  r  r  rt  r1  rM   r   ru  r  r  q_noper   r  rK   rL   r(  r  	logit_caprG   zUnsupported nsa_impl = )8is_cross_attentionr%  r  r  r  r   r   r   _forward_trtllmrO   rS  encoder_out_cache_locr   set_mla_kv_buffertp_k_head_numtp_q_head_num_forward_standard_mhaget_key_bufferlayer_idviewr  head_dim
contiguous_pad_topk_indicesr   r  r   r   r   r_   ra   r[   rn   	unsqueezerh   wherer`   rS   r   rM   r!   _forward_tilelangscalingr  r  rZ   r   rs   _forward_flashmla_sparse_forward_flashmla_kv_forward_fa3rP   rQ   rV   r  rg  )r7   rq  rr  rs  rt  r   save_kv_cacheru  rv  rw  rx  ry  r{  causalr1  nsa_impl	cache_locr  r  r  rv   rM   r[   maskrZ   r9   r9   r:   forward_extend  s$  	














z&NativeSparseAttnBackend.forward_extendc                 C  sz  |j  }| j}|sJ d| jdkr$| ||||||j||||	|
||S |d urB|d us.J |rB|j s6|jn|j}|j|||| |j	|j
}|d urc|d|j|j}|d|j|j|j }n'| d|j|j}|d d d d d |jf }|d d d d |jd f }|	d ur| |	|jd }	tj r|	}nt|j|	dd}| jdkr|d urt||}| j||||j|jdS | jd	kr|d urt||}| j|||j|j|||d
S | jdkr|d urt||}| j||||j|jdS | jdkr| j|||j|||j|j|j|j |j|j!ddS | jdkr5|d ur)t"j#||gdd}| j$||||||j%dS J d| j)Nr|  r   rb   r   rU   r   rw  rG   r   r~  r   r  r   r   r  aiterrg   )r  r  rM   rt  r1  rH  Fz#Unsupported self.nsa_decode_impl = )&r  r%  r   r  rH   rS  r  r   r  r  r  r  r  r  r  r  r  r   r   r   r   r   rM   r!   r  r  r  r  r  rO   rP   rQ   rV   r  rh   ri   _forward_aiterr  )r7   rq  rr  rs  rt  r   r  ru  rv  rw  rx  ry  r{  r  r1  r  r  r  r  rM   r9   r9   r:   forward_decode  s   










z&NativeSparseAttnBackend.forward_decoder  r  r  r  rK   rL   r(  r  floatr  rG   c                 C  s   |d d d d |d f }|d d d d d |f }|j d }|d|d|}|d|d|}t|||||||||	|
d|d| jd}|S )Nrb   rU   TF)rq  k_cachev_cacheqvr   r  rK   cu_seqlens_k_newr(  softmax_scaler  softcapreturn_softmax_lser5   )r   r  r0   r5   )r7   ru  r  r  r  r   r  rK   rL   r(  r  r  rG   k_rope_cache
c_kv_cacheqk_rope_dimor9   r9   r:   r  W  s,   
z$NativeSparseAttnBackend._forward_fa3r  rM   c                 C  s   ddl m} |j\}}}	| jdkrdnd}
||
 dk}|rF|
| dks.J d| d|
 d|||
|	f}||d d d |d d f< |}n|}|d	}||||||d
\}}}|rh|d d d |d d f }|S )Nr   )flash_mla_sparse_fwdr      r   z
num_heads z cannot be padded to z*. TP size may be too large for this model.rU   rq  kvindicesr  d_v)sgl_kernel.flash_mlar  r   r   	new_zerosr  )r7   r  r  r  rM   r  r  rI  	num_headsr  required_paddingneed_paddingq_paddedq_inputindices_inputr  _r9   r9   r:   r  }  s.   
	z0NativeSparseAttnBackend._forward_flashmla_sparser1  rE   c                 C  s   ddl m} |j}	|dd|j|j}|d| jd| j}| jdks&J d| js-t	|}|
d}
|
jd | jks<J ||||	||jj|jj||
tj|jd dftj|jddd	
\}}|S )
Nr   )flash_mla_with_kvcacherb   rU   r   zonly page size 64 is supportedr   T)
rq  r  r  
head_dim_vtile_scheduler_metadatar5   r  r  block_tableis_fp8_kvcache)r  r  rO   r  r  r  r   r   r   r   r  r   r   r4   r5   rh   r   r   r   )r7   r  r  r  r  rt  r1  rM   r  r  r  r  r  r9   r9   r:   r    s0   


z,NativeSparseAttnBackend._forward_flashmla_kvc                 C  s:  | d|j|j}| d|j|j}| d|j|j}|j}|j}|j}	d}
t	|t	|ksAJ dt	|d  dt	|d  d| j
dkrdd	l}|j}|jjdi d
|d|d|d| jd|d|jd|	d|jddddd|jddd|d|ddd|
ddS d}t||||||j|	|j|
|d
S ) z?Standard MHA using FlashAttention varlen for MHA_ONE_SHOT mode.rb   Tz&batch_size mismatch: cu_seqlens_q has rU   z requests, cu_seqlens_k has z	 requestsr   r   Nquerykeyvaluer   r  	max_q_len
max_kv_len
bmm1_scale
bmm2_scale      ?
o_sf_scaler  window_leftcum_seq_lens_qcum_seq_lens_kv
enable_pdlF	is_causal
return_lserl   )
rq  rr  rs  rK   rL   r(  r   r  r  verr9   )r  r  r  r  tp_v_head_numr  rK   rL   rJ   rm   r   
flashinferrH   prefill trtllm_ragged_attention_deepseekr   rI   r  r  r+   )r7   rq  rr  rs  rt  r   r1  rK   rL   r   r  r  r  
fa_versionr9   r9   r:   r    s~   


	
z-NativeSparseAttnBackend._forward_standard_mhac                 C  s$   ddl m} ||||d||dS )Nr   )tilelang_sparse_fwdrU   r  )/sglang.srt.layers.attention.nsa.tilelang_kernelr  r  )r7   r  r  r  rM   r  r  r9   r9   r:   r    s   z)NativeSparseAttnBackend._forward_tilelangc                 C  s   | d|j|j }|j|jkr||jd |j|j f}nt|}| j}	|dk}
|
j	dd}tj
|dd|	d|d < ||dk }t|d|j|j|ddd|j|d|j|j|j|	||j|j|j|j
 |S )Nrb   r   rU   rg   )reshaper  r  r  	new_emptyr   rh   
empty_liker   r  rT  r.   r  rK   rI   r  r  )r7   r  r  rM   rt  r1  rH  rq  r  r   non_minus1_masknon_minus1_counts
kv_indicesr9   r9   r:   r  0  s,   	
z&NativeSparseAttnBackend._forward_aiterc           "      C  s.  ddl }| j}|du}| jtjkrC|dusJ d|	dus!J d|dus)J dt|||d|	d|j||| j| j		\}}}	d}|rc|durM|	dusQJ d|j
sW|jn|j}|j||||	 |j|j}|d	| j| jd}|r|d	|j|j}|d	|j|j|j }t||}n	|d	|j|j}|
dur| |
|jd }
tj r|
}nt|j|
dd
}d}t |dddur|j!nd}|| |j" }|jd }|j\}}}||d||}|d	d| j| j}|d} |du r|j#n|}|j$j%||| j&| j'| j| j	| ||j(| j)|dd}!|!dS )z(Forward using TRT-LLM sparse MLA kernel.r   Nz'For FP8 path q_rope should not be None.z'For FP8 path k_rope should not be None.z.For FP8 path cos_sin_cache should not be None.rU   FzNFor populating trtllm_mla kv cache, both k_nope and k_rope should be not None.rb   r  r  k_scale_floatz
trtllm-gen)r  r  r   r   r   r   block_tablesr  max_seq_lensparse_mla_top_kr  backend)*flashinfer.decoder%  r   rh   r  r"   squeeze	positionsr   r   r  rS  r  r   r  r  r  r  r   r   r  r  r  r  r!   r  r   r   r   r   r   rM   getattrr  r  rH   decode%trtllm_batch_decode_with_kv_cache_mlar   r   rJ   r   )"r7   rq  rr  rs  rt  r   r  r  ru  rv  rw  rx  ry  r{  r  r1  merge_queryr  r  r  r  q_rope_reshapedr  rM   q_scalek_scaler  r  r  r  r  r  r  outr9   r9   r:   r  W  s   





z'NativeSparseAttnBackend._forward_trtllmc                 C  sn   |j d }||kr|S ||ksJ d| d| d|| }tj||j d fd|j|jd}tj||gddS )	Nr   ztopk_indices rows (z) > num_tokens (zA); this indicates a mismatch between indexer output and q layout.rU   rb   r   rg   )r   rh   r6  r   r   ri   )r7   rw  rI  current_tokenspad_sizepaddingr9   r9   r:   r    s   

z)NativeSparseAttnBackend._pad_topk_indicesc                 C  s   dS )z5Get the fill value for sequence length in CUDA graph.rU   r9   rx   r9   r9   r:   !get_cuda_graph_seq_len_fill_value  s   z9NativeSparseAttnBackend.get_cuda_graph_seq_len_fill_valueOptional[ForwardBatch]c           
      C  s*  ddl m}m} |rW|j rW|jdusJ |j  }t|j}| }| j	dkr.|
 n| j}|dkp<|dko<|dk oT||koT|jjtjtjfv oT||
 koTt  | _nd| _| jr`d| _| js| jr| jr| r|dur|jtjkr|j}|j}	||	d	 k rd
| _	dS d| _	dS d
| _	dS dS dS )zR
        Decide all attention prefill dispatch strategies for this batch.
        r   )get_device_smis_blackwellNr   Z   d   n   Fi   r   r   )sglang.srt.utilsr  r  r  r4  r  r	  r
  r  r   get_max_chunk_capacityr   r   r   rh   bfloat16r  r   r   r   r   r   r%   EXTENDrY   r  )
r7   r   r  r  r  sum_seq_lens	device_smmha_max_kv_lentotal_kv_tokenstotal_q_tokensr9   r9   r:   r    sR   

	


z,NativeSparseAttnBackend.set_nsa_prefill_implr_   c                 C  s$   | j r| jdkrtj}|S tj}|S )z
        SGLANG_NSA_FUSE_TOPK controls whether to fuse the topk transform into the topk kernel.
        This method is used to select the topk transform method which can be fused or unfused.
        r   )r   r   r_   ra   r`   )r7   rv   r9   r9   r:   r    s   
z1NativeSparseAttnBackend.get_topk_transform_methodr  rt   c                 C  s   t | j|  | jjdS )N)ru   rv   rX   )rt   r%  r  rX   )r7   r  r   r9   r9   r:   get_indexer_metadata-  s
   z,NativeSparseAttnBackend.get_indexer_metadatar  c                 C  s>   ddl m} |||| j d d| jd| jd\}}t||dS )Nr   )get_mla_metadatarU   T)r  num_q_tokens_per_head_knum_heads_knum_heads_qr  r   r6   )r  r  r   r   r2   )r7   r  r  r  r4   r5   r9   r9   r:   r$  6  s   
z2NativeSparseAttnBackend._compute_flashmla_metadata)Fr   r   r   )r   r)   r   r   )r   rF   rf   r3   )r   r3   rf   r3   r   r$   r>   )r   r$   r+  r3  r   rF   rD  rF   )rH  rF   rI  rF   r  r3   r  r3   rJ  rW   r  r%   rK  rL  )rH  rF   r  r3   r  r3   rY   rF   rJ  rW   r  r%   rK  rL  r  rW   rS  rW   )rH  rF   r_  r   r  r%   )TNNNNFN)rq  r3   rr  r3   rs  r3   rt  r(   r   r$   ru  rW   rv  rW   rw  rW   rx  rW   ry  rz  r{  rW   rf   r3   )ru  r3   r  r3   r  rF   r  r3   r   r3   r  r3   rK   r3   rL   r3   r(  rF   r  r  r  r  rG   rF   rf   r3   )r  r3   r  r3   r  rF   rM   r3   r  r  rf   r3   )r  r3   r  r3   r  rF   r  r  r1  rE   rf   r3   )rq  r3   rr  r3   rs  r3   rt  r(   r   r$   r1  rE   rf   r3   )r  r3   r  r3   rM   r3   rt  r(   r1  rE   rH  rF   rf   r3   )rq  r3   rr  r3   rs  r3   rt  r(   r   r$   r  r3   ru  rW   rv  rW   rw  rW   rx  rW   ry  rz  r{  rW   rf   r3   )rw  r3   rI  rF   rf   r3   )r   r  )rf   r_   )r  rF   r   r$   rf   rt   )r  r3   r  rF   )r@   rA   rB   r   r   r   r2  r  rG  rR  r^  rp  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r$  __classcell__r9   r9   r   r:   r     sv    
M


  +
L
& & 
; . D 

&
1
-
C
/
u
9
	r   c                   @  s>   e Zd ZdddZdd
dZdddZdddZdddZdS ) NativeSparseAttnMultiStepBackendr   r)   r   rF   r   c              	   C  sL   || _ || _|| _g | _t| jd D ]}| jt||| j| jd qd S )NrU   )r   r   r   )r   r   r   attn_backendsr5  r7  r   )r7   r   r   r   r   r9   r9   r:   r   L  s   z)NativeSparseAttnMultiStepBackend.__init__r   r$   c                 C  s(   t | jd D ]
}| j| | qd S NrU   )r5  r   r  r2  r7   r   r   r9   r9   r:   r2  ]  s   z6NativeSparseAttnMultiStepBackend.init_forward_metadatar   rD  c                 C  s*   t | jd D ]}| j| || qd S r  )r5  r   r  rG  )r7   r   rD  r   r9   r9   r:   rG  a  s   z6NativeSparseAttnMultiStepBackend.init_cuda_graph_statec              
   C  sH   t | jd D ]}| j| j|j|j| j |j|jd tj	|j
d qd S )NrU   )rJ  r  rK  )r5  r   r  rR  r  r   r  r  r%   DECODErK  r  r9   r9   r:   rR  e  s   

zINativeSparseAttnMultiStepBackend.init_forward_metadata_capture_cuda_graphrH  c                 C  sH  t j r| jd j||j|j|jtj	|j
d}| jdkrgzddlm} | jd j| }| jd j| }| jd j| }tdD ]}| j| jd d qCd }	d }
d }d }d }d }d }d }|jd ur|jj}	|jj}
|jj}|jj}|jj}|jj}|jj}|jj}|g |j|j|j|j|j|j|	|
|j|j|j|j|j|jd ur|jnd |||j|j|j|j|j|jd ur|jnd |||j|j|j|j|j|jd ur|jnd ||||j|jR   trt||||||	|
d td| jd D ]}| j| j ||tj	d	 qW d S  t!t"fyf } z1t#|t!r<t$d
 nt$d| d t| jd D ]}| j| j ||tj	d	 qKW Y d }~d S d }~ww t| jd D ]}| j| j ||tj	d	 qnd S t| jd D ]}| j| j%||j|j|j&d tj	|j
|jd d	 qd S )Nr   )rH  r  r  r  r  rK  rl   )fused_metadata_copy_multi_cudarU   rk   rM  )	metadata0	metadata1	metadata2r_  rH  ra  rb  )rH  r_  r  zVWarning: Multi-backend fused metadata copy kernel not available, falling back to loop.zEWarning: Multi-backend fused metadata copy kernel failed with error: z, falling back to loop.)	rH  r  r  rY   rJ  r  rK  r  rS  )'r   )SGLANG_NSA_ENABLE_MTP_PRECOMPUTE_METADATAr   r  _precompute_replay_metadatar  r  r  r%   r  rK  r   rf  r  rF  r5  r  r4   r5   r  rL   rW  rX  rQ   rN   rH   rM   rO   rV  rZ  rh  r   rp  r"  rj  r   ri  r^  rY   )r7   r   rH  r_  r  r  r  r  r   ra  rb  flashmla_num_splits_dst0flashmla_num_splits_dst1flashmla_num_splits_dst2flashmla_metadata_dst0flashmla_metadata_dst1flashmla_metadata_dst2rm  r9   r9   r:   r^  q  sl  

	

"#%&'()
,/02348






zHNativeSparseAttnMultiStepBackend.init_forward_metadata_replay_cuda_graphN)r   r)   r   rF   r   rF   r
  r  )r   r$   rH  rF   )r@   rA   rB   r   r2  rG  rR  r^  r9   r9   r9   r:   r  J  s    



r  )rb   )rc   rd   re   rF   rf   r3   )\
__future__r   dataclassesr   enumr   r   typingr   r   r   r	   r
   r   r   rh   sglang.srt.configs.model_configr   r   sglang.srt.environr   -sglang.srt.layers.attention.base_attn_backendr   /sglang.srt.layers.attention.nsa.dequant_k_cacher   :sglang.srt.layers.attention.nsa.nsa_backend_mtp_precomputer   r   r   +sglang.srt.layers.attention.nsa.nsa_indexerr   4sglang.srt.layers.attention.nsa.nsa_mtp_verificationr   r   -sglang.srt.layers.attention.nsa.quant_k_cacher   /sglang.srt.layers.attention.nsa.transform_indexr   r   %sglang.srt.layers.attention.nsa.utilsr   r   r   r   r   r    !sglang.srt.layers.attention.utilsr!   r"   sglang.srt.layers.dp_attentionr#   ,sglang.srt.model_executor.forward_batch_infor$   r%   r  r&   r'   !sglang.srt.layers.radix_attentionr(   &sglang.srt.model_executor.model_runnerr)    sglang.srt.speculative.spec_infor*   r   r  r+   r,   r-   	aiter.mlar.   r/   r"  ri  sgl_kernel.flash_attnr0   r   SGLANG_USE_FUSED_METADATA_COPYr   re  !SGLANG_VERIFY_FUSED_METADATA_COPYrh  r2   rE   r_   compilerj   rs   rt   r   rD   r   r  r9   r9   r9   r:   <module>   s    $ 
3^
              ?