o
    ÔÙ¾i  ã                   @  sŒ   d dl mZ d dlmZmZ d dlmZmZ d dlZd dl	m
Z
 er<d dlmZ d dlmZ d dlmZmZ d d	lmZ G d
d„ deƒZdS )é    )Úannotations)ÚABCÚabstractmethod)ÚTYPE_CHECKINGÚOptionalN)Úis_npu)ÚBaseIndexerMetadata)ÚRadixAttention)ÚForwardBatchÚForwardMode)Ú	SpecInputc                   @  s    e Zd ZdZed=dd„ƒZd>d	d
„Zd?dd„Zd@dd„Zdd„ Z	dd„ Z
dAd#d$„Z	%dBdCd-d.„Z	%dBdCd/d0„Z	%dBdCd1d2„Z	%dBdCd3d4„Zd5d6„ ZdDd:d;„Zd<S )EÚAttentionBackendz$The base class of attention backendsÚforward_batchr
   c                 C  ó   t ƒ ‚)z%Init the metadata for a forward pass.©ÚNotImplementedError)Úselfr   © r   úa/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/base_attn_backend.pyÚinit_forward_metadata   s   z&AttentionBackend.init_forward_metadataÚmax_bsÚintÚmax_num_tokensc                 C  r   )z-Init the global shared states for cuda graph.r   )r   r   r   r   r   r   Úinit_cuda_graph_state   ó   z&AttentionBackend.init_cuda_graph_stateÚbsÚ
num_tokensÚreq_pool_indicesútorch.TensorÚseq_lensÚencoder_lensúOptional[torch.Tensor]Úforward_moder   Ú	spec_infoúOptional[SpecInput]c                 C  r   )z@Init the metadata for a forward pass for capturing a cuda graph.r   )r   r   r   r   r   r    r"   r#   r   r   r   Ú(init_forward_metadata_capture_cuda_graph   s   z9AttentionBackend.init_forward_metadata_capture_cuda_graphÚseq_lens_sumÚseq_lens_cpuc	           	      C  r   )z@Init the metadata for a forward pass for replaying a cuda graph.r   )	r   r   r   r   r&   r    r"   r#   r'   r   r   r   Ú'init_forward_metadata_replay_cuda_graph*   s   z8AttentionBackend.init_forward_metadata_replay_cuda_graphc                 C  r   )z@Get the fill value for padded seq lens. Typically, it is 0 or 1.r   ©r   r   r   r   Ú!get_cuda_graph_seq_len_fill_value8   r   z2AttentionBackend.get_cuda_graph_seq_len_fill_valuec                 C  s   ddgS )zŸ
        Return buffers of verify attention kernels that needs to be filled after draft.

        Typically, these are tree mask and position buffers.
        Nr   r)   r   r   r   Ú&get_verify_buffers_to_fill_after_draft<   s   z7AttentionBackend.get_verify_buffers_to_fill_after_draftr   Úcuda_graph_bsúOptional[int]c                 C  r   )zï
        Update the buffers returned by get_verify_fill_after_draft_buffers if needed.

        Here, we need to redo the computation of all metadata of the attention backend
        that depends on tree mask and position buffers.
        r   )r   r#   r,   r   r   r   Ú)update_verify_buffers_to_fill_after_draftD   s   	z:AttentionBackend.update_verify_buffers_to_fill_after_draftTÚqÚkÚvÚlayerr	   Úsave_kv_cacheÚboolc                 K  s˜   |j  ¡ r| |jd |j|j ¡S |j  ¡ r&| j|||||fd|i|¤ŽS |j  ¡ r=t	ƒ r=| j
|||||fd|i|¤ŽS | j|||||fd|i|¤ŽS )z"Run forward on an attention layer.r   r3   )r"   Úis_idleÚ	new_emptyÚshapeÚtp_q_head_numÚ
v_head_dimÚ	is_decodeÚforward_decodeÚis_mixedr   Úforward_mixedÚforward_extend)r   r/   r0   r1   r2   r   r3   Úkwargsr   r   r   ÚforwardO   sJ   

ûúù	ûúù
ûúùzAttentionBackend.forwardc                 C  r   )zRun a forward for decode.r   ©r   r/   r0   r1   r2   r   r3   r   r   r   r;   {   ó   
zAttentionBackend.forward_decodec                 C  r   )zRun a forward for extend.r   rA   r   r   r   r>   ‡   rB   zAttentionBackend.forward_extendc                 C  r   )zRun a forward for mix.r   rA   r   r   r   r=   “   rB   zAttentionBackend.forward_mixedc                 C  ó   dS )z-Check if the current backend supports triton.Tr   r)   r   r   r   Úsupport_tritonŸ   s   zAttentionBackend.support_tritonÚlayer_idÚreturnúOptional[BaseIndexerMetadata]c                 C  rC   )z;Get the indexer metadata. None means don't support indexer.Nr   )r   rE   r   r   r   r   Úget_indexer_metadata£   s   z%AttentionBackend.get_indexer_metadataN)r   r
   )r   r   r   r   )r   r   r   r   r   r   r   r   r    r!   r"   r   r#   r$   )r   r   r   r   r   r   r&   r   r    r!   r"   r   r#   r$   r'   r!   )r#   r   r,   r-   )T)r/   r   r0   r   r1   r   r2   r	   r   r
   r3   r4   )rE   r   r   r
   rF   rG   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r%   r(   r*   r+   r.   r@   r;   r>   r=   rD   rH   r   r   r   r   r      s(    



ù3ùùùr   )Ú
__future__r   Úabcr   r   Útypingr   r   ÚtorchÚsglang.srt.utils.commonr   Ú+sglang.srt.layers.attention.nsa.nsa_indexerr   Ú!sglang.srt.layers.radix_attentionr	   Ú,sglang.srt.model_executor.forward_batch_infor
   r   Ú sglang.srt.speculative.spec_infor   r   r   r   r   r   Ú<module>   s    