o
    پi                     @   st   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ G d	d
 d
eZdS )    )OptionalN)AttentionBackend)BaseIndexerMetadata)RadixAttention)ForwardBatchForwardMode)ModelRunner)	SpecInputc                   @   sD  e Zd ZdZdededefddZdedefd	d
Zde	fddZ
dedefddZdededejdejdeej dedee fddZdedejdejdedeej dedee deej fddZdd Z	 d.d!ejd"ejd#ejd$ede	d%efd&d'Z	 d.d!ejd"ejd#ejd$ede	d%efd(d)Zd*ede	dee fd+d,Zd-S )/HybridAttnBackendz2Support different backends for prefill and decode.model_runnerprefill_backenddecode_backendc                 C   s   || _ || _|| _|j| _d S N)r   r   r   kv_cache_dtype	data_type)selfr   r   r    r   c/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/hybrid_attn_backend.py__init__   s   zHybridAttnBackend.__init__forward_modereturnc                 C   s>   |  r| jS | s| r| jjjdkr| jS | jS | jS )a  
        Select the appropriate attention backend based on the forward mode.

        Args:
            forward_mode: The current forward mode indicating the operation type

        Returns:
            The selected attention backend (prefill or decode)

        Note:
            - decode_or_idle: Always uses decode backend
            - target_verify or draft_extend: Uses decode backend if speculative_attention_mode is "decode", otherwise prefill backend
            - prefill: Always uses prefill backend
        decode)is_decode_or_idler   is_target_verifyis_draft_extendr   server_argsspeculative_attention_moder   )r   r   r   r   r   _select_backend   s   z!HybridAttnBackend._select_backendforward_batchc                 C   s   |  |j}|| d S r   )r   r   init_forward_metadata)r   r   backendr   r   r   r   5   s   z'HybridAttnBackend.init_forward_metadatamax_bsmax_num_tokensc                 C   sD   | j || | jjjd ur| jjjdkr | j|| d S d S d S )Nprefill)r   init_cuda_graph_stater   r   speculative_algorithmr   r   )r   r!   r"   r   r   r   r$   9   s   z'HybridAttnBackend.init_cuda_graph_statebs
num_tokensreq_pool_indicesseq_lensencoder_lens	spec_infoc           	   	   C   s$   |  |}|||||||| d S r   )r   (init_forward_metadata_capture_cuda_graph)	r   r&   r'   r(   r)   r*   r   r+   r    r   r   r   r,   C   s   

z:HybridAttnBackend.init_forward_metadata_capture_cuda_graphseq_lens_sumseq_lens_cpuc	           
   
   C   s&   |  |}	|	|||||||| d S r   )r   'init_forward_metadata_replay_cuda_graph)
r   r&   r(   r)   r-   r*   r   r+   r.   r    r   r   r   r/   X   s   
z9HybridAttnBackend.init_forward_metadata_replay_cuda_graphc                 C   s
   | j  S r   )r   !get_cuda_graph_seq_len_fill_value)r   r   r   r   r0   o   s   
z3HybridAttnBackend.get_cuda_graph_seq_len_fill_valueTqkvlayersave_kv_cachec                 K   s   | j j||||||fi |S r   )r   forward_decode)r   r1   r2   r3   r4   r   r5   kwargsr   r   r   r6   r   s
   
z HybridAttnBackend.forward_decodec           	      K   s(   |  |j}|j||||||fi |S r   )r   r   forward_extend)	r   r1   r2   r3   r4   r   r5   r7   r    r   r   r   r8      s   
z HybridAttnBackend.forward_extendlayer_idc                 C   s   |  |j}|||S r   )r   r   get_indexer_metadata)r   r9   r   r    r   r   r   r:      s   z&HybridAttnBackend.get_indexer_metadataN)T)__name__
__module____qualname____doc__r   r   r   r   r   r   r   intr$   torchTensorr   r	   r,   r/   r0   r   boolr6   r8   r   r:   r   r   r   r   r
      s    


	



r
   )typingr   r@   -sglang.srt.layers.attention.base_attn_backendr   +sglang.srt.layers.attention.nsa.nsa_indexerr   !sglang.srt.layers.radix_attentionr   ,sglang.srt.model_executor.forward_batch_infor   r   &sglang.srt.model_executor.model_runnerr    sglang.srt.speculative.spec_infor	   r
   r   r   r   r   <module>   s    