o
    پiW(                     @   sj   d dl Z d dlmZ d dlmZ d dlZd dlmZ eddG dd dZeddG d	d
 d
eZ	dS )    N)	dataclass)Optional)ForwardBatchT)kw_onlyc                   @   s   e Zd ZU ejed< ejed< dZeej ed< dZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed	< dZeej ed
< dZeej ed< dZeed< dZeed< dS )ForwardMetadataquery_start_locmamba_cache_indicesNretrieve_next_tokenretrieve_next_siblingretrieve_parent_tokentrack_conv_indicestrack_ssm_h_srctrack_ssm_h_dsttrack_ssm_final_srctrack_ssm_final_dstFis_target_verify   draft_token_num)__name__
__module____qualname__torchTensor__annotations__r	   r   r
   r   r   r   r   r   r   r   boolr   int r   r   e/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/mamba/mamba2_metadata.pyr      s   
 

r   c                   @   s   e Zd ZU dZeed< eed< eed< edddG dd dZd	Zed	B ed
< 	 e	de
jdededee
je
jf fddZe	dede
jdededd f
ddZededededd fddZd	S )Mamba2Metadataz<stable metadata across all mamba2 layers in the forward passnum_prefillsnum_prefill_tokensnum_decodesT)r   frozenc                   @   sR   e Zd ZU ejed< eed< eed< ejed< ejed< ejed< ee ed< dS )	zMamba2Metadata.MixedMetadatahas_initial_statesprep_initial_states
chunk_sizeseq_idxchunk_indiceschunk_offsetsextend_seq_lens_cpuN)	r   r   r   r   r   r   r   r   listr   r   r   r   MixedMetadata5   s   
 



r+   Nmixed_metadatar   r%   total_seqlensreturnc                 C   s   | dd }t || |dd | dk  }tj|tj| jd}tj|ftj| jd}d}t|dd |dd D ]/\}}	||| dk7 }|| | |	| | |	| dk }
}||
|  |8  < || ||
< q=||fS )a	  
        Args:
            query_start_loc (torch.Tensor): 1D tensor of cumulative sequence
                lengths, shape (num_seqs + 1,).
                The first element should be 0. Each entry represents the starting
                index of a sequence in the flattened token array.
            chunk_size (int): The size of each physical mamba chunk
                (number of tokens per chunk).
            total_seqlens (int): The total number of tokens in the batch.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
                - chunk_indices (torch.Tensor): 1D tensor of indices
                    indicating the physical chunk for each logical chunk.
                - chunk_offsets (torch.Tensor): 1D tensor of offsets
                    indicating the starting index of each logical chunk within
                    its physical chunk.

        This function computes the chunk indices and offsets for the given
        query_start_loc and chunk_size. Both are tensors of integers with length N,
        where N is the number of logical (pseudo) chunks.
        A logical chunk is a sequence of tokens that are all part of the same
        sequence and are all in the same physical mamba chunk.
        In other words, a logical chunk changes every time we cross a sequence
        boundary or a physical mamba chunk boundary.
        Logical chunks are needed to handle batched requests with initial states
        (see _state_passing_fwd and _chunk_scan_fwd).
        The chunk_indices tensor contains the index of the physical chunk for each
        logical chunk.
        The chunk_offsets tensor contains the offset (AKA starting index) of the
        logical chunk in the physical chunk.

        Example:
        query_start_loc = [0, 5, 10]
        chunk_size = 8
        total_seqlens = 10
        -> chunk_indices = [0, 0, 1]
        -> chunk_offsets = [0, 5, 0]

        In this example, we have 2 sequences, each with 5 tokens. The physical
        chunk size is 8 tokens.
        We have three logical chunks:
        - the first logical chunk starts at token 0 in the first physical chunk
            and contains all 5 tokens from the first sequence
        - the second logical chunk starts at token 5 in the first physical chunk
            and contains first 3 tokens from the second sequence
        - the third logical chunk starts at token 0 in the second physical chunk
            and contains the remaining 2 tokens from the second sequence
        r   Nr   dtypedevice)	mathceilsumr   aranger   r2   zeroszip)r   r%   r-   
cu_seqlensNr'   r(   pse_s_er   r   r   )_query_start_loc_to_chunk_indices_offsetsD   s    6"&z8Mamba2Metadata._query_start_loc_to_chunk_indices_offsetsforward_metadataseq_lensr   r   c                C   s*   t | j| j| j| j| jt|dd||d
S )zTThis path is run during CUDA graph capture, i.e. decode only, so `num_prefills` is 0r   )
r   r   r	   r
   r   r!   r   r    r   r   )r   r   r   r	   r
   r   len)rA   rB   r   r   r   r   r   prepare_decode   s   	zMamba2Metadata.prepare_decodeforward_batchc                 C   sD  |j du r|jdur|jjnd}| j||j|j |dS t|j}|j }t|j| }|j	}|dus5J |dk}	t
|	d|  }
|jd|d  }t
jt
j|t
j|jd| |d}|d d\}}|
rs| |||\}}|jdurt|jddnd}t||j|j|j|j||||j || j|	|
|||||jd	d
S )zEThis path cannot run with CUDA graph, as it contains extend requests.Nr   )r   r   r   r0   )output_size)NNr   )r#   r$   r%   r&   r'   r(   r)   )r   r   r	   r
   r   r   r    r!   r   r   r,   )extend_num_tokens	spec_infor   rD   rB   forward_moder   rC   extend_seq_lensextend_prefix_lensr   anyitemr   repeat_interleaver6   int32r2   diff
unsqueeze_r@   getattrr   r   r	   r
   r   r+   r)   )clsrA   r%   rE   r   r   r    r!   context_lens_tensorr#   r$   r   r&   r(   r'   r   r   r   prepare_mixed   st   





zMamba2Metadata.prepare_mixed)r   r   r   __doc__r   r   r   r+   r,   staticmethodr   r   tupler@   r   r   rD   classmethodr   rU   r   r   r   r   r   -   sR   
 
Sr   )
r3   dataclassesr   typingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   r   r   r   r   r   <module>   s   