# NOTE: source reconstructed from a compiled-bytecode (.pyc) dump of
# vllm/v1/attention/backends/mamba_attn.py. Identifiers, docstrings, and
# module paths below come from strings embedded in the dump; some attribute
# chains and control flow are inferred and marked where uncertain.

import abc
import copy
from dataclasses import dataclass
from typing import ClassVar, TypeVar

import torch

from vllm.config import VllmConfig
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backend import (
    AttentionCGSupport,
    AttentionMetadataBuilder,
    CommonAttentionMetadata,
)
from vllm.v1.attention.backends.utils import (
    PAD_SLOT_ID,
    compute_causal_conv1d_metadata,
    mamba_get_block_table_tensor,
    split_decodes_and_prefills,
)
from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec

M = TypeVar("M", bound="BaseMambaAttentionMetadata")


@dataclass
class BaseMambaAttentionMetadata:
    num_prefills: int
    num_prefill_tokens: int
    num_decodes: int
    num_decode_tokens: int
    num_reqs: int

    # "_p" tensors describe only the prefill requests in the batch; they
    # remain None for decode-only batches.
    has_initial_states_p: torch.Tensor | None
    query_start_loc_p: torch.Tensor | None
    num_computed_tokens_p: torch.Tensor | None

    state_indices_tensor: torch.Tensor

    # Mamba-block indices used when the state cache is prefix-cached
    # (mamba_cache_mode == "all"); None otherwise.
    block_idx_last_scheduled_token: torch.Tensor | None
    block_idx_first_scheduled_token_p: torch.Tensor | None
    block_idx_last_computed_token: torch.Tensor | None

    seq_lens: torch.Tensor

    # Causal-conv1d scheduling metadata, produced by
    # compute_causal_conv1d_metadata for batches that contain prefills.
    nums_dict: dict | None = None
    batch_ptr: torch.Tensor | None = None
    token_chunk_offset_ptr: torch.Tensor | None = None
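
# Batch layout convention (as implied by split_decodes_and_prefills below):
# requests are reordered decode-first, so the ranges [0, num_decodes) and
# [num_reqs - num_prefills, num_reqs) address the decode and prefill
# portions of the batch, and the "_p" tensors above cover only the prefill
# tail.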


class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
    metadata_cls: type[M]
    reorder_batch_threshold: int = 1
    _cudagraph_support: ClassVar[AttentionCGSupport] = (
        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
    )
    supports_update_block_table: bool = True

    def __init__(
        self,
        kv_cache_spec: AttentionSpec,
        layer_names: list[str],
        vllm_config: VllmConfig,
        device: torch.device,
    ) -> None:
        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
        assert isinstance(kv_cache_spec, MambaSpec)
        self.compilation_config = vllm_config.compilation_config
        self.decode_cudagraph_max_bs = (
            self.vllm_config.scheduler_config.max_num_seqs
        )
        if self.compilation_config.max_cudagraph_capture_size is not None:
            self.decode_cudagraph_max_bs = min(
                self.decode_cudagraph_max_bs,
                self.compilation_config.max_cudagraph_capture_size,
            )

        # Persistent buffers reused across full-cudagraph decode capture
        # and replay.
        if self.vllm_config.cache_config.mamba_cache_mode == "all":
            # Prefix caching keeps one state slot per mamba block, so the
            # persistent index buffer needs a full row of blocks per request.
            self.state_indices_tensor = torch.empty(
                (
                    self.decode_cudagraph_max_bs,
                    cdiv(
                        self.vllm_config.model_config.max_model_len,
                        self.kv_cache_spec.block_size,
                    ),
                ),
                dtype=torch.int32,
                device=device,
            )
            self.block_idx_last_scheduled_token = torch.empty(
                (self.decode_cudagraph_max_bs,),
                dtype=torch.int32,
                device=device,
            )
            self.block_idx_last_computed_token = torch.empty(
                (self.decode_cudagraph_max_bs,),
                dtype=torch.int32,
                device=device,
            )
        else:
            self.state_indices_tensor = torch.empty(
                (self.decode_cudagraph_max_bs,),
                dtype=torch.int32,
                device=device,
            )

    def build_for_cudagraph_capture(
        self, common_attn_metadata: CommonAttentionMetadata
    ) -> M:
        """
        This method builds the metadata for full cudagraph capture.
        Currently, only decode is supported for full cudagraphs with Mamba.
        """
        m = common_attn_metadata
        assert m.num_reqs == m.num_actual_tokens, (
            "Mamba only supports decode-only full CUDAGraph capture. "
            "Make sure all cudagraph capture sizes <= max_num_seq."
        )
        m.max_query_len = 1  # decode-only capture: one token per request
        return self.build(0, m)
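
    # For a pure decode batch every request contributes exactly one token,
    # so num_reqs == num_actual_tokens; any prefill request would push
    # num_actual_tokens above num_reqs and trip the assert in
    # build_for_cudagraph_capture above.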

    def build(
        self,
        common_prefix_len: int,
        common_attn_metadata: CommonAttentionMetadata,
        fast_build: bool = False,
    ) -> M:
        """
        Default build implementation for Mamba-like attention backends.
        Subclasses (e.g., Mamba2) can override to add additional metadata.
        """
        return self._compute_common_metadata(common_attn_metadata)
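
    # Worked example for _compute_prefix_caching_block_indices below
    # (illustrative numbers, not from the original source): with
    # mamba_block_size = 64, a request with 100 computed tokens and
    # seq_len = 130 after scheduling yields
    #   block_idx_last_computed_token   = cdiv(100, 64) - 1 = 1
    #   block_idx_first_scheduled_token = cdiv(101, 64) - 1 = 1
    #   block_idx_last_scheduled_token  = cdiv(130, 64) - 1 = 2
    # i.e. this step resumes in partially-filled block 1 and finishes in
    # block 2. The clamps guard the fresh-request case
    # num_computed_tokens == 0, where cdiv(0, 64) - 1 would give -1.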

    def _compute_prefix_caching_block_indices(
        self,
        common_attn_metadata: CommonAttentionMetadata,
        mamba_block_size: int,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        num_computed_tokens = (
            common_attn_metadata.compute_num_computed_tokens()
        )
        block_idx_last_computed_token = (
            cdiv(num_computed_tokens, mamba_block_size) - 1
        )
        block_idx_first_scheduled_token = (
            cdiv(num_computed_tokens + 1, mamba_block_size) - 1
        )
        block_idx_last_scheduled_token = (
            cdiv(common_attn_metadata.seq_lens, mamba_block_size) - 1
        )
        # Clamp to 0 so fresh requests (no computed tokens yet) do not
        # produce negative block indices.
        block_idx_last_computed_token = torch.clamp(
            block_idx_last_computed_token, min=0
        )
        block_idx_first_scheduled_token = torch.clamp(
            block_idx_first_scheduled_token, min=0
        )
        return (
            block_idx_last_computed_token,
            block_idx_first_scheduled_token,
            block_idx_last_scheduled_token,
        )

    def _compute_common_metadata(
        self, common_attn_metadata: CommonAttentionMetadata
    ) -> M:
        """
        Compute metadata common to both Mamba1 and Mamba2.
        """
        num_reqs = common_attn_metadata.num_reqs
        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
            split_decodes_and_prefills(
                common_attn_metadata,
                decode_threshold=self.reorder_batch_threshold,
            )
        )

        has_initial_states_p = None
        query_start_loc_p = None
        num_computed_tokens = None
        num_computed_tokens_p = None
        nums_dict = None
        batch_ptr = None
        token_chunk_offset_ptr = None
        block_idx_first_scheduled_token_p = None
        (
            block_idx_last_computed_token,
            block_idx_first_scheduled_token,
            block_idx_last_scheduled_token,
        ) = (None, None, None)

        if self.vllm_config.cache_config.mamba_cache_mode == "all":
            # Prefix caching: each request keeps its whole row of mamba
            # blocks, bracketed by the block indices for this step.
            num_computed_tokens = (
                common_attn_metadata.compute_num_computed_tokens()
            )
            state_indices_tensor = common_attn_metadata.block_table_tensor
            mamba_block_size = self.kv_cache_spec.block_size
            (
                block_idx_last_computed_token,
                block_idx_first_scheduled_token,
                block_idx_last_scheduled_token,
            ) = self._compute_prefix_caching_block_indices(
                common_attn_metadata, mamba_block_size
            )
        else:
            # One state slot per request: keep only the first block-table
            # column.
            state_indices_tensor = mamba_get_block_table_tensor(
                common_attn_metadata.block_table_tensor,
                common_attn_metadata.seq_lens,
                self.kv_cache_spec,
                self.vllm_config.cache_config.mamba_cache_mode,
            )[:, 0]

        if num_prefills > 0:
            if num_computed_tokens is None:
                num_computed_tokens = (
                    common_attn_metadata.compute_num_computed_tokens()
                )
            # CPU-side counterpart of query_start_loc_p (present in the
            # dump; its exact consumer is not recoverable).
            query_start_loc_p_cpu = (
                common_attn_metadata.query_start_loc_cpu[num_decodes:]
                - num_decode_tokens
            )
            query_start_loc_p = (
                common_attn_metadata.query_start_loc[num_decodes:]
                - num_decode_tokens
            )
            has_initial_states_p = (
                num_computed_tokens[num_reqs - num_prefills : num_reqs] > 0
            )
            nums_dict, batch_ptr, token_chunk_offset_ptr = (
                compute_causal_conv1d_metadata(query_start_loc_p)
            )
            if self.vllm_config.cache_config.mamba_cache_mode == "all":
                assert num_computed_tokens is not None
                num_computed_tokens_p = num_computed_tokens[
                    num_reqs - num_prefills : num_reqs
                ]
                assert block_idx_first_scheduled_token is not None
                block_idx_first_scheduled_token_p = (
                    block_idx_first_scheduled_token[
                        num_reqs - num_prefills : num_reqs
                    ]
                )
        elif (
            num_decodes <= self.decode_cudagraph_max_bs
            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
        ):
            # Decode-only batch that fits the persistent buffers: stage the
            # indices there so graph replay always reads the same storage,
            # and pad the unused tail with PAD_SLOT_ID. (Padding size via
            # pad_for_cudagraph is inferred from the staging scheme.)
            num_input_tokens = self.vllm_config.pad_for_cudagraph(
                common_attn_metadata.num_actual_tokens
            )
            self.state_indices_tensor[:num_decodes].copy_(
                state_indices_tensor, non_blocking=True
            )
            state_indices_tensor = self.state_indices_tensor[:num_input_tokens]
            state_indices_tensor[num_decodes:] = PAD_SLOT_ID
            if self.vllm_config.cache_config.mamba_cache_mode == "all":
                self.block_idx_last_scheduled_token[:num_decodes].copy_(
                    block_idx_last_scheduled_token, non_blocking=True
                )
                block_idx_last_scheduled_token = (
                    self.block_idx_last_scheduled_token[:num_input_tokens]
                )
                self.block_idx_last_computed_token[:num_decodes].copy_(
                    block_idx_last_computed_token, non_blocking=True
                )
                block_idx_last_computed_token = (
                    self.block_idx_last_computed_token[:num_input_tokens]
                )

        return self.metadata_cls(
            num_prefills=num_prefills,
            num_prefill_tokens=num_prefill_tokens,
            num_decodes=num_decodes,
            num_decode_tokens=num_decode_tokens,
            num_reqs=num_reqs,
            has_initial_states_p=has_initial_states_p,
            query_start_loc_p=query_start_loc_p,
            num_computed_tokens_p=num_computed_tokens_p,
            state_indices_tensor=state_indices_tensor,
            block_idx_last_scheduled_token=block_idx_last_scheduled_token,
            block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
            block_idx_last_computed_token=block_idx_last_computed_token,
            seq_lens=common_attn_metadata.seq_lens,
            nums_dict=nums_dict,
            batch_ptr=batch_ptr,
            token_chunk_offset_ptr=token_chunk_offset_ptr,
        )
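
    # Note on update_block_table below: copy.copy produces a shallow copy,
    # which is sufficient because only the state_indices_tensor field is
    # rebound; every other metadata field is shared with the original object.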

    def update_block_table(
        self,
        metadata: M,
        blk_table: torch.Tensor,
        slot_mapping: torch.Tensor,
    ) -> M:
        new_metadata = copy.copy(metadata)
        state_indices_t = mamba_get_block_table_tensor(
            blk_table,
            metadata.seq_lens,
            self.kv_cache_spec,
            self.vllm_config.cache_config.mamba_cache_mode,
        )
        if self.vllm_config.cache_config.mamba_cache_mode in ("none", "align"):
            # One state slot per request: keep only the first column.
            state_indices_t = state_indices_t[:, 0]
        num_reqs = state_indices_t.shape[0]
        if (
            metadata.num_prefills == 0
            and num_reqs <= self.decode_cudagraph_max_bs
            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
        ):
            # Stage the updated indices into the persistent buffer so that
            # full-cudagraph replay keeps reading the same storage.
            persistent_state_indices_t = self.state_indices_tensor[:num_reqs]
            persistent_state_indices_t.copy_(
                state_indices_t, non_blocking=True
            )
            state_indices_t = persistent_state_indices_t
        new_metadata.state_indices_tensor = state_indices_t
        return new_metadata
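

# Minimal usage sketch (hypothetical subclass, not part of this module):
# a concrete backend pins `metadata_cls` to its metadata dataclass and
# inherits the common build logic unchanged, e.g.
#
#   class MyMambaAttentionMetadataBuilder(
#       BaseMambaAttentionMetadataBuilder[BaseMambaAttentionMetadata]
#   ):
#       metadata_cls = BaseMambaAttentionMetadata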