o
    .i@                     @   s   d Z ddlmZ ddlZddlmZ ddlmZmZm	Z	m
Z
 ddlmZmZmZmZ ddlmZmZ G dd	 d	eZeG d
d dZG dd de	e ZdS )z$Backend for GatedDeltaNet attention.    )	dataclassN)
VllmConfig)AttentionBackendAttentionCGSupportAttentionMetadataBuilderCommonAttentionMetadata)PAD_SLOT_IDcompute_causal_conv1d_metadatamamba_get_block_table_tensorsplit_decodes_and_prefills)AttentionSpec	MambaSpecc                   @   s4   e Zd ZedefddZeded fddZdS )GDNAttentionBackendreturnc                   C   s   dS )NGDN_ATTN r   r   r   `/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/attention/backends/gdn_attn.pyget_name      zGDNAttentionBackend.get_nameGDNAttentionMetadataBuilderc                   C   s   t S )N)r   r   r   r   r   get_builder_cls   r   z#GDNAttentionBackend.get_builder_clsN)__name__
__module____qualname__staticmethodstrr   typer   r   r   r   r   r      s
    r   c                   @   s  e Zd ZU eed< eed< eed< eed< eed< eed< eed< dZejdB ed	< dZejdB ed
< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZedB ed< dZejdB ed< dZejdB ed< dS )GDNAttentionMetadatanum_prefillsnum_prefill_tokensnum_decodesnum_decode_tokensnum_spec_decodesnum_spec_decode_tokensnum_actual_tokensNhas_initial_statespec_query_start_locnon_spec_query_start_locspec_state_indices_tensornon_spec_state_indices_tensorspec_sequence_masksspec_token_indxnon_spec_token_indxnum_accepted_tokens	nums_dict	batch_ptrtoken_chunk_offset_ptr)r   r   r   int__annotations__r%   torchTensorr&   r'   r(   r)   r*   r+   r,   r-   r.   dictr/   r0   r   r   r   r   r   #   s,   
 r   c                   @   s   e Zd ZU ejZdZeed< de	de
e dedejfddZ					
ddededejd	B dejd	B dedefddZdefddZd	S )r      reorder_batch_thresholdkv_cache_speclayer_namesvllm_configdevicec                 C   s  t |tsJ || _|j| _|j| _|| _| jr&| jjd us J | jj| _nd| _| jdk| _| 	d| j | jj
 | _| jjj| jd  | _| jjd urWt| j| jj| _tj| j| jd ftj|d| _tj| jftj|d| _tj| jftj|d| _tj| j| jd  ftj|d| _tj| j| jd  ftj|d| _tj| jd ftj|d| _tj| jd ftj|d| _tj| jftj|d| _d S )Nr   r6   dtyper;   )
isinstancer   r:   compilation_configspeculative_configr8   num_speculative_tokensnum_specuse_spec_decode_init_reorder_batch_thresholdcudagraph_modehas_full_cudagraphsuse_full_cuda_graphscheduler_configmax_num_seqsdecode_cudagraph_max_bsmax_cudagraph_capture_sizeminr3   emptyint32r(   r)   boolr*   r+   r,   r&   r'   r-   )selfr8   r9   r:   r;   r   r   r   __init__I   sx   


z$GDNAttentionMetadataBuilder.__init__NFcommon_prefix_lencommon_attn_metadatar-   num_decode_draft_tokens_cpu
fast_buildr   c           )      C   sP  |}|j }|j}| }	d\}
}}t|j|j| j| jjj	}d }| j
r3|d u s3||dk   dkr8d }d}n|dk}|  }|dkrKd }d }n|j|jdd}|d u r{t|dd\}}}}d}d }d }d }|d d df }d }|}|}d }n|dd  |d d  }|d usJ |dd  |d d  }||  }|dk  }|d| }|}|  | }|  | | }|dkr|dkrt|| jd  |d  } tj| tj|jd}tjdtj|jd}|d d d | jd f }d }|}d }d }nt||}!tj|!dd	}"|| }#|"d |# }|"|#d  }||d | jd f }|| df }tj|d tj|jd}tj|| d|dd  d
 tj|d| d tj|jd}tj||  d|dd  d
 tj|d| d tjd}tj||  d|dd  d
 |d usJ || }|dkr|	dk}$|d ur|$|  }$|d usJ t||jd\}
}}nd }$|j}%| jr|dkr|dkr|| jkr|| jkr| jd | j |dd | jd |% }||d  !t" | j#d | j |dd | j#d |% }||d  !d |d ur|d usJ | j$d |d j |dd | j$d |d }| j%d |d j |dd | j%d |d }| j&d |d  j |dd |d }&| j&d |%d  }||d d  !|& | j'd | j |dd | j'd |% }||d  !d | jr|dkr|dkr|| jkr| j(d | j |dd | j(d |% }||d  !t" | j)d |d  j |dd |d }'| j)d |%d  }||d d  !|' t*d!i d|d|d|d|d|d|d|jd|$d|d|d|d|d|d|d|d|d|
d|d |}(|(S )"N)NNNr   T)non_blockingr6   )decode_thresholdr<   )stable)dimout)r=   )r;   Fr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r   )+query_start_locquery_start_loc_cpucompute_num_computed_tokensr
   block_table_tensorseq_lensr8   r:   cache_configmamba_cache_moderC   sumitemtor;   r   sizerL   rB   r3   arangerN   rM   repeat_interleaveargsortzeroscumsumr	   r$   rG   rJ   r(   copy_fill_r   r*   r,   r+   r&   r-   r)   r'   r   ))rP   rR   rS   r-   rT   rU   mr\   r]   context_lens_tensorr.   r/   r0   r_   spec_sequence_masks_cpur*   r"   r    r   r!   r   r#   r+   r,   r(   r)   r&   r'   non_spec_query_start_loc_cpu
query_lensquery_lens_cpunon_spec_query_lensspec_token_sizespec_token_masksindexnum_non_spec_tokensr%   
batch_sizespec_num_query_tokensnon_spec_num_query_tokensattn_metadatar   r   r   build   s  
















	
z!GDNAttentionMetadataBuilder.buildc              
   C   sn   |}|j | jkr|j| jks#J d|j  d| j d|j d| j d	t|j}|d  }| d|||S )z
        This method builds the metadata for full cudagraph capture.
        Currently, only decode is supported for full cudagraphs with Mamba.
        zLGDN only supports decode-only full CUDAGraph capture. Make sure batch size (z) <= cudagraph capture sizes (z), and number of tokens (z).r6   r   )num_reqsrJ   r$   r3   diffr\   cpur}   )rP   rS   rn   r-   rT   r   r   r   build_for_cudagraph_capture  s    	z7GDNAttentionMetadataBuilder.build_for_cudagraph_capture)NNF)r   r   r   r   UNIFORM_BATCH_cudagraph_supportr7   r1   r2   r   listr   r   r3   r;   rQ   r   r4   rO   r   r}   r   r   r   r   r   r   D   s@   
 
O
 sr   )__doc__dataclassesr   r3   vllm.configr   vllm.v1.attention.backendr   r   r   r    vllm.v1.attention.backends.utilsr   r	   r
   r   vllm.v1.kv_cache_interfacer   r   r   r   r   r   r   r   r   <module>   s   
 