"""Attention layer with TreeAttention."""

import ast
from dataclasses import dataclass
from typing import ClassVar, Optional

import torch

from vllm import _custom_ops as ops
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.v1.attention.backend import (
    AttentionBackend,
    AttentionImpl,
    AttentionMetadataBuilder,
    AttentionType,
    CommonAttentionMetadata,
    MultipleOf,
)
from vllm.v1.attention.backends.utils import split_decodes_and_prefills
from vllm.v1.attention.ops.triton_unified_attention import unified_attention
from vllm.v1.kv_cache_interface import AttentionSpec

logger = init_logger(__name__)


class TreeAttentionBackend(AttentionBackend):

    accept_output_buffer: bool = True
    supported_dtypes: ClassVar[list[torch.dtype]] = [
        torch.float16, torch.bfloat16
    ]

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        return [MultipleOf(16)]

    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return [32, 64, 96, 128, 160, 192, 224, 256]

    @staticmethod
    def get_name() -> str:
        return "TREE_ATTN"

    @staticmethod
    def get_impl_cls() -> type["TreeAttentionImpl"]:
        return TreeAttentionImpl

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
        cache_dtype_str: str = "auto",
    ) -> tuple[int, ...]:
        if block_size % 16 != 0:
            raise ValueError("Block size must be a multiple of 16.")
        return (2, num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
    def get_builder_cls() -> type["TreeAttentionMetadataBuilder"]:
        return TreeAttentionMetadataBuilder

    @staticmethod
    def use_cascade_attention(*args, **kwargs) -> bool:
        return False
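
# A minimal illustrative sketch, not part of the original module: for a
# cache of 1024 blocks with block_size=16, 8 KV heads, and head_size=128,
# get_kv_cache_shape() above yields (2, 1024, 16, 8, 128). Index 0 of the
# leading dimension holds keys and index 1 holds values, which is the
# layout that kv_cache.unbind(0) in TreeAttentionImpl.forward() relies on:
#
#   TreeAttentionBackend.get_kv_cache_shape(1024, 16, 8, 128)
#   # -> (2, 1024, 16, 8, 128)
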

@dataclass
class TreeAttentionMetadata:
    # Total number of tokens in the batch, excluding padding.
    num_actual_tokens: int
    max_query_len: int
    query_start_loc: torch.Tensor
    max_seq_len: int
    seq_lens: torch.Tensor
    block_table: torch.Tensor
    slot_mapping: torch.Tensor

    num_prefill_tokens: int = 0
    num_decode_tokens: int = 0
    num_prefills: int = 0
    num_decodes: int = 0

    tree_attn_bias: torch.Tensor | None = None

    _cached_prefill_metadata: Optional["TreeAttentionMetadata"] = None
    _cached_decode_metadata: Optional["TreeAttentionMetadata"] = None

    @property
    def prefill_metadata(self) -> Optional["TreeAttentionMetadata"]:
        if self.num_prefills == 0:
            return None
        if self._cached_prefill_metadata is not None:
            # Reuse the cached metadata.
            return self._cached_prefill_metadata

        # Prefill requests are placed after the decode requests in the
        # batch, so slice off the trailing entries.
        q_start_loc = self.query_start_loc[self.num_decodes:]
        q_seqlens = torch.diff(q_start_loc)
        kv_seqlens = self.seq_lens[self.num_decodes:]
        self._cached_prefill_metadata = TreeAttentionMetadata(
            num_actual_tokens=self.num_prefill_tokens,
            max_query_len=int(q_seqlens.max().item()),
            query_start_loc=q_start_loc - q_start_loc[0],
            max_seq_len=int(kv_seqlens.max().item()),
            seq_lens=kv_seqlens,
            block_table=self.block_table[self.num_decodes:],
            slot_mapping=self.slot_mapping[self.num_decode_tokens:],
        )
        return self._cached_prefill_metadata

    @property
    def decode_metadata(self) -> Optional["TreeAttentionMetadata"]:
        if self.num_decodes == 0:
            return None
        if self._cached_decode_metadata is not None:
            # Reuse the cached metadata.
            return self._cached_decode_metadata

        # Decode requests are placed at the front of the batch.
        q_start_loc = self.query_start_loc[:self.num_decodes + 1]
        q_seqlens = torch.diff(q_start_loc)
        kv_seqlens = self.seq_lens[:self.num_decodes]
        self._cached_decode_metadata = TreeAttentionMetadata(
            num_actual_tokens=self.num_decode_tokens,
            max_query_len=int(q_seqlens.max().item()),
            query_start_loc=q_start_loc,
            max_seq_len=int(kv_seqlens.max().item()),
            seq_lens=kv_seqlens,
            block_table=self.block_table[:self.num_decodes],
            slot_mapping=self.slot_mapping[:self.num_decode_tokens],
            tree_attn_bias=self.tree_attn_bias,
        )
        return self._cached_decode_metadata
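
# A minimal illustrative sketch, not part of the original module: suppose a
# batch holds two decode requests drafting 3-token trees each, followed by
# one prefill request of 5 tokens (num_decodes=2, num_decode_tokens=6,
# num_prefills=1, num_prefill_tokens=5). Then decode_metadata views rows
# [0:2] of the block table and tokens [0:6] and carries the tree bias,
# while prefill_metadata views the remainder and re-bases query_start_loc
# so the prefill kernel sees offsets starting at zero.
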

class TreeAttentionMetadataBuilder(
        AttentionMetadataBuilder[TreeAttentionMetadata]):

    def __init__(
        self,
        kv_cache_spec: AttentionSpec,
        layer_names: list[str],
        vllm_config: VllmConfig,
        device: torch.device,
    ):
        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
        self.block_size = kv_cache_spec.block_size

        spec_config = vllm_config.speculative_config
        spec_token_tree = None
        if spec := spec_config:
            spec_token_tree = spec.speculative_token_tree
        # Fall back to a single-path "tree" when no token tree is configured.
        tree_choices: list[tuple[int, ...]] = (ast.literal_eval(
            spec_token_tree) if spec_token_tree is not None else [(0, )])
        depth_counts = _get_depth_counts(tree_choices)
        self.tree_attn_bias = _prepare_tree_attn_bias(
            tree_choices,
            depth_counts,
            dtype=torch.float32,
            device=device,
        )
        self.reorder_batch_threshold = self.tree_attn_bias.shape[0]

    def build(
        self,
        common_prefix_len: int,
        common_attn_metadata: CommonAttentionMetadata,
        fast_build: bool = False,
    ) -> TreeAttentionMetadata:
        decode_threshold = self.tree_attn_bias.shape[0]
        (num_decodes, num_prefills, num_decode_tokens,
         num_prefill_tokens) = split_decodes_and_prefills(
             common_attn_metadata, decode_threshold=decode_threshold)

        num_actual_tokens = common_attn_metadata.num_actual_tokens
        q_start_loc = common_attn_metadata.query_start_loc
        max_query_len = common_attn_metadata.max_query_len
        kv_seqlens = common_attn_metadata.seq_lens
        max_seq_len = common_attn_metadata.max_seq_len
        block_table = common_attn_metadata.block_table_tensor
        slot_mapping = common_attn_metadata.slot_mapping

        return TreeAttentionMetadata(
            num_actual_tokens=num_actual_tokens,
            max_query_len=max_query_len,
            query_start_loc=q_start_loc,
            max_seq_len=max_seq_len,
            seq_lens=kv_seqlens,
            block_table=block_table,
            slot_mapping=slot_mapping,
            num_prefill_tokens=num_prefill_tokens,
            num_decode_tokens=num_decode_tokens,
            num_prefills=num_prefills,
            num_decodes=num_decodes,
            tree_attn_bias=self.tree_attn_bias,
        )

    def build_for_drafting(
        self,
        common_attn_metadata: CommonAttentionMetadata,
        draft_index: int,
    ) -> TreeAttentionMetadata:
        orig_tree_attn_bias = self.tree_attn_bias
        if draft_index == 0:
            # The first draft step attends causally to the processed tokens,
            # so no tree bias is applied.
            self.tree_attn_bias = torch.empty(0)
        else:
            # Use the sub-block of the tree bias that corresponds to the
            # current drafting step, skipping the root node.
            start, end = 1, 1 + common_attn_metadata.max_query_len
            self.tree_attn_bias = orig_tree_attn_bias[
                start:end, start:end].contiguous()

        attn_metadata = self.build(0, common_attn_metadata, fast_build=True)
        # Restore the full tree attention bias.
        self.tree_attn_bias = orig_tree_attn_bias
        return attn_metadata

def _get_depth_counts(
        sorted_tree_choices: list[tuple[int, ...]]) -> list[int]:
    # Count the number of choices at each depth of the tree.
    depth_counts = []
    prev_depth = 0
    for path in sorted_tree_choices:
        depth = len(path)
        if depth != prev_depth:
            depth_counts.append(0)
        depth_counts[depth - 1] += 1
        prev_depth = depth
    return depth_counts
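
# A minimal worked example, not part of the original module: tree choices
# are paths from the root, sorted by depth. For the two-level tree
#
#   tree_choices = [(0, ), (1, ), (0, 0), (0, 1)]
#
# there are two nodes at depth 1 and two nodes at depth 2, so
# _get_depth_counts(tree_choices) returns [2, 2].
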

def _prepare_tree_attn_bias(
    sorted_tree_choices: list[tuple[int, ...]],
    depth_counts: list[int],
    dtype: torch.dtype | None,
    device: torch.device | None,
) -> torch.Tensor:
    # +1 comes from the additional root node.
    tree_len = len(sorted_tree_choices) + 1
    tree_attn_mask = torch.full((tree_len, tree_len),
                                -torch.inf,
                                device=device,
                                dtype=dtype)

    mask_val = 0
    # Every node attends to itself.
    for i in range(tree_len):
        tree_attn_mask[i, i] = mask_val
    # Every node attends to the root.
    tree_attn_mask[:, 0] = mask_val

    # Unmask each node's ancestors, depth by depth.
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_tree_choice = sorted_tree_choices[start + j]
            if len(cur_tree_choice) == 1:
                # Depth-1 nodes only attend to the root and themselves.
                continue

            ancestor_idx = []
            for c in range(len(cur_tree_choice) - 1):
                ancestor_idx.append(
                    sorted_tree_choices.index(cur_tree_choice[:c + 1]) + 1)
            tree_attn_mask[j + start + 1, ancestor_idx] = mask_val
        start += depth_counts[i]
    return tree_attn_mask
ededB ddfddZ				dde
jjde
jde
jde
jde
jdede
jdB de
jdB de
jdB de
jfddZdS )r'   N	num_headsr/   scaler.   alibi_slopessliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_namer   c                 C   s   || _ || _t|| _|| _| j | j | _|| _|
| _|d ur'tj	|tj
d}|| _|d u r0d}|| _|d u r;d| _n|d df| _|	tjkrKtdd S )N)rB   r   )r   ra   zeEncoder self-attention and encoder/decoder cross-attention are not implemented for TreeAttentionImpl.)r   r/   floatr   r.   num_queries_per_kvr   r   r>   tensorrq   r   r   r   r   DECODERNotImplementedError)r\   r   r/   r   r.   r   r   r   r   r   r   r   r   r   rj   !  s*   

zTreeAttentionImpl.__init__layerquerykeyvaluekv_cacher   outputoutput_scaleoutput_block_scalec
              
   C   s  |dusJ d|dus|	durt d|du r|dS |d\}
}| jdu r9t|||
||j| j|j|j	 |j
}|j}|jjd d |jd f}|j }rtdi d||| d|
d|d	||| d
|jd|jd|jd|jd| jddd| jd| jd|jd| jddd|j|d|j	| |j }rtdi d|d| d|
d|d	|d| d
|jd|jd|jd|jd| jddd| jd|jd| jd|jd| jddd|j|d|j	| |S )a  Forward pass with TreeAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zDfused output quantization is not yet supported for TreeAttentionImplr   ra   qkvoutcu_seqlens_qmax_seqlen_q	seqused_kmax_seqlen_ksoftmax_scalecausalTr   window_sizerO   softcap	q_descale	k_descale	v_descaleqq_biasr   )r   fill_unbindr   opsreshape_and_cache_flashrP   r   _k_scale_v_scalerJ   rR   rL   rr   r`   r   rK   rN   rM   r   r   r   rO   r   expandrb   rU   )r\   r   r   r   r   r   r   r   r   r   	key_cachevalue_cacherJ   rR   descale_shapeprefill_metadecode_metar   r   r   forwardI  s   


	

	
zTreeAttentionImpl.forward)NNN)r9   r:   r;   r   r   rD   r   rA   rF   rj   r>   nnModulerc   rI   r   r   r   r   r   r'      sf    

	

0	