o
    id=                  
   @   sD  d Z ddlZddlmZ ddlmZ ddlZddlmZ	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ eeZG dd deZeG dd dZG dd dee Zde e!e"df  de e" fddZ#de e!e"df  de e" dej$dB dej%dB dej&f
ddZ'G dd deZ(dS )z#Attention layer with TreeAttention.    N)	dataclass)ClassVar)_custom_ops)
VllmConfig)init_logger)AttentionBackendAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)split_decodes_and_prefills)unified_attention)AttentionSpecc                   @   s   e Zd ZU dZeed< ejejgZ	e
eej  ed< edeeeB  fddZedee fddZedefd	d
Zeded fddZe	ddedededededeedf fddZeded fddZedefddZdS )TreeAttentionBackendTaccept_output_buffersupported_dtypesreturnc                   C   s
   t dgS )N   )r    r   r   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/attention/backends/tree_attn.py get_supported_kernel_block_sizes#   s   
z5TreeAttentionBackend.get_supported_kernel_block_sizesc                 C   s   g dS )N)    @   `                  r   )clsr   r   r   get_supported_head_sizes'   s   z-TreeAttentionBackend.get_supported_head_sizesc                   C      dS )N	TREE_ATTNr   r   r   r   r   get_name+      zTreeAttentionBackend.get_nameTreeAttentionImplc                   C      t S N)r&   r   r   r   r   get_impl_cls/   r%   z!TreeAttentionBackend.get_impl_clsauto
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 C   s"   |d dkr
t dd| |||fS )Nr   r   z$Block size must be a multiple of 16.   )
ValueError)r+   r,   r-   r.   r/   r   r   r   get_kv_cache_shape3   s   z'TreeAttentionBackend.get_kv_cache_shapeTreeAttentionMetadataBuilderc                   C   r'   r(   )r3   r   r   r   r   get_builder_cls?   r%   z$TreeAttentionBackend.get_builder_clsc                  O   r"   )NFr   )argskwargsr   r   r   use_cascade_attentionC   r%   z*TreeAttentionBackend.use_cascade_attentionN)r*   )__name__
__module____qualname__r   bool__annotations__torchfloat16bfloat16r   r   listdtypestaticmethodintr   r   classmethodr!   strr$   typer)   tupler2   r4   r7   r   r   r   r   r      s<   
 
r   c                   @   s   e Zd ZU eed< eed< ejed< eed< ejed< ejed< ejed< dZeed	< dZeed
< dZ	eed< dZ
eed< dZejdB ed< dZded< dZded< edddZedddZdS )TreeAttentionMetadatanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappingr   num_prefill_tokensnum_decode_tokensnum_prefillsnum_decodesNtree_attn_biasTreeAttentionMetadata | None_cached_prefill_metadata_cached_decode_metadatar   c              
   C   s   | j dkrd S | jd ur| jS | j| jd  }t|}| j| jd  }t| jt	|
  ||d  t	|
  || j| jd  | j| jd  d| _| jS )Nr   )rI   rJ   rK   rL   rM   rN   rO   )rR   rV   rK   rS   r=   diffrM   rH   rP   rC   maxitemrN   rO   rQ   selfq_start_loc	q_seqlens
kv_seqlensr   r   r   prefill_metadata]   s"   



	z&TreeAttentionMetadata.prefill_metadatac              
   C   s   | j dkrd S | jd ur| jS | jd | jd  }t|}| jd | j }t| j t|	 
 |t|	 
 || jd | j | jd | j  | jd| _| jS )Nr      )rI   rJ   rK   rL   rM   rN   rO   rT   )rQ   rW   rK   rS   r=   rX   rM   rH   rC   rY   rZ   rN   rO   rT   r[   r   r   r   decode_metadatav   s$   



z%TreeAttentionMetadata.decode_metadata)r   rU   )r8   r9   r:   rC   r<   r=   TensorrP   rQ   rR   rS   rT   rV   rW   propertyr`   rb   r   r   r   r   rH   H   s&   
 



rH   c                	       sh   e Zd Zdedee dedejf fddZ		dde
d	ed
edefddZd	ede
defddZ  ZS )r3   kv_cache_speclayer_namesvllm_configdevicec           
         sx   t  |||| |j| _|j}d }| }r|j}|d ur"t|ndg}t|}	t||	t	j
|d| _| jjd | _d S )N)r   )rA   rh   r   )super__init__r,   speculative_configspeculative_token_treeastliteral_eval_get_depth_counts_prepare_tree_attn_biasr=   float32rT   shapereorder_batch_threshold)
r\   re   rf   rg   rh   spec_configspec_token_treespectree_choicesdepth_counts	__class__r   r   rj      s    z%TreeAttentionMetadataBuilder.__init__Fcommon_prefix_lencommon_attn_metadata
fast_buildr   c                 C   sl   | j jd }t||d\}}}}|j}	|j}
|j}|j}|j}|j}|j	}t
|	||||||
||||| j dS )Nr   )decode_threshold)rI   rP   rQ   rR   rS   rJ   rK   rL   rM   rN   rO   rT   )rT   rr   r   rI   rK   rJ   rM   rL   block_table_tensorrO   rH   )r\   r{   r|   r}   r~   rS   rR   rQ   rP   rI   r]   rJ   r_   rL   rN   rO   r   r   r   build   s4   
z"TreeAttentionMetadataBuilder.builddraft_indexc                 C   sb   | j }|dkrtd| _ ndd|j }}| j ||||f  | _ | jd|dd}|| _ |S )Nr   ra   T)r}   )rT   r=   emptyrJ   
contiguousr   )r\   r|   r   orig_tree_attn_biasstartendattn_metadatar   r   r   build_for_drafting   s   z/TreeAttentionMetadataBuilder.build_for_drafting)F)r8   r9   r:   r   r@   rE   r   r=   rh   rj   rC   r   r;   rH   r   r   __classcell__r   r   ry   r   r3      s6    !
$r3   sorted_tree_choices.r   c                 C   sH   g }d}| D ]}t |}||kr|d ||d   d7  < |}q|S )Nr   ra   )lenappend)r   rx   
prev_depthpathdepthr   r   r   ro      s   
ro   rx   rA   rh   c                 C   s   t | d }tj||ftj ||d}d}t|D ]}||||f< q||d d df< d}tt |D ]E}t|| D ]6}	| ||	  }
t |
dkrIq:g }tt |
d D ]}|| |
d |d  d  qS|||	| d |f< q:||| 7 }q2|S )Nra   )rh   rA   r   )r   r=   fullinfranger   index)r   rx   rA   rh   tree_lentree_attn_maskmask_valir   jcur_tree_choiceancestor_idxcr   r   r   rp      s,   rp   c                   @   s   e Zd Zdejdfdededededee dB dedB ded	edB d
ededB ddfddZ				dde
jjde
jde
jde
jde
jdede
jdB de
jdB de
jdB de
jfddZdS )r&   N	num_headsr.   scaler-   alibi_slopessliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_namer   c                 C   s   || _ || _t|| _|| _| j | j | _|| _|
| _|d ur'tj	|tj
d}|| _|d u r0d}|| _|d u r;d| _n|d df| _|	tjkrKtdd S )N)rA   r   )r   ra   zeEncoder self-attention and encoder/decoder cross-attention are not implemented for TreeAttentionImpl.)r   r.   floatr   r-   num_queries_per_kvr   r   r=   tensorrq   r   r   r   r
   DECODERNotImplementedError)r\   r   r.   r   r-   r   r   r   r   r   r   r   r   r   rj   !  s*   

zTreeAttentionImpl.__init__layerquerykeyvaluekv_cacher   outputoutput_scaleoutput_block_scalec
              
   C   s  |dusJ d|dus|	durt d|du r|dS |d\}
}| jdu r9t|||
||j| j|j|j	 |j
}|j}|jjd d |jd f}|j }rtdi d||| d|
d|d	||| d
|jd|jd|jd|jd| jddd| jd| jd|jd| jddd|j|d|j	| |j }rtdi d|d| d|
d|d	|d| d
|jd|jd|jd|jd| jddd| jd|jd| jd|jd| jddd|j|d|j	| |S )a  Forward pass with TreeAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zDfused output quantization is not yet supported for TreeAttentionImplr   ra   qkvoutcu_seqlens_qmax_seqlen_q	seqused_kmax_seqlen_ksoftmax_scalecausalTr   window_sizerN   softcap	q_descale	k_descale	v_descaleqq_biasr   )r   fill_unbindr   opsreshape_and_cache_flashrO   r   _k_scale_v_scalerI   rQ   rK   rr   r`   r   rJ   rM   rL   r   r   r   rN   r   expandrb   rT   )r\   r   r   r   r   r   r   r   r   r   	key_cachevalue_cacherI   rQ   descale_shapeprefill_metadecode_metar   r   r   forwardI  s   


	

	
zTreeAttentionImpl.forward)NNN)r8   r9   r:   r
   r   rC   r   r@   rE   rj   r=   nnModulerc   rH   r   r   r   r   r   r&      sf    

	

0	
r&   ))__doc__rm   dataclassesr   typingr   r=   vllmr   r   vllm.configr   vllm.loggerr   vllm.v1.attention.backendr   r   r	   r
   r   r    vllm.v1.attention.backends.utilsr   .vllm.v1.attention.ops.triton_unified_attentionr   vllm.v1.kv_cache_interfacer   r8   loggerr   rH   r3   r@   rG   rC   ro   rA   rh   rc   rp   r&   r   r   r   r   <module>   s<    )H"[
'