"""Attention layer with FlashAttention."""

import torch

from vllm.v1.attention.backend import AttentionType
from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
    triton_reshape_and_cache_flash_diffkv)

if is_flash_attn_varlen_func_available():
    # Assumed import location for flash_attn_varlen_func.
    from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func

from vllm.logger import init_logger
from vllm.v1.attention.backends.utils import get_kv_cache_layout

from .flash_attn import (FlashAttentionBackend, FlashAttentionImpl,
                         FlashAttentionMetadata, cascade_attention)

logger = init_logger(__name__)


class FlashAttentionDiffKVBackend(FlashAttentionBackend):
    """FlashAttention backend for KV caches whose key and value heads use
    different head sizes (head_size for K, head_size_v for V)."""

    head_size_v: int

    @classmethod
    def set_head_size_v(cls, head_size_v: int) -> None:
        cls.head_size_v = head_size_v

    @staticmethod
    def get_name() -> str:
        return "FLASH_ATTN_DIFFKV"
eded	 fd
dZe	ddedededede	deedf fddZe	ddedeedf fddZdS )FlashAttentionDiffKVBackend   head_size_vreturnNc                 C   s
   || _ d S N)r   )clsr    r   `/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/backends/flash_attn_diffkv.pyset_head_size_v    s   
z+FlashAttentionDiffKVBackend.set_head_size_vc                   C   s   dS )NFLASH_ATTN_DIFFKVr   r   r   r   r   get_name$      z$FlashAttentionDiffKVBackend.get_namer
   c                   C   s   t S r   )FlashAttentionDiffKVImplr   r   r   r   get_impl_cls(   r   z(FlashAttentionDiffKVBackend.get_impl_clsauto
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 C   s&   |d dkr
t d| |||tj fS )N   r   z$Block size must be a multiple of 16.)

    @staticmethod
    def get_kv_cache_stride_order(
            include_num_layers_dimension: bool = False) -> tuple[int, ...]:
        # Permutation mapping the logical cache shape to the physical layout.
        cache_layout = get_kv_cache_layout()
        if cache_layout == "NHD":
            if include_num_layers_dimension:
                return (0, 1, 2, 3, 4)
            stride_order = (0, 1, 2, 3)
        elif cache_layout == "HND":
            if include_num_layers_dimension:
                # Assumes the num_layers dimension leads the permutation.
                return (0, 1, 3, 2, 4)
            stride_order = (0, 2, 1, 3)
        else:
            raise ValueError(f"Unknown cache layout format {cache_layout}.")
        return stride_order

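
# A minimal illustrative sketch (hypothetical helper, not part of the vLLM
# API): how the packed cache shape from get_kv_cache_shape splits back into
# K and V views. The head sizes 128/64 are assumed for illustration.
def _example_packed_kv_cache_split() -> None:
    head_size, head_size_v = 128, 64
    FlashAttentionDiffKVBackend.set_head_size_v(head_size_v)
    shape = FlashAttentionDiffKVBackend.get_kv_cache_shape(
        num_blocks=4, block_size=16, num_kv_heads=8, head_size=head_size)
    kv_cache = torch.empty(shape)             # [4, 16, 8, 192]
    key_cache = kv_cache[..., :head_size]     # [4, 16, 8, 128]
    value_cache = kv_cache[..., head_size:]   # [4, 16, 8, 64]
    assert key_cache.shape[-1] == head_size
    assert value_cache.shape[-1] == head_size_v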
r   c                   @   sf   e Zd Z			ddejjdejdejdejdejdedejdB d	ejdB d
ejdB dejfddZdS )r   Nlayerquerykeyvaluekv_cacheattn_metadataoutputoutput_scaleoutput_block_scaler   c
                 C   sf  |dusJ d| j dusJ d|dus|	durtd|du r&|dS | j}
|j}|
tjtjfv rN| |d| |d| |d| |d| ||S |dd| j	f }|d| j	df }| j
du r{|dur{|dur{t||||j| j|j|j | jdrt| j}||}||}|jsP|j}|j}|j}|j}|j}|j}|jd d | jf}| jdkr| j|d| |d| |d| |||d| ||j ||j ||j |d	
 |S | j!durt"| j!nd}t#d+i d
|d| d|d|d|d| d|d|d|d|d| j$d|j%d| j&d|d|d| j'd|d| j d|j |d|j |d|j |d|j(d| j) |S t*|d| |d| ||fi d|jd |jd!|j+d"|j,d#|j-d$|jd| j$d| j&d%| j!d&| j'd|jd'|j.d(|j(d| j d)|j/d*|jd|jd|jd|jd| j) |S ),a  Forward pass with FlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size_v]
            kv_cache: shape =
                [num_blocks, block_size, num_kv_heads, head_size + head_size_v]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size_v]
        NOTE: For FP8 quantization, flash-attn expects the size of
              {q,k,v}_descale to be (num_sequences, num_kv_heads).
              We use torch's .expand() to avoid duplicating values.
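
        Example (illustrative sketch): expanding the per-layer scale to the
        descale shape yields a broadcast view without copying data::

            descale_shape = (num_sequences, num_kv_heads)
            q_descale = layer._q_scale.expand(descale_shape)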
        NzOutput tensor must be provided.z$FlashAttention version not detected.zEfused output quantization is not yet supported for FlashAttentionImplr   .fp8r   )	q_descale	k_descale	v_descaleqkvoutcu_seqlens_qmax_seqlen_q	seqused_kmax_seqlen_ksoftmax_scalecausalalibi_slopeswindow_sizeblock_tablesoftcapscheduler_metadata
fa_versionrC   rD   rE   
num_splitss_auxcu_query_lensmax_query_lencu_prefix_query_lensprefix_kv_lenssuffix_kv_lens
max_kv_lensliding_windowlogits_soft_capcommon_prefix_lenmax_num_splitsprefix_scheduler_metadatasuffix_scheduler_metadatar   )0vllm_flash_attn_versionNotImplementedErrorfill_	attn_typenum_actual_tokensr   ENCODER_ONLYENCODER_forward_encoder_attentionr   kv_sharing_target_layer_namer   slot_mappingkv_cache_dtype_k_scale_v_scale
startswithr	   get_fp8_dtype_for_flashattnviewuse_cascadequery_start_locseq_lensrY   max_seq_lenrR   rT   shaper   dcp_world_size_forward_with_dcp_q_scaleexpandr^   listr   scalerO   rP   r_   ra   sinksr   rZ   r[   r\   r`   rb   )selfr9   r:   r;   r<   r=   r>   r?   r@   rA   rg   rh   	key_cachevalue_cachedtyperJ   rL   rK   rM   rR   rT   descale_shapesliding_window_sizer   r   r   forwardX   s<  


















	


	
z FlashAttentionDiffKVImpl.forward)NNN)	r.   r/   r0   torchnnModuleTensorr   r   r   r   r   r   r   W   s2    		
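

# A minimal pure-PyTorch sketch (illustrative only; not the Triton kernel
# imported above) of what the packed DiffKV cache write is expected to do:
# scatter each token's K and V rows into the slot given by slot_mapping.
# FP8 scaling and padded slots are ignored; a contiguous NHD layout is assumed.
def _reference_reshape_and_cache_diffkv(key: torch.Tensor, value: torch.Tensor,
                                        kv_cache: torch.Tensor,
                                        slot_mapping: torch.Tensor) -> None:
    num_blocks, block_size, num_kv_heads, _ = kv_cache.shape
    head_size = key.shape[-1]
    # Flatten (num_blocks, block_size) into a single slot dimension.
    flat = kv_cache.view(num_blocks * block_size, num_kv_heads, -1)
    flat[slot_mapping, :, :head_size] = key    # K fills the first head_size
    flat[slot_mapping, :, head_size:] = value  # V fills the remaining entries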
r   )__doc__r   vllm.v1.attention.backendr   #vllm.v1.attention.backends.fa_utilsr   4vllm.v1.attention.ops.triton_reshape_and_cache_flashr   r   vllm.loggerr    vllm.v1.attention.backends.utilsr   
flash_attnr	   r
   r   r   r.   loggerr   r   r   r   r   r   <module>   s   ;