o
    
۾i                  3   @   s  d Z ddlZddlmZ ddlmZ ddlZddlZddl	m
Z
 ddlmZmZmZmZmZ ddlmZmZmZ ddlmZ dd	lmZ e rSdd
lmZmZmZmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddlm-Z-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 e&e5Z6G dd deZ7eG dd dZ8dede9e:e;e;f dB  fddZ<G dd de.e8 Z=G dd  d eZ>d!e;d"ej?d#e;d$e;d%e@d&e@d'e@d(e;d)e;de@fd*d+ZA						dEd,ejBd-ejBd.ejBd/ejBd0ejBd1e;d2ejBd3ejBd4ejBd5e;d6eCd7ejBdB d8e:e;e;f d9eCd:ejBd!e;d;e;d<e;d=ejBdB d>ejBdB d?ejBdB d@ejBdB dAejBdB dBejBdB dejBf2dCdDZDdS )Fz$Attention layer with FlashAttention.    N)	dataclass)ClassVar)	Attention)AttentionBackendAttentionImplAttentionType
MultipleOfis_quantized_kv_cache)flash_attn_supports_fp8get_flash_attn_version#is_flash_attn_varlen_func_available)cp_lse_ag_out_rs)merge_attn_states)flash_attn_supports_sinksflash_attn_varlen_funcget_scheduler_metadatareshape_and_cache_flash)
VllmConfigget_current_vllm_configget_layers_from_vllm_config)
CacheDTypeget_dcp_group)init_logger)vllm_is_batch_invariantDeviceCapability)cdiv)AttentionCGSupportAttentionMetadataBuilderCommonAttentionMetadata)get_dcp_local_seq_lensget_kv_cache_layout)AttentionSpecc                   @   s  e Zd ZU dZeed< ejejgZ	e
eej  ed< edeeeB  fddZdZeed< edefd	d
ZededefddZeded fddZeded fddZe	d4dedededededeedf fddZe	d5dedeedf fddZed edejfd!d"Zededefd#d$Zed ed%B defd&d'Zedefd(d)Z ed*e!defd+d,Z"eded-ejd ed%B ded%B d.ed/ed0ed1e!ded%B fd2d3Z#d%S )6FlashAttentionBackendTaccept_output_buffersupported_dtypesreturnc                  C   sB   t  } | j}| j}|r|jr|jdks|jdkrg dS tdgS )Nfloat32)       @   r)   )r   model_configcache_config	is_hybridmamba_ssm_cache_dtypemamba_cache_dtyper   )vllm_configr,   r-    r2   Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/backends/flash_attn.py get_supported_kernel_block_sizes>   s   


z6FlashAttentionBackend.get_supported_kernel_block_sizesF forward_includes_kv_cache_updatec                   C   s   dS )N
FLASH_ATTNr2   r2   r2   r2   r3   get_nameT      zFlashAttentionBackend.get_name	attn_typec                 C   s   |t jt jt jt jfv S )z,FlashAttention supports all attention types.)r   DECODERENCODERENCODER_ONLYENCODER_DECODER)clsr9   r2   r2   r3   supports_attn_typeX   s   z(FlashAttentionBackend.supports_attn_typeFlashAttentionImplc                   C      t S N)r@   r2   r2   r2   r3   get_impl_clsb   r8   z"FlashAttentionBackend.get_impl_clsFlashAttentionMetadataBuilderc                   C   rA   rB   )rD   r2   r2   r2   r3   get_builder_clsf   r8   z%FlashAttentionBackend.get_builder_clsauto
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 C   s"   |d dkr
t dd| |||fS )Nr)   r   z$Block size must be a multiple of 16.   )
ValueError)rG   rH   rI   rJ   rK   r2   r2   r3   get_kv_cache_shapej   s   z(FlashAttentionBackend.get_kv_cache_shapeinclude_num_layers_dimensionc                 C   sV   t  }|dkr| rdS |dkrd}|S |dkr| rdS |dkr#d}|S td| d)	NNHD)rL   r               )r   rQ   rL   rR   rS   HND)rL   rS   r   rQ   rR   rT   )r   rQ   rR   rL   rS   zUnknown cache layout format .)r"   rM   )rO   cache_layoutstride_orderr2   r2   r3   get_kv_cache_stride_orderv   s   z/FlashAttentionBackend.get_kv_cache_stride_orderkv_cache_dtypec                 C   s   | dv rt jS td|  )N)fp8fp8_e4m3zUnrecognized FP8 dtype: )torchfloat8_e4m3fnrM   )rZ   r2   r2   r3   get_fp8_dtype_for_flashattn   s   z1FlashAttentionBackend.get_fp8_dtype_for_flashattnc                 C   s   |d dko	|dkS )N   r      r2   )r>   rJ   r2   r2   r3   supports_head_size   s   z(FlashAttentionBackend.supports_head_sizeNc                 C   s$   |d u rdS | drt S |dv S )NTr[   )rF   bfloat16)
startswithr
   )r>   rZ   r2   r2   r3   supports_kv_cache_dtype   s
   
z-FlashAttentionBackend.supports_kv_cache_dtypec                 C   s   t  sdS t S )NF)r   r   )r>   r2   r2   r3   supports_sink   s   z#FlashAttentionBackend.supports_sink
capabilityc                 C   s   |t ddkS )Nr`   r   r   )r>   rg   r2   r2   r3   supports_compute_capability   s   z1FlashAttentionBackend.supports_compute_capabilitydtypeuse_mlahas_sink
use_sparsedevice_capabilityc	           	      C   s   |r|t ddk rdS d S )N	   r   z.sink not supported on compute capability < 9.0r   )	r>   rJ   ri   rZ   rH   rj   rk   rl   rm   r2   r2   r3   supports_combination   s   z*FlashAttentionBackend.supports_combination)rF   F)$__name__
__module____qualname__r%   bool__annotations__r]   float16rc   r&   r   listri   staticmethodintr   r4   r5   strr7   classmethodr?   typerC   rE   tuplerN   rY   r_   rb   r   re   rf   r   rh   ro   r2   r2   r2   r3   r$   :   s   
 	

	
r$   c                   @   s   e Zd ZU eed< eed< ejed< eed< ejed< ejed< ejed< eed< eed	< ejd
B ed< ejd
B ed< ejd
B ed< d
Zed
B ed< d
Z	ejd
B ed< d
Z
ejd
B ed< d
Zejd
B ed< dZeed< dZeed< d
S )FlashAttentionMetadatanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappinguse_cascadecommon_prefix_lenNcu_prefix_query_lensprefix_kv_lenssuffix_kv_lensmax_dcp_context_kv_lendcp_context_kv_lensscheduler_metadataprefix_scheduler_metadatar   max_num_splitsTcausal)rq   rr   rs   ry   ru   r]   Tensorrt   r   r   r   r   r   r   r2   r2   r2   r3   r~      s&   
 




r~   r1   r'   c                 C   s@   t  }t| t}| D ]}t|jtsJ ||jj q|S )z<Get the set of all sliding window configs used in the model.)	setr   r   values
isinstanceimplr@   addsliding_window)r1   sliding_window_configslayerslayerr2   r2   r3   _get_sliding_window_configs   s   
r   c                	       s   e Zd ZU e dkrejnejZdZe	e
d< edddddefd	d
Zdedee dedejf fddZ	ddedede	defddZdedejdejdefddZde	fddZ  ZS )rD   rR   Tsupports_update_block_tabler1   r   kv_cache_specr#   r'   c                 C   s   | j S rB   )_cudagraph_support)r>   r1   r   r2   r2   r3   get_cudagraph_support  s   z3FlashAttentionMetadataBuilder.get_cudagraph_supportlayer_namesdevicec                    s,  t  |||| |j| _|j| _|j| _|j| _|j| _| j| j| _| j	| j| _
|j| _| j | _|j| _d| _t dk| _zddlm} | j| _| j| _W n tye   d| _d| _Y nw | jj| _| jj | _| jj| _| jr| jrt j!|j"j#d t j$| j%d| _&| jj'| _d | _(d S )Nr   rR   r   rQ   ri   r   ))super__init__r,   parallel_configr-   compilation_configattention_configget_num_attention_headsnum_heads_qget_num_kv_headsnum_heads_kvri   rZ   get_head_sizeheaddimrH   r   r   aot_schedulevllm.distributed.parallel_stater   
world_sizedcp_world_sizerank_in_groupdcp_rankAssertionErrorcp_kv_cache_interleave_sizecudagraph_modehas_full_cudagraphsuse_full_cuda_graphmax_cudagraph_capture_sizemax_cudagraph_sizer]   zerosscheduler_configmax_num_seqsint32r   r   (flash_attn_max_num_splits_for_cuda_graphaot_sliding_window)selfr   r   r1   r   r   	__class__r2   r3   r     sJ   




	
z&FlashAttentionMetadataBuilder.__init__Fr   common_attn_metadata
fast_buildc                    s  |j }|j}|j}|j}|j}|j}	|j}
|j}|j}j	o |  j
du rMd_
 rMtj}t|dkrB| }|durA|_
nt|dkrMd_	d djr_jdur_|jkr_jt rdd fdd}|dk}d}d}d}d}d}d}jdkr|dd |dd  }|	| }t|jjj}jj }|| d | j }||||||dd	}nB|rtjd|gtjjd
}tj|gtjjd
}|	d| | }|d||||dd	}||||||| dd	}n
|||||	||d	}jr|dur|jd }|jd|< dj|d< jd| }tdi d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|dd|}|S )zu
        fast_build disables AOT scheduling, used when there will be few
        iterations i.e. spec-decode
        Nr   rQ   Fr   c                    s^   j j}|drt|}nj} r-t| ||jj j	j
|||j|jdS d S )Nr[   )
batch_sizemax_seqlen_qmax_seqlen_kr   r   r   cache_seqlens	qkv_dtypecu_seqlens_q	page_sizer   window_size
num_splits)r-   cache_dtyperd   r$   r_   rZ   r   r   r   r   r   rH   r   )r   cu_query_lensr   seqlensr   r   r   r   r   r   r   r2   r3   schedule  s.   

z5FlashAttentionMetadataBuilder.build.<locals>.scheduler   )r   r   r   r   r   r   r   Tr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   )num_reqsr   r   r   r   r   block_table_tensorr   r   r   r   r   r1   lenpopr   r   r   r   r   r!   r   r   r]   tensorr   r   shaper   r~   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   sliding_window_configr   r   r   r   r   r   r   r   query_kv_lensnum_partitionsr   nattn_metadatar2   r   r3   buildI  s  






		
	
z#FlashAttentionMetadataBuilder.buildmetadata	blk_tabler   c                 C   s   t  |}||_||_|S rB   )copyr   r   )r   r   r   r   new_metadatar2   r2   r3   update_block_table  s   
z0FlashAttentionMetadataBuilder.update_block_tablec                 O   s   t |i |S rB   )use_cascade_attention)r   argskwargsr2   r2   r3   r     s   z3FlashAttentionMetadataBuilder.use_cascade_attentionrp   )rq   rr   rs   r   r   ALWAYSUNIFORM_BATCHr   r   rt   ru   r{   r   r#   rw   rz   r   r]   r   r   ry   r    r~   r   r   r   r   __classcell__r2   r2   r   r3   rD      sX   
 
A
 :
rD   c                   @   s  e Zd ZU dZeed< dejddfdedede	dede
e	 dB d	edB d
ede	dB dededB dejdB ddfddZ			d)dejjdejdejdejdejdedejdB dejdB dejdB dejfddZdejjdejdejdejdejddfddZ			d)dejdejdejd ejd!ejdejded"ejdB d#ejdB d$ejdB dejfd%d&Zdejdejdejdejdedejjdejfd'd(ZdS )*r@   Tcan_return_lse_for_decodeN	num_headsrJ   scalerI   alibi_slopesr   rZ   logits_soft_capr9   kv_sharing_target_layer_namesinksr'   c                 C   s0  || _ || _t|| _|| _|d urtj|tjd}|| _|d u r%d| _	n|	t
jkr4|d |d f| _	n|d df| _	|| _|d u rDd}|| _|
| _| j | j | _|	| _t | _t | _t| jrht shtd|| _| jd urt swJ d| jjd |ksJ dd| _| jd ur| jd	k| _d S d
| _d S )N)ri   r   rQ   r   z<FlashAttention does not support fp8 kv-cache on this device.z,Sinks are only supported in FlashAttention 3zLSinks must have the same number of heads as the number of heads in the layerTrR   F)r   rJ   floatr   rI   r]   r   r(   r   r   r   r<   rZ   r   r   num_queries_per_kvr9   r   vllm_flash_attn_versionr   batch_invariant_enabledr	   r
   NotImplementedErrorr   r   r   supports_quant_query_inputsupports_per_head_quant_scales)r   r   rJ   r   rI   r   r   rZ   r   r9   r   r   r2   r2   r3   r     sN   



zFlashAttentionImpl.__init__r   querykeyvaluekv_cacher   outputoutput_scaleoutput_block_scalec
                 C   s  |dusJ d| j dusJ d|dus|	durtd|du r&|dS | j}
|j}|
tjtjfv rN| |d| |d| |d| |d| ||S |	d\}}| j
drkt| j
}||}||}|js$|j}|j}|j}|j}|j}|j}|jd d | jf}|j|}|j|}|j|}| jdkr| j|d| |d| |d| |||d| ||||d
 |S | jdurt| jnd}t d*i d	|d| d
|d|d|d| d|d|d|d|d| j!d|j"d| j#d|d|d| j$d|d| j d|d|d|d|j%d| j& |S t'|d| |d| ||fi d|jd|jd |j(d!|j)d"|j*d#|jd| j!d| j#d$| jd%| j$d|jd&|j+d'|j%d| j d(|j,d)|jd|jd|jd|jd| j& |S )+a  Forward pass with FlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NOTE: FP8 quantization, flash-attn expect the size of
              {q,k,v}_descale to be (num_sequences, num_kv_heads).
              We use torch's .expand() to avoid duplicating values
        NzOutput tensor must be provided.$FlashAttention version not detected.zEfused output quantization is not yet supported for FlashAttentionImplr   r[   rQ   )	q_descale	k_descale	v_descaleqkvoutr   r   	seqused_kr   softmax_scaler   r   r   r   softcapr   
fa_versionr  r  r  r   s_auxr   r   r   r   r   
max_kv_lenr   r   r   r   r   suffix_scheduler_metadatar2   )-r   r   fill_r9   r   r   r<   r;   _forward_encoder_attentionunbindrZ   rd   r$   r_   viewr   r   r   r   r   r   r   r   rI   _q_scaleexpand_k_scale_v_scaler   _forward_with_dcpr   rw   r   r   r   r   r   r   r   cascade_attentionr   r   r   r   r   )r   r   r   r   r   r   r   r   r   r  r9   r   	key_cachevalue_cacheri   r   r
  r   r   r   r   descale_shaper  r  r  sliding_window_sizer2   r2   r3   forwardS  s(  














	


	
zFlashAttentionImpl.forwardr   c              	   C   sD   | j tjtjfv rd S |d\}}t|||||| j|j|j d S )Nr   )	r9   r   r<   r;   r  r   rZ   r  r  )r   r   r   r   r   r   r  r  r2   r2   r3   do_kv_cache_update  s   	z%FlashAttentionImpl.do_kv_cache_updater  r  r  r  r  c                 C   s  | j d us	J d|j}|j}|j}| }t j|dd}| jd ur(t| jnd }t	di d|d|d|dd d|d	|d
|j
d|jd| jddd| jd|d|d| jddd|jd| j d|d|	d|
\}}t||ddt dd\}}|dd }t	di d|d|d|dd d|d	|d|d|d| jd|jd| jd|d| jddd| j d|d|	d|
\}}|j|jksJ |j|jksJ t||||| d S )Nr  rQ   )dimr  r  r  r	  r   r   r
  r   r  r   Fr   r   r   r  return_softmax_lseTr   r  r  r  r  r   )
return_lsecu_seqlens_kr2   )r   r   r   r   
contiguousr   
all_gatherr   rw   r   r   r   r   r   r   r   r   	transposer   r   r   )r   r   r   r   r  r  r   r   r  r  r  r   r   r   query_across_dcpr  context_attn_outcontext_lsecontext_attn_out_corcontext_lse_corquery_attn_out	query_lser2   r2   r3   r    s   	


	
z$FlashAttentionImpl._forward_with_dcpc                 C   s  | j dus	J d| jdrtd|j}|j}|j}	|j}
|jd d | jf}| jdur3t	| jnd}t
di d|d|d	|d
|d|d|d|	d|
d| jddd| jd|d| jd| j d|j|d|j|d|j|d| jrdnd |S  |S )a  Forward pass for encoder attention without KV cache.

        Args:
            query: shape = [num_encoder_tokens, num_heads, head_size]
            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
            output: shape = [num_encoder_tokens, num_heads, head_size]
            attn_metadata: Encoder attention metadata
            layer: The attention layer
        Nr  r[   z3quantization is not supported for encoder attentionr   rQ   r  r  r  r	  r   r$  r   r   r  r   Fr   r   r  r  r  r  r  r   r2   )r   rZ   rd   r   r   r   r   rI   r   rw   r   r   r   r   r  r  r  r  r   )r   r   r   r   r   r   r   r   r$  r   r   r  r  r2   r2   r3   r  q  sr   	
z-FlashAttentionImpl._forward_encoder_attention)NNN)rq   rr   rs   r   rt   ru   r   r:   ry   r   rw   rz   r]   r   r   nnModuler~   r  r   r  r  r2   r2   r2   r3   r@     s   
 
	

G	

 '
*	

Wr@   r   
query_lensnum_query_headsrI   	use_alibiuse_sliding_windowuse_local_attentionnum_smsr   c	                 C   s   | dk rdS |s|s|rdS t |}	|	dk rdS |dkrdS || }
|
dko2| o2| o2t|dk}|s7dS |	}d}d}t| |}|t|| }t||}|| }|	| t|
| }||9 }t||}||k S )zDecide whether to use cascade attention.

    This function 1) checks whether cascade attention is supported with the
    given configuration, and 2) heuristically decides whether using cascade
    attention can improve performance.
    ra   Fr`   rQ   T   )r   npallr   )r   r1  r2  rI   r3  r4  r5  r6  r   r   r   use_flash_decoding
num_tokensq_tile_sizekv_tile_sizenum_prefix_tilescascade_ctascascade_wavescascade_timeflash_decoding_ctasflash_decoding_timer2   r2   r3   r     s>   


r   r   r   r  r  r   r   r   r   r   r  r  r   r   r   r   r   r  r   r  r  r  r  r  c                  C   sh  |d u sJ d|dksJ d|j d }|j d }|| dks"J || }|dks,J |j d d |j d f}tdi d|d	|d
|d|d|d|d|d|
dddt|d|d d d|ddd|d|d|d urx||nd d|d ur||nd d|d ur||nd d|dt rdn|\}}|j d d |j d f}tdi d|d	|d
|d|d|d|d|	| d|
dddt|d|d d |d f d|ddd|d|d|d ur||nd d|d ur||nd d|d ur||nd dt rdn|\}}t| |||| d S \}}t| |||| d S )Nz)Cascade attention does not support ALiBi.r   z2Cascade attention does not support sliding window.r   rQ   r  r  r  r   r
  r   r   r  r   Fr   r   r  r"  Tr   r  r  r  r  r  r   r2   )r   r   rw   r  r   r   ) r   r   r  r  r   r   r   r   r   r  r  r   r   r   r   r   r   r  r   r  r  r  r  r  r;  rH   num_common_kv_blocksr  prefix_output
prefix_lsesuffix_output
suffix_lser2   r2   r3   r    s   


	
	
r  )NNNNNN)E__doc__r   dataclassesr   typingr   numpyr8  r]   $vllm.model_executor.layers.attentionr   vllm.v1.attention.backendr   r   r   r   r	   #vllm.v1.attention.backends.fa_utilsr
   r   r   vllm.v1.attention.ops.commonr   'vllm.v1.attention.ops.merge_attn_statesr   r   r   r   r   vllm.configr   r   r   vllm.config.cacher   r   r   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platforms.interfacer   vllm.utils.math_utilsr   r   r   r     vllm.v1.attention.backends.utilsr!   r"   vllm.v1.kv_cache_interfacer#   rq   loggerr$   r~   r   r}   ry   r   rD   r@   ndarrayrt   r   r   r   r  r2   r2   r2   r3   <module>   s   $
  )   '	

a	

