o
    
۾i/                     @   s4  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ d dlmZmZ d dlm Z  d dl!m"Z"m#Z# e
e$Z%G dd deZ&eG dd deZ'eG dd dee' Z(G dd dee( Z)G dd dee( Z*dS )    )	dataclass)ClassVarN)
VllmConfig)
CacheDType)init_logger)MLACommonBackendMLACommonDecodeMetadataMLACommonImplMLACommonMetadataMLACommonMetadataBuilderQueryLenSupport)vllm_is_batch_invariant)DeviceCapability)AttentionCGSupportAttentionLayerAttentionType
MultipleOfis_quantized_kv_cache)flash_attn_supports_mlaget_flash_attn_version)AttentionSpec)flash_attn_varlen_funcget_scheduler_metadatac                   @   s   e Zd ZU ejejgZeeej	  e
d< ddgZeee  e
d< edeeeB  fddZedefdd	Zeded
 fddZeded fddZededefddZededej	dedB dededededededB fddZdS )FlashAttnMLABackendsupported_dtypesautobfloat16supported_kv_cache_dtypesreturnc                   C   s
   t dgS )N   )r    r    r    `/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/backends/mla/flashattn_mla.py get_supported_kernel_block_sizes3      
z4FlashAttnMLABackend.get_supported_kernel_block_sizesc                   C   s   dS )NFLASH_ATTN_MLAr    r    r    r    r!   get_name7      zFlashAttnMLABackend.get_nameFlashAttnMLAMetadataBuilderc                   C      t S N)r'   r    r    r    r!   get_builder_cls;   r&   z#FlashAttnMLABackend.get_builder_clsFlashAttnMLAImplc                   C   r(   r)   )r+   r    r    r    r!   get_impl_cls?   r&   z FlashAttnMLABackend.get_impl_cls
capabilityc                 C   s
   |j dkS )N	   )major)clsr-   r    r    r!   supports_compute_capabilityC   r#   z/FlashAttnMLABackend.supports_compute_capability	head_sizedtypekv_cache_dtypeN
block_sizeuse_mlahas_sink
use_sparsedevice_capabilityc	           	      C   s   t  sdS d S )Nz/FlashAttention MLA not supported on this device)r   )	r0   r2   r3   r4   r5   r6   r7   r8   r9   r    r    r!   supports_combinationG   s   z(FlashAttnMLABackend.supports_combination)__name__
__module____qualname__torchfloat16r   r   r   listr3   __annotations__r   r   staticmethodintr   r"   strr%   typer*   r,   classmethodr   boolr1   r:   r    r    r    r!   r   ,   sF   
 	
r   c                   @   sF   e Zd ZU ejed< eed< eed< dZejdB ed< dZeed< dS )FlashAttnMLADecodeMetadataquery_start_locmax_query_lenmax_seq_lenNscheduler_metadatar   max_num_splits)	r;   r<   r=   r>   TensorrA   rC   rL   rM   r    r    r    r!   rH   X   s   
 
rH   c                   @   s   e Zd ZdS )FlashAttnMLAMetadataN)r;   r<   r=   r    r    r    r!   rO   a   s    rO   c                       s   e Zd ZU ejZee ed< ej	Z
ee ed< dZeed< dedee dedejf fd	d
Zdd ZdejdejdedejdejdedejdB defddZ  ZS )r'   _cudagraph_supportquery_len_supporti   reorder_batch_thresholdkv_cache_speclayer_namesvllm_configdevicec                    s   |j j}t j||||t|dkd d| _t dk| _| jj	
 | _| jj| _| jrA| jrAtj|jjd tj| jd| _|jj| _t rId| _d S d S )N   )supports_dcp_with_varlenr      )r3   rV   )parallel_configcp_kv_cache_interleave_sizesuper__init__rO   rM   r   fa_aot_schedulecompilation_configcudagraph_modehas_full_cudagraphsuse_full_cuda_graphmax_cudagraph_capture_sizemax_cudagraph_sizer>   zerosscheduler_configmax_num_seqsint32rV   rL   attention_config(flash_attn_max_num_splits_for_cuda_graphr   )selfrS   rT   rU   rV   interleave_size	__class__r    r!   r]   k   s2   


	
z$FlashAttnMLAMetadataBuilder.__init__c                 C   sB   | j rt|||| j| j d| jj|| jj| jj| j	|||dS d S )NrW   )
batch_sizemax_seqlen_qmax_seqlen_knum_heads_qnum_heads_kvheaddimcache_seqlens	qkv_dtype	headdim_v	page_sizecu_seqlens_qcausal
num_splits)
r^   r   	num_headsdcp_world_sizemla_dimsqk_rope_head_dimrS   r3   kv_lora_rankrx   )rk   num_reqscu_query_lensrJ   seqlensrK   rz   rM   r    r    r!   _schedule_decode   s"   

z,FlashAttnMLAMetadataBuilder._schedule_decodeblock_table_tensorseq_lens_devicerK   query_start_loc_cpuquery_start_loc_devicenum_decode_tokensdcp_tot_seq_lens_deviceNr   c              
   C   s   |dd  |d d  }|   }	d}
| jr$| jd ur$|| jkr$| j}
t r)d}
| j|jd ||	||d|
d}| jro|d uro|jd }|| jjd ksZJ d| d| jjd  || jd |< d| j|d < | jd | }t	||||	|||
|d}|S )	NrW   r   T)r   r   rJ   r   rK   rz   rM   zScheduler metadata size z exceeds buffer size )block_tableseq_lensrI   rJ   rK   rL   rM   dcp_tot_seq_lens)
maxitemrb   rd   rM   r   r   shaperL   rH   )rk   r   r   rK   r   r   r   r   query_lens_cpurJ   rM   rL   nmetadatar    r    r!   _build_decode   sP   






z)FlashAttnMLAMetadataBuilder._build_decode)r;   r<   r=   r   UNIFORM_BATCHrP   r   rA   r   VARLENrQ   rR   rC   r   r@   rD   r   r>   rV   r]   r   rN   rH   r   __classcell__r    r    rm   r!   r'   f   s>   
 (	r'   c                       s   e Zd ZU dZeed< dededededee dB d	edB d
e	dedB de	de	dB ddf fddZ
dejeejejf B dejdededeejejdB f f
ddZ  ZS )r+   Tcan_return_lse_for_decoder|   r2   scalenum_kv_headsalibi_slopesNsliding_windowr4   logits_soft_cap	attn_typekv_sharing_target_layer_namer   c                    sv   t  j|||||||||	|
f
i | t sJ d|||g}t|r'td|	tjkr0tdt| jr9tdd S )Nz,FlashAttnMLA is not supported on this devicezeFlashAttnMLAImpl does not support one of the following: alibi_slopes, sliding_window, logits_soft_capzcEncoder self-attention and encoder/decoder cross-attention are not implemented for FlashAttnMLAImplz3FlashAttnMLA V1 with FP8 KV cache not yet supported)	r\   r]   r   anyNotImplementedErrorr   DECODERr   r4   )rk   r|   r2   r   r   r   r   r4   r   r   r   mla_argsunsupported_featuresrm   r    r!   r]      s:   


zFlashAttnMLAImpl.__init__qkv_c_and_k_pe_cacheattn_metadatalayerc                 C   sf  |  dksJ |jd usJ t|tu r|\}}ntj|| j| jgdd\}}| j	dr2t
d|dd | jf }|d| jd f }t|jjd}	tdi d|d	|d
d|d
d|d|	d|jjd|jjd|jjd|jjd| jddd| jddd|jjd|jjd| jd| jd|jj}
| jr|
\}}||ddfS |
}|d fS )Nr   r   )dimfp8z(FP8 FlashAttention MLA not yet supported.rW   r   kvq_vrp   ry   rq   	seqused_kr   softmax_scalerz   Treturn_softmax_lse
fa_versionrY   rL   r{   cp_world_sizecp_rankcp_tot_seqused_kr    )numeldecoderE   tupler>   splitr   r   r4   
startswithr   r   rJ   r   	unsqueezerI   rK   r   r   r   need_to_return_lse_for_decoderL   rM   r}   dcp_rankr   	transpose)rk   r   r   r   r   q_nopeq_pe
kv_c_cache
k_pe_cacherp   attn_outolser    r    r!   forward_mqa(  sl   



	
zFlashAttnMLAImpl.forward_mqa)r;   r<   r=   r   rG   rA   rC   floatr@   rD   r]   r>   rN   r   rO   r   r   r   r    r    rm   r!   r+      sH   
 
	
3r+   )+dataclassesr   typingr   r>   vllm.configr   vllm.config.cacher   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r   r	   r
   r   r   *vllm.model_executor.layers.batch_invariantr   vllm.platforms.interfacer   vllm.v1.attention.backendr   r   r   r   r   #vllm.v1.attention.backends.fa_utilsr   r   vllm.v1.kv_cache_interfacer   vllm.vllm_flash_attnr   r   r;   loggerr   rH   rO   r'   r+   r    r    r    r!   <module>   s,    , 