from dataclasses import dataclass
from typing import ClassVar

import torch

from vllm.config import VllmConfig
from vllm.config.cache import CacheDType
from vllm.logger import init_logger
from vllm.model_executor.layers.attention.mla_attention import (
    MLACommonBackend,
    MLACommonDecodeMetadata,
    MLACommonImpl,
    MLACommonMetadata,
    MLACommonMetadataBuilder,
    QueryLenSupport,
)
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.platforms.interface import DeviceCapability
from vllm.v1.attention.backend import (
    AttentionCGSupport,
    AttentionLayer,
    AttentionType,
    MultipleOf,
)
from vllm.v1.attention.backends.utils import (
    reshape_attn_output_for_spec_decode,
    reshape_query_for_spec_decode,
)
from vllm.v1.attention.ops.flashmla import (
    FlashMLASchedMeta,
    flash_mla_with_kvcache,
    flash_mla_with_kvcache_fp8,
    get_mla_metadata,
    get_mla_metadata_dense_fp8,
    is_flashmla_dense_supported,
)
from vllm.v1.kv_cache_interface import AttentionSpec

logger = init_logger(__name__)
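

# MLA decode backend built on the FlashMLA kernels. FlashMLA operates on
# fixed 64-token KV-cache pages and ships kernels only for Hopper (SM90) and
# Blackwell (SM100) class GPUs, which is what the block-size and
# compute-capability hooks below advertise to the common MLA machinery.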
class FlashMLABackend(MLACommonBackend):
    supported_dtypes: ClassVar[list[torch.dtype]] = [
        torch.float16,
        torch.bfloat16,
    ]
    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
        "auto",
        "bfloat16",
        "fp8",
        "fp8_e4m3",
    ]

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        return [64]

    @staticmethod
    def get_name() -> str:
        return "FLASHMLA"

    @staticmethod
    def get_builder_cls() -> type["FlashMLAMetadataBuilder"]:
        return FlashMLAMetadataBuilder

    @staticmethod
    def get_impl_cls() -> type["FlashMLAImpl"]:
        return FlashMLAImpl

    @classmethod
    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
        return capability.major in (9, 10)

    @classmethod
    def supports_combination(
        cls,
        head_size: int,
        dtype: torch.dtype,
        kv_cache_dtype: CacheDType | None,
        block_size: int,
        use_mla: bool,
        has_sink: bool,
        use_sparse: bool,
        device_capability: DeviceCapability,
    ) -> str | None:
        # Delegate to the kernel-level capability probes; each returns a
        # (supported, reason) tuple and only the reason string is surfaced.
        if use_sparse:
            from vllm.v1.attention.ops.flashmla import is_flashmla_sparse_supported

            return is_flashmla_sparse_supported()[1]
        else:
            from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported

            return is_flashmla_dense_supported()[1]


@dataclass
class FlashMLADecodeMetadata(MLACommonDecodeMetadata):
    scheduler_metadata: FlashMLASchedMeta


@dataclass
class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]):
    pass


class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
    _cudagraph_support: ClassVar[AttentionCGSupport] = (
        AttentionCGSupport.UNIFORM_BATCH
    )
    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM
    reorder_batch_threshold: int = 512

    def __init__(
        self,
        kv_cache_spec: AttentionSpec,
        layer_names: list[str],
        vllm_config: VllmConfig,
        device: torch.device,
    ):
        super().__init__(
            kv_cache_spec, layer_names, vllm_config, device, FlashMLAMetadata
        )
        self.num_q_heads = vllm_config.model_config.get_num_attention_heads(
            vllm_config.parallel_config
        )

        self.cg_buf_tile_scheduler_metadata = None
        self.cg_buf_num_splits = None

        self.is_fp8_kvcache = vllm_config.cache_config.cache_dtype.startswith("fp8")

        device_properties = torch.cuda.get_device_properties(self.device)
        num_sms = device_properties.multi_processor_count

        if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
            # Persistent buffers so the tile schedule can be rewritten in
            # place when replaying full CUDA graphs. The schedule is bounded
            # by the SM count (8 int32 fields per SM part); num_splits is
            # bounded by the max batch size plus the leading cumulative entry.
            self.cg_buf_tile_scheduler_metadata = torch.zeros(
                (num_sms, 8), device=self.device, dtype=torch.int32
            )
            self.cg_buf_num_splits = torch.empty(
                vllm_config.scheduler_config.max_num_seqs + 1,
                device=self.device,
                dtype=torch.int32,
            )

    def _build_decode(
        self,
        block_table_tensor: torch.Tensor,
        seq_lens_device: torch.Tensor,
        max_seq_len: int,
        query_start_loc_cpu: torch.Tensor,
        query_start_loc_device: torch.Tensor,
        num_decode_tokens: int,
        dcp_tot_seq_lens_device: torch.Tensor | None,
    ) -> FlashMLADecodeMetadata:
        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
        max_query_len = query_lens_cpu.max().item()
        num_q_tokens_per_head_k = max_query_len * self.num_q_heads

        scheduler_metadata, _ = get_mla_metadata(
            seq_lens_device,
            num_q_tokens_per_head_k,
            1,
            is_fp8_kvcache=self.is_fp8_kvcache,
        )

        if self.is_fp8_kvcache:
            # The dense-FP8 kernel uses its own scheduling heuristic, so its
            # tile schedule overwrites the one computed above.
            tile_scheduler_metadata, num_splits = get_mla_metadata_dense_fp8(
                seq_lens_device, num_q_tokens_per_head_k, 1
            )
            scheduler_metadata.tile_scheduler_metadata = tile_scheduler_metadata
            scheduler_metadata.num_splits = num_splits

        return FlashMLADecodeMetadata(
            block_table=block_table_tensor,
            seq_lens=seq_lens_device,
            scheduler_metadata=scheduler_metadata,
            dcp_tot_seq_lens=dcp_tot_seq_lens_device,
        )
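

# Decode-side kernel wrapper. Prefill and the MLA weight-absorption logic live
# in MLACommonImpl; the subclass below only validates kernel support and
# dispatches the actual FlashMLA decode call. can_return_lse_for_decode
# signals that the kernel can hand back the log-sum-exp alongside the output,
# which the common path needs when decode attention is split across ranks.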
class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
    can_return_lse_for_decode: bool = True

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: list[float] | None,
        sliding_window: int | None,
        kv_cache_dtype: str,
        logits_soft_cap: float | None,
        attn_type: str,
        kv_sharing_target_layer_name: str | None,
        **mla_args,
    ) -> None:
        super().__init__(
            num_heads,
            head_size,
            scale,
            num_kv_heads,
            alibi_slopes,
            sliding_window,
            kv_cache_dtype,
            logits_soft_cap,
            attn_type,
            kv_sharing_target_layer_name,
            **mla_args,
        )

        is_supported, reason = is_flashmla_dense_supported()
        assert is_supported, reason

        unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
        if any(unsupported_features):
            raise NotImplementedError(
                "FlashMLAImpl does not support one of the following: "
                "alibi_slopes, sliding_window, logits_soft_cap"
            )

        if attn_type != AttentionType.DECODER:
            raise NotImplementedError(
                "Encoder self-attention and encoder/decoder cross-attention "
                "are not implemented for FlashMLAImpl"
            )
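
    # Decode runs as MQA over the compressed KV cache: every query head
    # attends to the single shared latent head stored per token (kv_lora_rank
    # value dims plus the rotary dims), hence the unsqueeze that adds the
    # singleton KV-head axis before calling the kernel.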
zFlashMLAImpl.__init__qkv_c_and_k_pe_cacheattn_metadatalayerc                 C   s  |  dksJ |jd usJ t|tu rtj|dd}t|tjs$J |j}t	||}|jj
}t r| jds|j}tj}|jd }	|jjjd }
d}|
| dks]J d|
 d| |
| }tjd||d	}d|d
< d|d< |	d |d< ||d< d|d< tj|	d f||d	}||_||_| jdrt||d|jj|jj| j|j|j| jd|jd|jdd\}}nt||d|jj|jj| j|| jddd	\}}t|}||fS )Nr   r   )dimr    r$   ztopk (z) must be divisible by )r@   r[   )r8   rZ   )r   r   )r   r@   r@   )r      )r      )r      T)r   k_cacher   cache_seqlens
head_dim_vr   r   softmax_scalecausal	descale_q	descale_kF)	r   r   r   r   r   r   r   r   rg   )numeldecoderM   tuplerF   cat
isinstancer   num_decodesr   rQ   r   r9   rf   rZ   ro   shaper   rn   r   r   r   	unsqueezer   kv_lora_rankr   _q_scalereshape_k_scaler   r   )rs   r   r   r   r   r   rQ   rZ   r8   BtopkB_TOPKend_block_idxr   r   olser%   r%   r&   forward_mqa   sh   

 


zFlashMLAImpl.forward_mqa)rC   rD   rE   r   rO   rI   rK   floatrH   rL   r]   rF   r   r   rR   r   r   r   r%   r%   rv   r&   r/      sH   
 
	
/r/   ).dataclassesr   typingr   rF   vllm.configr   vllm.config.cacher   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r   r	   r
   r   r   *vllm.model_executor.layers.batch_invariantr   vllm.platforms.interfacer   vllm.v1.attention.backendr   r   r   r    vllm.v1.attention.backends.utilsr   r   rA   r   r   r   r   r   r   vllm.v1.kv_cache_interfacer   rC   loggerr   rP   rR   r+   r/   r%   r%   r%   r&   <module>   s*     3O