from typing import ClassVar

import torch
from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla

from vllm.config.cache import CacheDType
from vllm.logger import init_logger
from vllm.model_executor.layers.attention.mla_attention import (
    MLACommonBackend,
    MLACommonImpl,
    MLACommonMetadata,
    MLACommonMetadataBuilder,
    QueryLenSupport,
)
from vllm.platforms.interface import DeviceCapability
from vllm.v1.attention.backend import (
    AttentionCGSupport,
    AttentionLayer,
    AttentionType,
    MultipleOf,
)
from vllm.v1.attention.backends.utils import KVCacheLayoutType

logger = init_logger(__name__)

# Workspace handed to the FlashInfer TRT-LLM MLA decode kernel (128 MiB).
FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024


class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
    # CUDA graphs are only captured for uniform, decode-only batches.
    _cudagraph_support: ClassVar[AttentionCGSupport] = (
        AttentionCGSupport.UNIFORM_BATCH
    )
    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM


class FlashInferMLABackend(MLACommonBackend):
    supported_dtypes: ClassVar[list[torch.dtype]] = [
        torch.float16,
        torch.bfloat16,
    ]
    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
        "auto",
        "bfloat16",
        "fp8",
        "fp8_e4m3",
    ]

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        return [32, 64]

    @staticmethod
    def get_name() -> str:
        return "FLASHINFER_MLA"

    @staticmethod
    def get_impl_cls() -> type["FlashInferMLAImpl"]:
        return FlashInferMLAImpl

    @staticmethod
    def get_builder_cls() -> type["FlashInferMLAMetadataBuilder"]:
        return FlashInferMLAMetadataBuilder

    @classmethod
    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
        # The TRT-LLM MLA decode kernel targets SM 10.x (Blackwell) GPUs.
        return capability.major == 10

    @classmethod
    def supports_combination(
        cls,
        head_size: int,
        dtype: torch.dtype,
        kv_cache_dtype: CacheDType | None,
        block_size: int,
        use_mla: bool,
        has_sink: bool,
        use_sparse: bool,
        device_capability: DeviceCapability,
    ) -> str | None:
        from vllm.config import get_current_vllm_config

        vllm_config = get_current_vllm_config()
        if vllm_config.model_config is not None:
            hf_text_config = vllm_config.model_config.hf_text_config
            qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 128)
            if qk_nope_head_dim != 128:
                return (
                    "FlashInfer MLA kernel requires qk_nope_head_dim == 128, "
                    f"but got {qk_nope_head_dim}"
                )
        return None

    @classmethod
    def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None":
        return "HND"


# Persistent workspace shared by every FlashInferMLAImpl instance; allocated
# once at import time on the current CUDA device.
g_fi_workspace = torch.zeros(
    FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE,
    dtype=torch.uint8,
    device="cuda",
)


class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: list[float] | None,
        sliding_window: int | None,
        kv_cache_dtype: str,
        logits_soft_cap: float | None,
        attn_type: str,
        kv_sharing_target_layer_name: str | None,
        **mla_args,
    ) -> None:
        super().__init__(num_heads, head_size, scale, num_kv_heads,
                         alibi_slopes, sliding_window, kv_cache_dtype,
                         logits_soft_cap, attn_type,
                         kv_sharing_target_layer_name, **mla_args)

        unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
        if any(unsupported_features):
            raise NotImplementedError(
                "FlashInferMLAImpl does not support one of the following: "
                "alibi_slopes, sliding_window, logits_soft_cap"
            )

        if attn_type != AttentionType.DECODER:
            raise NotImplementedError(
                "Encoder self-attention and encoder/decoder cross-attention "
                "are not implemented for FlashInferMLAImpl"
            )

        self._workspace_buffer = g_fi_workspace
        # Fused-kernel scale factors; resolved lazily from the layer's
        # quantization scales on the first forward call.
        self.bmm1_scale: float | None = None
        self.bmm2_scale: float | None = None

    def forward_mqa(
        self,
        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: MLACommonMetadata,
        layer: AttentionLayer,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        assert kv_c_and_k_pe_cache.numel() > 0
        assert attn_metadata.decode is not None

        # The MLA query may arrive split into its no-PE and RoPE parts.
        if isinstance(q, tuple):
            q_nope, q_pe = q
            q = torch.cat([q_nope, q_pe], dim=-1)

        if attn_metadata.num_decode_tokens % attn_metadata.num_decodes != 0:
            logger.warning_once(
                "FlashInferMLAImpl got a query of uneven length. "
                "This usually indicates an issue in batch reordering "
                "or incorrect setup in dummy_run."
            )
            q = q.unsqueeze(1)
        else:
            q = q.view(attn_metadata.num_decodes, -1, q.shape[-2], q.shape[-1])

        if self.bmm1_scale is None:
            self.bmm1_scale = (
                layer._q_scale_float * layer._k_scale_float * self.scale
            )
        if self.bmm2_scale is None:
            self.bmm2_scale = layer._v_scale_float

        o = trtllm_batch_decode_with_kv_cache_mla(
            query=q,
            kv_cache=kv_c_and_k_pe_cache.unsqueeze(1),
            workspace_buffer=self._workspace_buffer,
            qk_nope_head_dim=self.qk_nope_head_dim,
            kv_lora_rank=self.kv_lora_rank,
            qk_rope_head_dim=self.qk_rope_head_dim,
            block_tables=attn_metadata.decode.block_table,
            seq_lens=attn_metadata.decode.seq_lens,
            max_seq_len=attn_metadata.max_seq_len,
            bmm1_scale=self.bmm1_scale,
            bmm2_scale=self.bmm2_scale,
        )

        o = o.view(-1, o.shape[-2], o.shape[-1])
        return o, None