from typing import ClassVar

import torch

from vllm.config.cache import CacheDType
from vllm.logger import init_logger
from vllm.model_executor.layers.attention.mla_attention import (
    MLACommonBackend, MLACommonImpl, MLACommonMetadata)
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.platforms.interface import DeviceCapability
from vllm.v1.attention.backend import (AttentionLayer, AttentionType,
                                        is_quantized_kv_cache)
from vllm.v1.attention.ops.triton_decode_attention import decode_attention_fwd

logger = init_logger(__name__)


class TritonMLABackend(MLACommonBackend):
    supported_dtypes: ClassVar[list[torch.dtype]] = [
        torch.float16, torch.bfloat16
    ]
    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto", "bfloat16"]

    @staticmethod
    def get_name() -> str:
        return "TRITON_MLA"

    @staticmethod
    def get_impl_cls() -> type["TritonMLAImpl"]:
        return TritonMLAImpl

    @classmethod
    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
        return True


class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
    can_return_lse_for_decode: bool = True

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: list[float] | None,
        sliding_window: int | None,
        kv_cache_dtype: str,
        logits_soft_cap: float | None,
        attn_type: str,
        kv_sharing_target_layer_name: str | None,
        **mla_args,
    ) -> None:
        super().__init__(num_heads, head_size, scale, num_kv_heads,
                         alibi_slopes, sliding_window, kv_cache_dtype,
                         logits_soft_cap, attn_type,
                         kv_sharing_target_layer_name, **mla_args)

        unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap]
        if any(unsupported_features):
            raise NotImplementedError(
                "TritonMLAImpl does not support one of the following: "
                "alibi_slopes, sliding_window, logits_soft_cap")

        if attn_type != AttentionType.DECODER:
            raise NotImplementedError(
                "Encoder self-attention and encoder/decoder cross-attention "
                "are not implemented for TritonMLAImpl")

        if is_quantized_kv_cache(self.kv_cache_dtype):
            raise NotImplementedError(
                "TritonMLA V1 with FP8 KV cache not yet supported")

    def _flash_attn_varlen_diff_headdims(
        self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs
    ):
        return super()._flash_attn_varlen_diff_headdims(
            q, k, v,
            return_softmax_lse=return_softmax_lse,
            softmax_scale=softmax_scale,
            **kwargs,
        )

    def forward_mqa(
        self,
        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
        kv_c_and_k_pe_cache: torch.Tensor,
        attn_metadata: MLACommonMetadata,
        layer: AttentionLayer,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        assert kv_c_and_k_pe_cache.numel() > 0
        assert attn_metadata.decode is not None

        if self.kv_cache_dtype.startswith("fp8"):
            raise NotImplementedError("FP8 Triton MLA not yet supported")

        # The query may arrive as (nope, rope) halves; fuse them for the kernel.
        if type(q) is tuple:
            q = torch.cat(q, dim=-1)
        assert isinstance(q, torch.Tensor)

        B = q.shape[0]
        q_num_heads = q.shape[1]
        o = torch.zeros(B, q_num_heads, self.kv_lora_rank,
                        dtype=q.dtype, device=q.device)
        lse = torch.zeros(B, q_num_heads, dtype=q.dtype, device=q.device)

        # A single KV split keeps the reduction order deterministic for
        # batch-invariant runs; otherwise use a small heuristic split count.
        num_kv_splits = 1 if vllm_is_batch_invariant() else 4

        # Scratch buffer for per-split partial outputs and log-sum-exps
        # (the extra slot in the last dim mirrors the sglang kernel layout).
        attn_logits = torch.empty(
            (B, q_num_heads, num_kv_splits, self.kv_lora_rank + 1),
            dtype=torch.float32, device=q.device)

        # Add a head dim of 1; the latent part of the cache acts as the values.
        kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.unsqueeze(2)
        kv_c_cache = kv_c_and_k_pe_cache[..., : self.kv_lora_rank]
        PAGE_SIZE = kv_c_and_k_pe_cache.size(1)

        # Run the Triton MQA decode kernel over the paged KV cache.
        decode_attention_fwd(q, kv_c_and_k_pe_cache, kv_c_cache, o, lse,
                             attn_metadata.decode.block_table,
                             attn_metadata.decode.seq_lens, attn_logits,
                             num_kv_splits, self.scale, PAGE_SIZE)

        return o, lse
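

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the backend above): the split-K reduction
# that `num_kv_splits` and the `attn_logits` scratch buffer support.
# `decode_attention_fwd` partitions each sequence's KV cache into
# `num_kv_splits` chunks, computes a partial attention output and a
# log-sum-exp (LSE) per chunk, and merges the partials into the final output.
# The reference below reproduces that merge with plain PyTorch; the function
# name, argument names, and shapes are assumptions for illustration only, not
# the Triton kernel's actual interface or memory layout.
# ---------------------------------------------------------------------------
def _reference_split_kv_attention(
    q: torch.Tensor,    # [num_heads, head_dim]
    k: torch.Tensor,    # [seq_len, head_dim]
    v: torch.Tensor,    # [seq_len, v_dim]
    scale: float,
    num_kv_splits: int,
) -> torch.Tensor:
    seq_len = k.shape[0]
    chunk = (seq_len + num_kv_splits - 1) // num_kv_splits

    partial_out = []    # per-split value sums, each normalized within its split
    partial_lse = []    # per-split log-sum-exp of the attention logits
    for start in range(0, seq_len, chunk):
        k_i = k[start:start + chunk]
        v_i = v[start:start + chunk]
        logits = (q @ k_i.T) * scale                  # [num_heads, chunk]
        lse_i = torch.logsumexp(logits, dim=-1)       # [num_heads]
        weights = torch.exp(logits - lse_i[:, None])  # softmax within the split
        partial_out.append(weights @ v_i)             # [num_heads, v_dim]
        partial_lse.append(lse_i)

    # Merge: re-weight each split's output by its share of the global softmax.
    lse_global = torch.logsumexp(torch.stack(partial_lse, dim=0), dim=0)
    out = torch.zeros_like(partial_out[0])
    for o_i, lse_i in zip(partial_out, partial_lse):
        out += torch.exp(lse_i - lse_global)[:, None] * o_i
    return out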