o
    .i                     @   s  d dl Z d dlmZ d dlmZ eeZe r,zd dlZ	dZ
W n ey+   dZ
Y nw dZ
e rFzd dlZ	dZW n eyE   dZY nw dZdeeedB f fddZdeeedB f fd	d
ZdeeedB f fddZdd Ze d  rd dlmZmZmZmZmZmZmZ nG dd dZeZeZeZeZeZeZde jdededee je jf fddZ				d#de jde jde jde jdede jde jde dB dede jdB d e jdB dee je jf fd!d"Z!dS )$    N)init_logger)current_platformTFreturnc                   C   s   t sdS tsdS dS )N)Fzvllm._flashmla_C is not available, likely was not compiled due to insufficient nvcc version or a supported arch was not in the list of target arches to compile for.)FzZvllm._flashmla_extension_C is not available, likely was not compiled due to a build error.TN)_flashmla_C_AVAILABLE_flashmla_extension_C_AVAILABLE r   r   [/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/attention/ops/flashmla.py_is_flashmla_available!   s
   r
   c                  C   s(   t  \} }| sd|fS tdsdS dS )C
    Return: is_supported_flag, unsupported_reason (optional).
    FZ   )Fz3FlashMLA Dense is only supported on Hopper devices.r   r
   r   is_device_capability_familyis_availblemaybe_reasonr   r   r	   is_flashmla_dense_supported3   s   

r   c                  C   s2   t  \} }| sd|fS tdstdsdS dS )r   Fr   d   )FzBFlashMLA Sparse is only supported on Hopper and Blackwell devices.r   r   r   r   r   r	   is_flashmla_sparse_supported?   s   
r   c                  O   s   t  \}}t|p	d)NzFlashMLA is not available)r
   RuntimeError)_args_kwargs_reasonr   r   r	   _raise_flashmla_unavailableQ   s   
r   )FlashMLASchedMetaflash_attn_varlen_funcflash_attn_varlen_kvpacked_func flash_attn_varlen_qkvpacked_funcflash_mla_sparse_fwdflash_mla_with_kvcacheget_mla_metadatac                   @   s   e Zd ZdS )r   N)__name__
__module____qualname__r   r   r   r	   r   b   s    r   cache_seqlensnum_q_tokens_per_head_knum_heads_kc                 C   s"   t  d st  tjj| ||S )Nr   )r
   r   torchops_flashmla_extension_C#get_mla_decoding_metadata_dense_fp8)r%   r&   r'   r   r   r	   get_mla_metadata_dense_fp8m   s   
r,   qk_cacheblock_table
head_dim_vtile_scheduler_metadata
num_splitssoftmax_scalecausal	descale_q	descale_kc                 C   sT   t  d st  |d u r| jd d }tjj| |||||||||	|
\}}||fS )Nr   g      )r
   r   shaper(   r)   r*   fwd_kvcache_mla_fp8)r-   r.   r/   r%   r0   r1   r2   r3   r4   r5   r6   outsoftmax_lser   r   r	   flash_mla_with_kvcache_fp8{   s$   
r<   )NFNN)"r(   vllm.loggerr   vllm.platformsr   r"   loggeris_cudavllm._flashmla_Cvllmr   ImportErrorvllm._flashmla_extension_Cr   tupleboolstrr
   r   r   r   -vllm.third_party.flashmla.flash_mla_interfacer   r   r   r   r   r    r!   Tensorintr,   floatr<   r   r   r   r	   <module>   s   
&
	
