o
    
۾i"                     @   s   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZmZmZmZmZ e
eZG dd	 d	ee ZG d
d deZG dd dZedZdZG dd dee ZdS )    N)ClassVar)
CacheDType)init_logger)MLACommonBackendMLACommonImplMLACommonMetadataMLACommonMetadataBuilder)DeviceCapability)AttentionCGSupportAttentionLayerAttentionType
MultipleOfis_quantized_kv_cachec                   @   s    e Zd ZU ejZee ed< dS )CutlassMLAMetadataBuilder_cudagraph_supportN)__name__
__module____qualname__r
   UNIFORM_SINGLE_TOKEN_DECODEr   r   __annotations__ r   r   ^/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/backends/mla/cutlass_mla.pyr      s   
 r   c                   @   s   e Zd ZU ejejgZeeej	  e
d< g dZeee  e
d< edeeeB  fddZedefddZeded	 fd
dZeded fddZededefddZdS )CutlassMLABackendsupported_dtypes)autobfloat16fp8fp8_e4m3supported_kv_cache_dtypesreturnc                   C   s   dgS )N   r   r   r   r   r    get_supported_kernel_block_sizes.   s   z2CutlassMLABackend.get_supported_kernel_block_sizesc                   C   s   dS )NCUTLASS_MLAr   r   r   r   r   get_name2      zCutlassMLABackend.get_nameCutlassMLAImplc                   C      t S N)r%   r   r   r   r   get_impl_cls6   r$   zCutlassMLABackend.get_impl_clsr   c                   C   r&   r'   )r   r   r   r   r   get_builder_cls:   r$   z!CutlassMLABackend.get_builder_cls
capabilityc                 C   s
   |j dkS )N
   )major)clsr*   r   r   r   supports_compute_capability>   s   
z-CutlassMLABackend.supports_compute_capabilityN)r   r   r   torchfloat16r   r   r   listdtyper   r   r   staticmethodintr   r!   strr#   typer(   r)   classmethodr	   boolr.   r   r   r   r   r   %   s   
 r   c                   @   s.   e Zd Zdd Zdd ZdedefddZd	S )
SM100Workspacec                 C   s8   t j|dt jd| _d| _t jt d}|j| _	d S )Ncuda)devicer2   r    zcuda:0)
r/   emptyuint8_workspace_buf_block_sizer:   get_device_propertiesr;   multi_processor_count	_sm_count)selfinitial_workspace_size
propertiesr   r   r   __init__D   s   zSM100Workspace.__init__c                 C   s   | j S r'   )r>   )rC   r   r   r   get_bufP   s   zSM100Workspace.get_bufattn_metadatanum_kv_splitsc                 C   sJ   |j }|j}tj|| j || j|d}| jjd |k r#| j| d S d S )N)rI   r   )	num_reqsmax_query_lenops$sm100_cutlass_mla_get_workspace_sizer?   rB   r>   shaperesize_)rC   rH   rI   
batch_sizemax_seq_lenworkspace_sizer   r   r   ensure_sizeS   s   zSM100Workspace.ensure_sizeN)r   r   r   rF   rG   r   r4   rS   r   r   r   r   r9   C   s    r9   i   r    c                       s   e Zd ZU dZeed< dededededee dB d	edB d
e	dedB de	de	dB ddf fddZ
dejdejdejdejdejdejdededeejejf fddZdejeejejf B dejdededeejejdB f f
ddZ  ZS ) r%   Tcan_return_lse_for_decode	num_heads	head_sizescalenum_kv_headsalibi_slopesNsliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_namer   c                    s   t  j|||||||||	|
f
dti| |||g}t|r"td|	tjkr+tdtj	dd }|rBt
dt| t|| _nd| _t| _d S )Nq_pad_num_headszcCutlassMLAImpl does not support one of the following: alibi_slopes, sliding_window, logits_soft_capzaEncoder self-attention and encoder/decoder cross-attention are not implemented for CutlassMLAImplFORCE_NUM_KV_SPLITSzForcing num_kv_splits to %d)superrF   	MAX_HEADSanyNotImplementedErrorr   DECODERosenvirongetlogger
debug_oncer4   _num_kv_splitsg_sm100_workspace
_workspace)rC   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   mla_argsunsupported_featuresforce_num_kv_splits	__class__r   r   rF   j   s>   



zCutlassMLAImpl.__init__q_nopeq_pekv_c_and_k_pe_cacheseq_lens
page_table	workspacesm_scalerI   c	                 C   s`  |j dksJ d|j  |j dksJ d|j  |j dks'J d|j |j\}	}
}|j\}}}|	|kr;|
|ks=J |j\}}}d}d}||ksMJ ||ksSJ ||| ks[J d}|
|kskJ d| d	|
 t|jd
kstJ |j\}}||	ksJ |dksJ d| |d|  dksJ |jtjtjtjfv sJ d|j d|j|j  kr|jksJ  J |jtj	ksJ d|j d|jtj	ksJ d|j dt
| jrtjn|j}|j|	||f|d}| jrtj|	|ftj|jdnt }t||||||||||
 |
|k r,| jr |d d d |
f n|}|d d d |
f }||fS )N   z$q_nope must be a 3D tensor, but got z"q_pe must be a 3D tensor, but got z3kv_c_and_k_pe_cache must be a 3D tensor, but got {}i   @   r    zH must be <= z
, but got    r   z&block num must be greater than 0, got z6q_nope.dtype needs to be fp16 or bf16 or e4m3 but got .z)seq_lens.dtype needs to be int32 but got z+page_table.dtype needs to be int32 but got )r2   )r2   r;   )ndimformatrN   lenr2   r/   r0   r   float8_e4m3fnint32r   r[   	new_emptyneed_to_return_lse_for_decoder<   float32r;   TensorrL   sm100_cutlass_mla_decode)rC   rt   ru   rv   rw   rx   ry   rz   rI   B_qHD_q_nopeB_q_2H_2D_q_pe_	PAGE_SIZED_ckvD_latentD_roperc   B_block_table	block_numr2   outlser   r   r   _sm100_cutlass_mla_decode   st   
"
 z(CutlassMLAImpl._sm100_cutlass_mla_decodeqrH   layerc           	   
   C   s   |  dksJ |jd usJ t|tu r|\}}ntj|| j| jgdd\}}| j	|| j
 | ||||jj|jj| j | j| j
\}}|| jrN|fS d fS )Nr   ra   )dim)numeldecoder6   tupler/   splitkv_lora_rankqk_rope_head_dimrn   rS   rl   r   rw   block_tablerG   rW   r   )	rC   r   rv   rH   r   rt   ru   or   r   r   r   forward_mqa   s&   

zCutlassMLAImpl.forward_mqa)r   r   r   rT   r8   r   r4   floatr1   r5   rF   r/   r   r   r   r   r   r   __classcell__r   r   rr   r   r%   g   sn   
 
	
:	

Sr%   ) rg   typingr   r/   vllm._custom_ops_custom_opsrL   vllm.config.cacher   vllm.loggerr   2vllm.model_executor.layers.attention.mla_attentionr   r   r   r   vllm.platforms.interfacer	   vllm.v1.attention.backendr
   r   r   r   r   r   rj   r   r   r9   rm   rc   r%   r   r   r   r   <module>   s    