o
    
۾ih                     @   s  d dl mZ d dlmZmZ d dlZd dlZd dlm	Z
 d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZmZmZmZm Z m!Z!m"Z"m#Z# d dl$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 erd dl2m3Z3 ee4Z5dZ6	 G dd deZ7eG dd deZ8ej9dej:dej:dej:dej:fddZ;				 		d1d!ej<d"ej<d#ej<de=d$e=de=d%e>d&ej<dB d'ej<dB fd(d)Z?d*e=fd+d,Z@G d-d. d.e e8 ZAG d/d0 d0e#e8 ZBdS )2    )	dataclass)TYPE_CHECKINGClassVarN)_custom_ops)
VllmConfigget_current_vllm_config)
CacheDType)init_logger)get_mla_dims)current_platform)DeviceCapability)tltriton)AttentionBackendAttentionCGSupportAttentionLayerAttentionMetadataAttentionMetadataBuilderCommonAttentionMetadata
MultipleOfSparseMLAAttentionImpl)#reshape_attn_output_for_spec_decodereshape_query_for_spec_decodesplit_decodes_and_prefillssplit_prefill_chunks)FlashMLASchedMetaflash_mla_sparse_fwdflash_mla_with_kvcacheget_mla_metadata)AttentionSpec)current_workspace_manager)Indexer    c                   @   s&  e Zd ZU dZeed< ejgZe	e
ej  ed< g dZe	e
e  ed< ede
eeB  fddZedefd	d
Zeded fddZeded fddZede
e fddZedefddZedefddZededefddZe	d$dedededededeed f fd!d"Zd#S )%FlashMLASparseBackendTaccept_output_buffersupported_dtypes)autobfloat16
fp8_ds_mlasupported_kv_cache_dtypesreturnc                   C      dgS )N@    r-   r-   r-   b/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/backends/mla/flashmla_sparse.py get_supported_kernel_block_sizesU      z6FlashMLASparseBackend.get_supported_kernel_block_sizesc                   C      dS )NFLASHMLA_SPARSEr-   r-   r-   r-   r.   get_nameY      zFlashMLASparseBackend.get_nameFlashMLASparseMetadataBuilderc                   C      t S N)r5   r-   r-   r-   r.   get_builder_cls]   r4   z%FlashMLASparseBackend.get_builder_clsFlashMLASparseImplc                   C   r6   r7   )r9   r-   r-   r-   r.   get_impl_clsa   r4   z"FlashMLASparseBackend.get_impl_clsc                 C   r+   )Ni@  r-   clsr-   r-   r.   get_supported_head_sizese   r0   z.FlashMLASparseBackend.get_supported_head_sizesc                 C   r1   NTr-   r;   r-   r-   r.   is_mlai   r4   zFlashMLASparseBackend.is_mlac                 C   r1   r>   r-   r;   r-   r-   r.   	is_sparsem   r4   zFlashMLASparseBackend.is_sparse
capabilityc                 C   s
   |j dv S )N)	   
   )major)r<   rA   r-   r-   r.   supports_compute_capabilityq   s   
z1FlashMLASparseBackend.supports_compute_capabilityr&   
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 C   s   |dkr	| |dfS | ||fS )Nr(   i  r-   )rF   rG   rH   rI   rJ   r-   r-   r.   get_kv_cache_shapeu   s   

z(FlashMLASparseBackend.get_kv_cache_shapeN)r&   )__name__
__module____qualname__r$   bool__annotations__torchr'   r%   r   listdtyper)   r   staticmethodintr   r/   strr3   typer8   r:   classmethodr=   r?   r@   r   rE   tuplerK   r-   r-   r-   r.   r#   L   sF   
 
r#   c                   @   s   e Zd ZU eed< eed< eed< eed< ejed< ejed< ejed< ejed< d	Zeed
< dZeed< e	G dd dZ
e	G dd dZdZee
B dB ed< dZeed< dS )FlashMLASparseMetadatanum_reqsmax_query_lenmax_seq_lennum_actual_tokensquery_start_locslot_mappingblock_tablereq_id_per_tokenr,   rG      topk_tokensc                   @   s*   e Zd ZU eed< ejed< ejed< dS )(FlashMLASparseMetadata.FP8KernelMetadatascheduler_metadatadummy_block_table
cache_lensN)rL   rM   rN   r   rP   rQ   Tensorr-   r-   r-   r.   FP8KernelMetadata   s   
 
rj   c                   @   s   e Zd ZU eG dd dZeG dd dZdZeed< dZ	eed< dZ
eed< dZeed	< d
Zed
B ed< d
Zed
B ed< d
S )/FlashMLASparseMetadata.FP8SeparatePrefillDecodec                   @   s   e Zd ZU ded< eed< dS )z6FlashMLASparseMetadata.FP8SeparatePrefillDecode.Decodere   kernel_metadatadecode_query_lenN)rL   rM   rN   rP   rU   r-   r-   r-   r.   Decode   s   
 rn   c                   @   sJ   e Zd ZU ejed< ejed< ejed< eG dd dZee ed< dS )z7FlashMLASparseMetadata.FP8SeparatePrefillDecode.Prefillseq_lensrequest_idsworkspace_startsc                   @   sH   e Zd ZU dZejed< eed< ejed< eed< ejed< eed< dS )	z=FlashMLASparseMetadata.FP8SeparatePrefillDecode.Prefill.ChunkzMetadata for a chunk of prefill requests.

                Prefill requests may be chunked to fit within the fixed workspace size.
                ro   tokens_slicera   req_start_idxrq   chunk_tot_seqlenN)	rL   rM   rN   __doc__rQ   ri   rP   slicerU   r-   r-   r-   r.   Chunk   s   
 


rw   chunksN)	rL   rM   rN   rQ   ri   rP   r   rw   rR   r-   r-   r-   r.   Prefill   s   
 


ry   r   num_prefillsnum_decodesnum_prefill_tokensnum_decode_tokensNdecodeprefill)rL   rM   rN   r   rn   ry   rz   rU   rP   r{   r|   r}   r~   r   r-   r-   r-   r.   FP8SeparatePrefillDecode   s   
  r   Nfp8_extra_metadataFfp8_use_mixed_batch)rL   rM   rN   rU   rP   rQ   ri   rG   rd   r   rj   r   r   r   rO   r-   r-   r-   r.   rZ      s"   
 



/rZ   max_num_blocks_per_req
BLOCK_SIZEBLOCK_NHAS_PREFILLc           "      C   s:  t d}t d}|| t d| }t | | }|||  ||  }t |}|dk }d}|	r=t || }|dk}|| }|| }||k |dk@ }|||
  ||  }|| O }t j||| @ dd}|| | }|	rt j|| |dd}|| } t || |}t |d|}|||  ||  }!t |!| d S )Nr      F)maskother)r   
program_idarangeloadwherestore)"
req_id_ptrblock_table_ptrtoken_indices_ptrout_ptrprefill_request_id_ptrworkspace_starts_ptrr   r   r   r   
bt_stride0
bt_stride1
ti_stride0
ti_stride1out_stride0out_stride1token_idtile_id	indice_idreqti_ptrtokis_invalid_tok
is_prefillprefill_req_idblock_idinblock_offvalid_blockbt_ptrbaseout_valworkspace_startprefill_out
out_ptr_ijr-   r-   r.   )_convert_req_index_to_global_index_kernel   s6   




r   r,   rc      Freq_idra   token_indicesNUM_TOPK_TOKENSHAS_PREFILL_WORKSPACEprefill_workspace_request_idsprefill_workspace_startsc	                 C   sl  | j tjksJ |j tjksJ |j tjksJ |jd |ks!J || dks2J d| d| d|rP|dus:J |dus@J |j tjksHJ |j tjksPJ | jd }	|jd }
|| }|  }| }| }t|}| \}}| \}}| \}}|r|dusJ |dusJ | sJ | sJ |	|f}t| |||||||
||||||||| |S )a:  
    out[token_id, indice_id] =
        block_table[req_id[token_id],
            token_indices[token_id, indice_id] // BLOCK_SIZE] * BLOCK_SIZE
        + token_indices[token_id, indice_id] % BLOCK_SIZE

    Only when token_indices[token_id, indice_id] == -1 do we output -1.
    For safety, we also output -1 if the derived block_id would be
        out-of-bounds.

    When HAS_PREFILL_WORKSPACE is True, prefill tokens are mapped to workspace offsets
    instead of global cache slots. prefill_workspace_request_ids and
    prefill_workspace_starts must be provided.

    prefill_workspace_request_ids: int32 [num_tokens], -1 for decode else
        prefill request index (maps to prefill_workspace_starts)
    prefill_workspace_starts: int32 [num_prefills], 0-indexed workspace
        starts for each prefill request
    r   r   zNUM_TOPK_TOKENS (z ) must be divisible by BLOCK_N ()N)	rS   rQ   int32shape
contiguous
empty_likestrideis_contiguousr   )r   ra   r   r   r   r   r   r   r   
num_tokensr   tiles_per_rowreq_id_cblock_table_ctoken_indices_coutr   r   r   r   r   r   gridr-   r-   r.   (triton_convert_req_index_to_global_index  s^   


r   max_model_lenc                 C   s   | d S )N   r-   )r   r-   r-   r.   get_prefill_workspace_sizen  s   r   c                
   @   s   e Zd ZU ejZee ed< dede	e
 dedejddf
dd	Zd
eddfddZd
eddfddZ	dded
ededefddZdS )r5   _cudagraph_supportkv_cache_speclayer_namesvllm_configdevicer*   Nc                 C   s\  || _ || _|j}|| _|j| _|j}|| _| jddd tj	
|}|j}| j|| _t| j| _t| j| _|jjj| _|jdk| _|jj}	tj|	f| j|tjd| _tj|	f| jj|tjd| _tj|	dftj| jd| _ | j}
t!"dr}|}n	|t#d|
d  }tj|d	ftj|d| _$tj|	d ftj|d| _%tj|jj&ftj|d| _'d S )
Nr   T)supports_spec_as_decoder(   )r   rS   rS   r   d   r,      )(r   r   cache_configr   model_configparallel_configr   _init_reorder_batch_thresholdrQ   cudaget_device_propertiesmulti_processor_countget_num_attention_heads	num_headsr
   mla_dimsr9    _compute_fp8_decode_padded_headsfp8_decode_padded_heads	hf_config
index_topkrd   cache_dtypeuse_fp8_kv_cachescheduler_configmax_num_seqsfullr   topk_tokens_tensorr   max_model_len_tensoremptyrg   r   is_device_capability_familymaxtile_scheduler_metadata_buffernum_splits_buffermax_num_batched_tokensreq_id_per_token_buffer)selfr   r   r   r   r   r   propssm_countr   h_qmax_num_sm_partsr-   r-   r.   __init__{  s`   

z&FlashMLASparseMetadataBuilder.__init__common_attn_metadatare   c                 C   s\   |j }| j}t| jdd || | j|ddd\}}tj|| jdd | jdd d}|S )zBuild FP8 metadata treating all tokens as one mixed batch.

        This matches main branch's approach and avoids the BF16 prefill kernel
        which has head padding overhead when num_heads is small (high TP case).
        Nr   Tcache_seqlensnum_q_tokens_per_head_ktopknum_heads_qnum_heads_kis_fp8_kvcache)rf   rh   rg   )	r^   r   r   r   rd   rZ   rj   r   rg   )r   r   r   padded_headsrf   _fp8_metadatar-   r-   r.   _build_fp8_mixed_decode_prefill  s    	
	z=FlashMLASparseMetadataBuilder._build_fp8_mixed_decode_prefillrk   c           '      C   s  |j }t|| jp	ddd\}}}}tj}|||||d}d }	d }
d }d }|dkr|j }|j}|j}||d  }||d  }	tj	|fdtj
| jd}
t|D ]}|| }|| }||d  }||
||< qPtj|tj
dd}tj|d d dd	|dd < tj|tj
| jd}t| jjj}t||}g }|D ]W\}}||  }|||  |8  < |	|| }|||  }|||   }|||   }t||}||| } |j|| ||  }!||jj|||!|| |d
 q|j|dd |j|	|
||d|_|dkrE|j}|d |d   }"| j}#t| j d | |"|# | j!|#ddd\}$}%tj"|$| j#d | | j$d | d}&|j%|&|"d|_&|S )Nr   T)decode_thresholdrequire_uniform)r{   rz   r}   r|   r   r   r   )rS   
pin_memorydim)ro   rr   ra   rs   rq   rt   non_blocking)ro   rp   rq   rx   r   )rf   rg   rh   )rl   rm   )'r^   r   reorder_batch_thresholdrZ   r   ro   cpuquery_start_loc_cpurQ   r   r   r   rangezeroscumsumr   r   r   r   r   r   itemsumrv   block_table_tensorappendry   rw   copy_r   r   r   r   rd   rj   rg   r   rn   r~   )'r   r   r   r{   rz   r}   r|   FP8Metar   prefill_seq_lensprefill_request_idr   prefill_chunksseq_lens_cpuro   r  prefill_seq_lens_cpureq_idxglobal_req_idxreq_query_startreq_query_endprefill_workspace_starts_cpumax_prefill_buffer_sizechunk_boundschunk_start	chunk_endoffsetchunk_seq_lensrt   token_start	token_endrr   chunk_workspace_startschunk_block_tablerm   r   rf   r   kernel_metar-   r-   r.   "_build_fp8_separate_prefill_decode  s   
	





	z@FlashMLASparseMetadataBuilder._build_fp8_separate_prefill_decodeFcommon_prefix_len
fast_buildc                 C   s   |}|j }tj|jtjd}t|}ttj|jd tjd|}| j	
d | j	d |jd  jt|dd | j	d | }d }	| jtk }
| jrX|
rS| |}	n| |}	t|j|j|j|j |j|j|j|| jj| j|	|
d}|S )N)rS   r   Tr  )r[   r\   r]   r^   r_   r`   ra   rb   rG   rd   r   r   )r^   npasarrayr  r   diffrepeatr   r   r   fill_r  rQ   
from_numpyr   MIN_HEADS_FOR_BF16_PREFILLr   r   r'  rZ   r[   r\   r]   r_   r`   r  r   rG   rd   )r   r(  r   r)  cmr   startsseg_lengthsrb   r   r   metadatar-   r-   r.   build  sD   



z#FlashMLASparseMetadataBuilder.build)F)rL   rM   rN   r   UNIFORM_BATCHr   r   rP   r   rR   rV   r   rQ   r   r   r   r   r'  rU   rO   rZ   r5  r-   r-   r-   r.   r5   x  sD   
 
Q
 
 r5   c                   @   s  e Zd ZededefddZ		d&dedededed	ee dB d
edB dededB dededB de	j
dB ddddfddZde	j
de	j
de	j
dede	j
f
ddZde	j
de	j
de	j
dede	j
f
ddZde	j
de	j
de	j
dede	j
f
ddZde	j
de	j
de	j
dejdee	j
e	j
f f
dd Zde	j
de	j
de	j
de	j
fd!d"Zde	j
ee	j
e	j
f B de	j
ded#edee	j
e	j
dB f f
d$d%ZdS )'r9   r   r*   c                 C   s   | dkrdS dS )Nr,   r   r-   )r   r-   r-   r.   r     s   z3FlashMLASparseImpl._compute_fp8_decode_padded_headsNrI   scalerH   alibi_slopessliding_windowkv_cache_dtypelogits_soft_cap	attn_typekv_sharing_target_layer_nametopk_indice_bufferindexerzIndexer | Nonec                 K   s   || _ || _t|| _|| _|| _|d | _|| _|d usJ |j| _t	
dr*dnd| _| || _|dkr^t }|d urC|jd usEJ t|jj}||f| _t | jtjf\| _d S d S )Nkv_lora_rankr   r   r,   r(   )r   rI   floatr7  rH   r:  r@  softmax_scaletopk_indices_bufferr   r   prefill_paddingr   r   r   r   r   r   prefill_workspace_shaper    get_simultaneousrQ   r'   prefill_bf16_workspace)r   r   rI   r7  rH   r8  r9  r:  r;  r<  r=  r>  r?  mla_argsr   prefill_workspace_sizer-   r-   r.   r     s0   




zFlashMLASparseImpl.__init__qkv_c_and_k_pe_cachetopk_indicesattn_metadatac                 C   s,   t |j|j||j|jd d}| |||S )Nr   r   r   )r   rb   ra   rG   r   _bf16_flash_mla_kernel)r   rJ  rK  rL  rM  r-   r-   r.   _forward_bf16_kv  s   	z#FlashMLASparseImpl._forward_bf16_kvc              
      s  |j  t tjsJ  jd }d }d} jd ur# jj} jj}d}t|j	|j
||j|jd |||d}|j  t tjs@J dtjdtjdtjf fdd	} j}	 j}
|	d
kro|
d
kro jd ushJ |||}|S |j|jjjf|j|jd}|	d
kr||d |	 |d |	 |d |	<  jd usJ  jjD ].}jd |j }t||j
|j|jt|j
 ||j }||j } |||||j< q|S )NFTr   )r   r   r   r   r   rJ  rL  r*   c                    sR   t | } | jd }||d} jd usJ j| | jjd\}}t|S )Nr   r   rJ  rK  rL  rl   )r   r   viewr~   _fp8_flash_mla_kernelrl   r   )rJ  rL  seq_lenattn_outr   r   rK  r{   r   r-   r.   _fp8_decode   s   


zOFlashMLASparseImpl._forward_fp8_kv_separate_prefill_decode.<locals>._fp8_decoder   r   )!r   
isinstancerZ   r   r{   r   rp   rq   r   rb   ra   rG   r   rQ   ri   r}   r|   r~   	new_emptyr^   r   r@  rS   r   rx   rG  rt   ops$cp_gather_and_upconvert_fp8_kv_cachero   lenrr   rO  )r   rJ  rK  rL  rM  prefill_request_idsr   has_prefill_workspacerW  r}   r|   rU  chunkchunk_workspacechunk_qchunk_topk_indices_workspacer-   rV  r.   '_forward_fp8_kv_separate_prefill_decode  sr   
&
#
	
z:FlashMLASparseImpl._forward_fp8_kv_separate_prefill_decodec                 C   sp   t |j|j||j|jd d}|jdusJ t|jtjsJ |j}| j	|
d||
d|d\}}|dS )a  Mixed batch FP8 forward path that treats all tokens as one batch.

        This is equivalent to main branch's approach and avoids the BF16
        prefill kernel which has head padding overhead when num_heads is small.
        Used when use_mixed_batch is True.
        r   rN  Nr   rQ  )r   rb   ra   rG   r   r   rX  rZ   rj   rS  	unsqueezesqueeze)r   rJ  rK  rL  rM  r   	_attn_outr   r-   r-   r.   _forward_fp8_kv_mixed_batch^  s&   

z.FlashMLASparseImpl._forward_fp8_kv_mixed_batchrl   c           
      C   s   | d}| j}||k r<td| d| d || d| d|| df}||d d d d d |d d f< |}t||tj	d|j
d	|j|jd
|| jd	\}}	||k rj|d d d d d |d d f }||	fS )N   Padding num_heads from  to z for FP8 sparse decode kernelr   r      i   T)	rJ  k_cachera   
head_dim_vr   tile_scheduler_metadatar   indicesrB  )sizer   loggerwarning_once	new_zerosr   rR  rQ   uint8rd  rg   rh   rf   rB  )
r   rJ  rK  rL  rl   actual_num_headspadded_num_headsq_paddedr   lser-   r-   r.   rS    s2   
$ 
 z(FlashMLASparseImpl._fp8_flash_mla_kernelc                 C   s   |j d }|dd|j d }| j| j dkrN| j| j dks!J td| j d| j d ||j d | j|j d f}||d d d | jd d f< |}||dd}t|||| jd }|d d d | jd d f }|S )Nr   r   r   ri  rj  z for BF16 sparse prefill kernelrh  )	r   rR  r   rD  rr  rs  rY  r   rB  )r   rJ  rK  rL  r   rx  outputr-   r-   r.   rO    s,   


z)FlashMLASparseImpl._bf16_flash_mla_kernellayerc           	      C   s   t |trtj|dd}|jd }| jd usJ | jd | }| jdk}|s2| ||||}|d fS |jrA| 	||||}|d fS | 
||||}|d fS )Nr   r  r   r(   )rX  rY   rQ   catr   rC  r:  rP  r   rg  rc  )	r   rJ  rK  rM  r{  num_actual_toksrL  use_fp8_cacherU  r-   r-   r.   forward_mqa  s(   


zFlashMLASparseImpl.forward_mqa)NN)rL   rM   rN   rT   rU   r   rA  rR   rV   rQ   ri   r   rZ   rP  rc  rg  rj   rY   rS  rO  r   r  r-   r-   r-   r.   r9     s    
	

.

e
'
'
r9   )r,   rc   r   FNN)Cdataclassesr   typingr   r   numpyr*  rQ   vllmr   rZ  vllm.configr   r   vllm.config.cacher   vllm.loggerr	   2vllm.model_executor.layers.attention.mla_attentionr
   vllm.platformsr   vllm.platforms.interfacer   vllm.triton_utilsr   r   vllm.v1.attention.backendr   r   r   r   r   r   r   r    vllm.v1.attention.backends.utilsr   r   r   r   vllm.v1.attention.ops.flashmlar   r   r   r   vllm.v1.kv_cache_interfacer   vllm.v1.worker.workspacer    &vllm.model_executor.models.deepseek_v2r!   rL   rr  r0  r#   rZ   jit	constexprr   ri   rU   rO   r   r   r5   r9   r-   r-   r-   r.   <module>   s   (
9I	
F	
\
  ;