o
    i2                  
   @   s.  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZmZmZmZ d d	lmZmZ eeZG d
d deZeG dd dZeG dd dZeG dd dZeG dd dZdejdejdejde ejejf fddZ!defddZ"G dd deZ#dS )    )	dataclass)ClassVarN)
VllmConfig)init_logger)current_platform)get_paged_mqa_logits_metadatais_deep_gemm_supported)AttentionBackendAttentionCGSupportAttentionMetadataBuilderCommonAttentionMetadata
MultipleOf)split_decodes_and_prefillssplit_prefill_chunksc                   @   s   e Zd ZedefddZedeeeB  fddZ	e
dee fddZeded fd	d
Ze	ddedededededeedf fddZe	ddedeedf fddZdS )DeepseekV32IndexerBackendreturnc                   C   s   dS )NDEEPSEEK_V32_INDEXER r   r   r   \/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/attention/backends/mla/indexer.pyget_name      z"DeepseekV32IndexerBackend.get_namec                   C   s   t  rdgS dgS )N   @   )r   is_rocmr   r   r   r    get_supported_kernel_block_sizes    s   z:DeepseekV32IndexerBackend.get_supported_kernel_block_sizesc                 C   s   g dS )N)    r      r   )clsr   r   r   get_supported_head_sizes$   s   z2DeepseekV32IndexerBackend.get_supported_head_sizes!DeepseekV32IndexerMetadataBuilderc                   C   s   t S )N)r   r   r   r   r   get_builder_cls(   r   z)DeepseekV32IndexerBackend.get_builder_clsauto
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 C   s   |dksJ | ||fS )Nr   r   )r"   r#   r$   r%   r&   r   r   r   get_kv_cache_shape,   s   
z,DeepseekV32IndexerBackend.get_kv_cache_shapeFinclude_num_layers_dimensionc                 C   s   | rdS dS )N)r   r         )r   r   r)   r   )r(   r   r   r   get_kv_cache_stride_order7   s   z3DeepseekV32IndexerBackend.get_kv_cache_stride_orderN)r!   F)__name__
__module____qualname__staticmethodstrr   listintr   r   classmethodr   typer    tupler'   boolr+   r   r   r   r   r      s>    


r   c                   @   s`   e Zd ZU ejed< ejed< ejed< ejed< ejed< eed< eed< eed< eed	< d
S )&DeepseekV32IndexerPrefillChunkMetadatablock_tablecu_seqlen_kscu_seqlen_kecu_seq_lenstoken_to_seqtotal_seq_lenstoken_start	token_endnum_reqsN)r-   r.   r/   torchTensor__annotations__r3   r   r   r   r   r8   @   s   
 




r8   c                   @   s   e Zd ZU ee ed< dS )!DeepseekV32IndexerPrefillMetadatachunksN)r-   r.   r/   r2   r8   rD   r   r   r   r   rE   M   s   
 rE   c                   @   s>   e Zd ZU ejed< ejed< ejed< eed< ejed< dS ) DeepSeekV32IndexerDecodeMetadatar9   seq_lensdecode_lensrequires_paddingschedule_metadataN)r-   r.   r/   rB   rC   rD   r7   r   r   r   r   rG   R   s   
 


rG   c                   @   s   e Zd ZU ejed< eed< eed< eed< eed< ejed< ejed< eed< eed	< eed
< eed< eed< dZedB ed< dZ	e
dB ed< dS )DeepseekV32IndexerMetadatarH   rA   max_query_lenmax_seq_lennum_actual_tokensquery_start_locslot_mappinghead_dimnum_decodesnum_decode_tokensnum_prefillsnum_prefill_tokensNdecodeprefill)r-   r.   r/   rB   rC   rD   r3   rW   rG   rX   rE   r   r   r   r   rL   [   s   
 


rL   start_seq_locseq_len_per_batchdevicer   c                 C   sL  | j tjd}|j tjd}| dkr| dksJ | | d ks*J d|dd |dd  }t|d  }| }|dkrXtjdtj|dtjdtj|dfS tj|dd| }t	t
||}	||	 }
t	||}t	||}tj
|tjdt	|dd | d }|| | }|
| }|
  ||  |fS )	a  
    Args:
      start_seq_loc: 1D long tensor [B+1], cumulative counts of
                     selected tokens per batch.
            Example: [0, 2, 4, 7] ->
                     batch sizes (selected) [2, 2, 3], N=7 tokens total.
      seq_len_per_batch: 1D long tensor [B],
                         full sequence length (KV length) of each batch.
                         Example: [5, 9, 4].

    Returns:
      start_tensor: 1D long tensor [N], start offset in the
                    concatenated KV cache for each token's batch.
      end_location: 1D long tensor [N],
                    **exclusive** end = start + token's local position.
                    (So the attended KV slice is kv[start:end].)

    Assumes each batch contributes its full `seq_len_per_batch[i]`
    keys to the KV cache, andthe selected tokens within a batch
    are the **last** `counts[i]` positions of that sequence.
    dtyper   z"start_seq_loc must have length B+1Nr   r]   r[   dim)torB   longra   numelr3   itememptycumsumrepeat_interleavearange)rY   rZ   r[   qLcountsNBkv_starts_per_batchbatch_idstart_tensorL_expandm_expand
pos_within	local_posend_locationr   r   r   kv_spans_from_batchesw   s*   &rw   vllm_configc                 C   s   | j j}|d S )N(   )model_configmax_model_len)rx   r{   r   r   r   get_max_prefill_buffer_size   s   	r|   c                	       sb   e Zd ZU ejZee ed< dZe	ed<  fddZ
dd Z	dd	e	d
ededefddZ  ZS )r   _cudagraph_supportr   reorder_batch_thresholdc                    s   t  j|i | | jj}t| j| _| jjr| jjjnd| _|  jt	| jd7  _t
j| j}|j}|| _t
j|jft
j| jd| _t
j| jd dft
j| jd| _d S )Nr   r   r_   r)   )super__init__rx   scheduler_configr|   max_prefill_buffer_sizespeculative_confignum_speculative_tokensr~   minrB   cudaget_device_propertiesr[   multi_processor_countnum_smsrf   max_num_seqsint32decode_lens_bufferscheduler_metadata_buffer)selfargskwargsr   propssm_count	__class__r   r   r      s"   z*DeepseekV32IndexerMetadataBuilder.__init__c                 C   s   |||d  ||  }t |||| | j\}}||  }	||  }
|||  }tjd|| tjd}t|||| | j}|| j	ksMJ t
tjdtjd||| jddgtj| j}t|||||||| |	|
|| d	S )Nr   r   r\   r`   )	r:   r;   r<   r=   r>   r9   r?   r@   rA   )rw   r[   re   sumrB   ri   r   rh   rb   r   catzerosrg   r8   )r   
reqs_startreqs_endquery_start_loc_cpuseq_lens_cpur9   prefill_query_start_locr:   r;   r?   r@   r>   seq_idxr=   r<   r   r   r   build_one_prefill_chunk   sH   

z9DeepseekV32IndexerMetadataBuilder.build_one_prefill_chunkFcommon_prefix_lencommon_attn_metadata
fast_buildr   c                    s   j } j} jt jd\}}}}	|| |ksJ ||	 |ks$J d }
|dkrGt j|d  j|d} fdd|D }t|d}
d }|dkrt	j
 jd |d  jd | d jd | }t	
 jd |d  }| | k } jd | }t rt|jjjjd d < t jd |d	f  jd | ||jd
}t j j  j j j j jd||||	|
|d}|S )N)decode_thresholdr   )request_offsetc              	      s&   g | ]\}} || j jqS r   )r   r   block_table_tensor).0r   r   r   r   r   r   r   
<listcomp>'  s    z;DeepseekV32IndexerMetadataBuilder.build.<locals>.<listcomp>)rF   r   )out.)r9   rH   rI   rJ   rK   r   )rH   rA   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rX   rW   )rA   rO   r   r   r~   r   r   r   rE   rB   diffrP   r   maxr   re   rH   r   r   kv_cache_specr#   r   r   rG   r   rL   rM   rN   rQ   )r   r   r   r   rA   
num_tokensrS   rU   rT   rV   prefill_metadatachunk_seq_idsrF   decode_metadatarI   decode_lens_cpurJ   rH   attn_metadatar   r   r   build  s|   

z'DeepseekV32IndexerMetadataBuilder.buildr,   )r-   r.   r/   r
   UNIFORM_SINGLE_TOKEN_DECODEr}   r   rD   r~   r3   r   r   r   r7   rL   r   __classcell__r   r   r   r   r      s    
 ,r   )$dataclassesr   typingr   rB   vllm.configr   vllm.loggerr   vllm.platformsr   vllm.utils.deep_gemmr   r   vllm.v1.attention.backendr	   r
   r   r   r    vllm.v1.attention.backends.utilsr   r   r-   loggerr   r8   rE   rG   rL   rC   r[   r6   rw   r|   r   r   r   r   r   <module>   s<   %
@