o
    iNs                  	   @   s,  d dl mZmZ d dlmZmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ e	rbd dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" G dd de#eZ$G dd dZ%G dd deZ&G dd dZ'ede'dZ(eG dd dZ)edZ*G dd deZ+G dd deee* Z,G dd  d eZ-G d!d" d"eee( Z.G d#d$ d$e.e( ee( Z/G d%d& d&e.e( ee( Z0G d'd( d(e.e( ee( Z1d)e#d*e2fd+d,Z3d-e#d.e4e& d/e4e,e*  d*e4e& fd0d1Z5d-e#d.e4e& d2e6e#e
f d*e4e& fd3d4Z7dS )5    )ABCabstractmethod)	dataclassreplace)Enum)TYPE_CHECKINGAnyClassVarGenericProtocolTypeVarget_argsN)
deprecated)
VllmConfig)
CacheDType)ColumnParallelLinear)QuantKey)DeviceCapability)KVCacheLayoutType)AttentionSpecc                   @   &   e Zd ZdZdZ	 dZ	 dZ	 dZdS )AttentionTypezO
    Attention type.
    Use string to be compatible with `torch.compile`.
    decoderencoderencoder_onlyencoder_decoderN)__name__
__module____qualname____doc__DECODERENCODERENCODER_ONLYENCODER_DECODER r$   r$   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/attention/backend.pyr      s    r   c                   @   s$   e Zd ZU eed< defddZdS )
MultipleOfbasec                 C   s
   || _ d S N)r'   )selfr'   r$   r$   r%   __init__*      
zMultipleOf.__init__N)r   r   r   int__annotations__r*   r$   r$   r$   r%   r&   '   s   
 r&   c                   @   s  e Zd ZU dZdZeed< ejej	gZ
eeej  ed< ddgZeed  ed< d	Zeed
< edeeeB  fddZeedefddZeeded fddZeedd Zee	dMdedededededeedf fddZe	dNdedeedf fddZedeeef fd d!Zedee fd"d#Zededefd$d%Z ed&ejdefd'd(Z!ed)d*defd+d,Z"eded-B defd.d/Z#edefd0d1Z$edefd2d3Z%edefd4d5Z&edefd6d7Z'edefd8d9Z(ed:edefd;d<Z)ed=d>defd?d@Z*eded&ejd)d*dedAedBedCedDd>ded-B fdEdFZ+eded&ejd)d*dedAedBedCedGedDd>d:edee fdHdIZ,edOdKdLZ-d-S )PAttentionBackendz&Abstract class for attention backends.Faccept_output_buffersupported_dtypesautobfloat16r   supported_kv_cache_dtypesT forward_includes_kv_cache_updatereturnc                   C   s
   t dgS )N   )r&   r$   r$   r$   r%    get_supported_kernel_block_sizes;      
z1AttentionBackend.get_supported_kernel_block_sizesc                   C      t r(   NotImplementedErrorr$   r$   r$   r%   get_name?      zAttentionBackend.get_nameAttentionImplBasec                   C   r9   r(   r:   r$   r$   r$   r%   get_impl_clsD   r=   zAttentionBackend.get_impl_clsc                   C   r9   r(   r:   r$   r$   r$   r%   get_builder_clsI   r=   z AttentionBackend.get_builder_cls
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 C   r9   r(   r:   )rA   rB   rC   rD   rE   r$   r$   r%   get_kv_cache_shapeN      	z#AttentionBackend.get_kv_cache_shapeinclude_num_layers_dimensionc                 C   r9   )av  
        Get the physical (memory layout) ordering of the kv cache dimensions.
        e.g. if the KV cache shape is
        [2, num_blocks, block_size, num_heads, head_size],
        and get_kv_cache_stride_order returns (1, 3, 0, 2, 4) then the physical
        ordering of dimensions is
        [num_blocks, num_heads, 2, block_size, head_size].

        If this function is unimplemented / raises NotImplementedError,
        the physical layout of the KV cache will match the logical shape.

        Args:
            include_num_layers_dimension: if True, includes an additional
                num_layers dimension, which is assumed to be prepended
                to the logical KV cache shape.
                With the above example, a return value (2, 4, 0, 1, 3, 5)
                corresponds to
                [num_blocks, num_heads, num_layers, 2, block_size, head_size].

                If an additional dimension is NOT included in the returned
                tuple, the physical layout will not include a layers dimension.

        Returns:
            A tuple of ints which is a permutation of range(len(shape)).
        r:   )rH   r$   r$   r%   get_kv_cache_stride_orderY   s   z*AttentionBackend.get_kv_cache_stride_orderc                 C   s   | j | jfS r(   )r   r   clsr$   r$   r%   full_cls_namex   s   zAttentionBackend.full_cls_namec                 C   s   g S r(   r$   rJ   r$   r$   r%   get_supported_head_sizes|      z)AttentionBackend.get_supported_head_sizesc                 C   s   |   }| p
||v S r(   )rM   )rK   rD   supported_head_sizesr$   r$   r%   supports_head_size   s   z#AttentionBackend.supports_head_sizedtypec                 C   s
   || j v S r(   )r0   )rK   rQ   r$   r$   r%   supports_dtype   r8   zAttentionBackend.supports_dtypekv_cache_dtypezCacheDType | Nonec                 C   s   |d u rdS | j  p|| j v S NT)r3   )rK   rS   r$   r$   r%   supports_kv_cache_dtype   s
   z(AttentionBackend.supports_kv_cache_dtypeNc                 C   sl   ddl m} |d u rdS t|}||vrdS |  }|sdS |D ]}t|tr*|j}|| dkr3 dS q dS )Nr   )	BlockSizeTF)vllm.config.cacherV   r   r7   
isinstancer&   r'   )rK   rB   rV   valid_sizessupported_kernel_block_sizessupported_sizer$   r$   r%   supports_block_size   s    
z$AttentionBackend.supports_block_sizec                 C      dS NFr$   rJ   r$   r$   r%   is_mla   rN   zAttentionBackend.is_mlac                 C   r]   r^   r$   rJ   r$   r$   r%   supports_sink   rN   zAttentionBackend.supports_sinkc                 C   r]   r^   r$   rJ   r$   r$   r%   supports_alibi_sqrt   rN   z$AttentionBackend.supports_alibi_sqrtc                 C   r]   r^   r$   rJ   r$   r$   r%   supports_mm_prefix   rN   z#AttentionBackend.supports_mm_prefixc                 C   r]   r^   r$   rJ   r$   r$   r%   	is_sparse   rN   zAttentionBackend.is_sparse	attn_typec                 C   s
   |t jkS )zCheck if backend supports a given attention type.

        By default, only supports decoder attention.
        Backends should override this to support other attention types.
        )r   r    )rK   rd   r$   r$   r%   supports_attn_type   s   
z#AttentionBackend.supports_attn_type
capabilityr   c                 C   r]   rT   r$   )rK   rf   r$   r$   r%   supports_compute_capability   rN   z,AttentionBackend.supports_compute_capabilityuse_mlahas_sink
use_sparsedevice_capabilityc	           	      C      d S r(   r$   )	rK   rD   rQ   rS   rB   rh   ri   rj   rk   r$   r$   r%   supports_combination      z%AttentionBackend.supports_combinationuse_mm_prefixc              
   C   s*  g }|  |s|d | |s|d | |s |d | |s*|d |r5|  s5|d ||  krH|rC|d n|d |rS|  sS|d ||  krf|ra|d	 n|d
 | 	|	sp|d | 
|
s~|d|
 d | ||||||||	}|d ur|| |S )Nzhead_size not supportedzdtype not supportedzkv_cache_dtype not supportedzblock_size not supportedz5partial multimodal token full attention not supportedzMLA not supportedznon-MLA not supportedzsink setting not supportedzsparse not supportedznon-sparse not supportedz compute capability not supportedzattention type z not supported)rP   appendrR   rU   r\   rb   r_   r`   rc   rg   re   rm   )rK   rD   rQ   rS   rB   rh   ri   rj   ro   rk   rd   invalid_reasonscombination_reasonr$   r$   r%   validate_configuration   sP   















z'AttentionBackend.validate_configurationKVCacheLayoutType | Nonec                 C   rl   r(   r$   rJ   r$   r$   r%   get_required_kv_cache_layout  rN   z-AttentionBackend.get_required_kv_cache_layout)r1   F)r5   rt   ).r   r   r   r   r/   boolr-   torchfloat16r2   r0   r	   listrQ   r3   r4   staticmethodr,   r&   r7   r   strr<   typer?   r@   tuplerF   rI   classmethodrL   rM   rP   rR   rU   r\   r_   r`   ra   rb   rc   re   rg   rm   rs   ru   r$   r$   r$   r%   r.   .   s   
 
	
	
	
8r.   c                   @   s   e Zd ZdS )AttentionMetadataN)r   r   r   r$   r$   r$   r%   r     s    r   T)boundc                   @   s  e Zd ZU dZejed< ejed< 	 ejed< 	 eed< 	 eed< 	 eed< 	 eed< 	 ejed	< ejed
< dZe	ed< dZ
ejdB ed< dZedB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< 	 dZejdB ed< dZejdB ed< dZejdB ed< defddZdejfddZd)ddZeeddejfdd Zeed!dejfd"d#Zdejfd$d%Zded&edd fd'd(ZdS )*CommonAttentionMetadataz
    Per-batch attention metadata, shared across layers and backends.
    AttentionMetadataBuilder instances use it to construct per-layer metadata.

    For many of the tensors we keep both GPU and CPU versions.
    query_start_locquery_start_loc_cpuseq_lensnum_reqsnum_actual_tokensmax_query_lenmax_seq_lenblock_table_tensorslot_mappingTcausalNlogits_indices_paddednum_logits_indicesencoder_seq_lensencoder_seq_lens_cpudcp_local_seq_lensdcp_local_seq_lens_cpu_seq_lens_cpu_num_computed_tokens_cpu_num_computed_tokens_cacher5   c                 C   s   | j jd S )Nr   )r   shaper)   r$   r$   r%   
batch_sizeO  s   z"CommonAttentionMetadata.batch_sizec                 C   s   | j dd | j dd  S )zENaive because it assumes that query ends where the next query starts.r6   N)r   r   r$   r$   r%   naive_query_lensR  s   z(CommonAttentionMetadata.naive_query_lensc                 K   s   t | fi |S r(   )r   )r)   kwargsr$   r$   r%   r   V  s   zCommonAttentionMetadata.replacez
    Prefer using device seq_lens directly to avoid implicit H<>D sync.
    If a CPU copy is needed, use `seq_lens.cpu()` instead.
    Will be removed in a future release, please migrate as soon as possible.
    c                 C   s   | j d u r| jd| _ | j S )Ncpu)r   r   tor   r$   r$   r%   seq_lens_cpuY  s   
	z$CommonAttentionMetadata.seq_lens_cpua  
    Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
    async scheduling. If a CPU copy is needed, it can be derived from 
    query_start_loc_cpu and seq_lens.
    Will be removed in a future release, please migrate as soon as possible.
    c                 C   s8   | j d u r| jdd  | jd d  }| j| | _ | j S )Nr6   r   )r   r   r   )r)   query_seq_lensr$   r$   r%   num_computed_tokens_cpuf  s
   

z/CommonAttentionMetadata.num_computed_tokens_cpuc                 C   s8   | j du r| jdd | jdd  }| j| | _ | j S )z>Compute num_computed_tokens on device (seq_lens - query_lens).Nr6   r   )r   r   r   )r)   
query_lensr$   r$   r%   compute_num_computed_tokensw  s   
z3CommonAttentionMetadata.compute_num_computed_tokensnum_actual_reqsc                    s   fdd}t di d| jd  d  d| jd  d  d| jd   d| jd ur2| jd   nd d| jd urA| jd   nFd d	 d
|d| jd| jd| jd   d| j	d | d| j
d| jd| jd|| jd|| jd|| jd|| jS d	 d
|d| jd| jd| jd   d| j	d | d| j
d| jd| jd|| jd|| jd|| jd|| jS )Nc                    s   | d ur
| d   S d S r(   r$   )xr   r$   r%   <lambda>  s    z2CommonAttentionMetadata.unpadded.<locals>.<lambda>r   r6   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r$   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r)   r   r   maybe_slice_reqsr$   r   r%   unpadded  s   











z CommonAttentionMetadata.unpadded)r5   r   )r   r   r   r   rx   Tensorr-   r,   r   rw   r   r   r   r   npndarrayr   r   r   r   r   r   r   r   propertyr   r   r   r   r   r$   r$   r$   r%   r     sb   
 





r   Mc                   @   r   )AttentionCGSupportzConstants for the cudagraph support of the attention backend
    Here we do not consider the cascade attention, as currently
    it is never cudagraph supported.      r6   r   N)r   r   r   r   ALWAYSUNIFORM_BATCHUNIFORM_SINGLE_TOKEN_DECODENEVERr$   r$   r$   r%   r     s    r   c                   @   sD  e Zd ZU ejZee ed< dZe	dB ed< dZ
eed< edddee d	d
dejfddZeded  d	d
dddefddZ			d0de	dB dededdfddZe	d1de	dededefddZdedejdejdefdd Zdedefd!d"Zded#e	defd$d%Zde	d&ejd'e	d(e	d)ed*ed+ed,e	d-e	defd.d/ZdS )2AttentionMetadataBuilder_cudagraph_supportNreorder_batch_thresholdFsupports_update_block_tablekv_cache_specr   layer_namesvllm_configr   devicec                 C   s   || _ || _|| _|| _d S r(   )r   r   r   r   )r)   r   r   r   r   r$   r$   r%   r*     s   
z!AttentionMetadataBuilder.__init__rK   r5   c                 C   s   | j S )z6Get the cudagraph support level of this builder class.)r   )rK   r   r   r$   r$   r%   get_cudagraph_support  s   z.AttentionMetadataBuilder.get_cudagraph_supportr6   supports_spec_as_decodesupports_dcp_with_varlenc                 C   sx   || _ | j d ur*|r*| jj}|d ur*|jd ur*d|jrdnd|j  }t| j || _ | jjjdkr8|s:d| _ d S d S d S )Nr6   r   )r   r   speculative_confignum_speculative_tokensparallel_draftingmaxparallel_configdecode_context_parallel_size)r)   r   r   r   r   max_num_queries_for_specr$   r$   r%   _init_reorder_batch_threshold  s(   

z6AttentionMetadataBuilder._init_reorder_batch_thresholdcommon_prefix_lencommon_attn_metadata
fast_buildc                 C   r9   )a  
        Central method that builds attention metadata.
        Some builders (MLA) require reorder_batch to be called prior to build.

        Args:
            common_prefix_len: The length of the common prefix of the batch.
            common_attn_metadata: The common attention metadata.
            fast_build: The meta-data will prioritize speed of building over
                then speed at execution. Can be used for spec-decode where the
                result of a build call may only be used for few layers/iters.
        r:   )r)   r   r   r   r$   r$   r%   build  s   zAttentionMetadataBuilder.buildmetadata	blk_tabler   c                 C   r9   )a  
        Update the block table for the attention metadata.
        Faster when theres multiple kv-cache groups that create virtually the
        same metadata but just with different block tables.

        Only needs to be implemented if supports_update_block_table is True.
        r:   )r)   r   r   r   r$   r$   r%   update_block_table     z+AttentionMetadataBuilder.update_block_tablec                 C   s   | j d|dS )z
        Build attention metadata for CUDA graph capture. Uses build by default.
        Subclasses that override this method should call self.build or
        super().build_for_cudagraph_capture.
        r   )r   r   r   )r)   r   r$   r$   r%   build_for_cudagraph_capture  s   z4AttentionMetadataBuilder.build_for_cudagraph_capturedraft_indexc                 C   s   | j d|ddS )a  
        Build attention metadata for draft model. Uses build by default.

        Args:
            common_attn_metadata: The common attention metadata.
            draft_index: The index of the current draft operation.
                When speculating a chain of tokens, this index refers to the
                draft attempt for the i-th token.
                For tree-based attention, this index instead refers to the
                draft attempt for the i-th level in the tree of tokens.
        r   T)r   r   r   r   )r)   r   r   r$   r$   r%   build_for_drafting"  s
   z+AttentionMetadataBuilder.build_for_draftingr   num_query_headsrC   	use_alibiuse_sliding_windowuse_local_attentionnum_smsdcp_world_sizec
           
      C   r]   r^   r$   )
r)   r   r   r   rC   r   r   r   r   r   r$   r$   r%   use_cascade_attention8  rn   z.AttentionMetadataBuilder.use_cascade_attention)r6   FFrv   ) r   r   r   r   r   r   r	   r-   r   r,   r   rw   r   rz   r|   rx   r   r*   r   r}   r   r   r   r   r   r   r   r   r   r   r   r   r$   r$   r$   r%   r     s   
 

 


	
r   c                   @   sz   e Zd ZU ejed< ejed< ejed< eed< eed< eed< ejed< dejd	ejd
ejdejdedejfddZdS )AttentionLayer_q_scale_k_scale_v_scale_q_scale_float_k_scale_float_v_scale_float_prob_scalequerykeyvaluekv_cacheattn_metadatar5   c                 C   rl   r(   r$   )r)   r   r   r   r   r   r$   r$   r%   forwardP  s   zAttentionLayer.forwardN)	r   r   r   rx   r   r-   floatr   r   r$   r$   r$   r%   r   G  s*   
 



r   c                       s   e Zd ZU dZeed< eed< eed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< dZeed< eed< eed< eed< eed< eed< eed<  fddZdejfddZ  ZS )r>   zBase class for attention implementations.

    Contains common attributes and initialization logic shared by both
    standard AttentionImpl and MLAAttentionImpl. Does not define a forward
    method - subclasses define their own forward interfaces.
    	num_headsrD   scaleFcan_return_lse_for_decodesupports_pcp0supports_mtp_with_cp_non_trivial_interleave_sizeneed_to_return_lse_for_decodesupports_quant_query_inputsupports_per_head_quant_scalesr   dcp_rankpcp_world_sizepcp_ranktotal_cp_world_sizetotal_cp_rankc                    s   t  | }zddlm} | j|_| j|_W n ty'   d|_d|_Y nw zddlm	} | j|_
| j|_W n tyI   d|_
d|_Y nw |j
|j |_|j|j |j |_|jdkob|j|_|S )Nr   )get_dcp_groupr6   )get_pcp_group)super__new__vllm.distributed.parallel_stater   
world_sizer   rank_in_groupr   AssertionErrorr   r   r   r   r   r   r   )rK   argsr   r)   r   r   	__class__r$   r%   r     s,   



zAttentionImplBase.__new__	act_dtypec                 C   rl   r(   r$   )r)   r  r$   r$   r%   process_weights_after_loading  s   z/AttentionImplBase.process_weights_after_loading)r   r   r   r   r,   r-   r   r   rw   r   r   r   r   r   r   rx   rQ   r  __classcell__r$   r$   r  r%   r>   Z  s&   
 
r>   c                   @   s   e Zd ZdZedddddejdfdededededB de	e dB d	edB d
e
dedB de
de
dB ddfddZe			d dedejdejdejdejdedejdB dejdB dejdB dejfddZd!ddZdS )"AttentionImplz6Standard attention implementation with forward method.Nr1   r   rD   r   rC   alibi_slopessliding_windowrS   logits_soft_caprd   kv_sharing_target_layer_namer5   c                 C   r9   r(   r:   )r)   r   rD   r   rC   r  r  rS   r	  rd   r
  r$   r$   r%   r*     s   zAttentionImpl.__init__layerr   r   r   r   r   outputoutput_scaleoutput_block_scalec
           
      C   r9   r(   r:   )
r)   r  r   r   r   r   r   r  r  r  r$   r$   r%   r     r   zAttentionImpl.forward	quant_keyr   c                 C   r]   )ab  
        Does this attention implementation support fused output quantization.
        This is used by the AttnFusionPass to only fuse output quantization
        onto implementations that support it.

        :param quant_key: QuantKey object that describes the quantization op
        :return: is fusion supported for this type of quantization
        Fr$   )r)   r  r$   r$   r%   fused_output_quant_supported  rG   z*AttentionImpl.fused_output_quant_supported)NNN)r  r   )r   r   r   r   r   r   r    r,   r   rz   r|   r*   r   rx   r   r   r   r  r$   r$   r$   r%   r    sv    
	
		
r  c                *   @   s  e Zd ZdZe		d&dededededee dB dedB d	ed
edB dededB dedB dedededededdde	dB dedB ddf(ddZ
edejdejdejdejdedejd ejddfd!d"Zedejeejejf B dejded#edeejejdB f f
d$d%ZdS )'MLAAttentionImplzFMLA attention implementation with forward_mqa and forward_mha methods.Nr   rD   r   rC   r  r  rS   r	  rd   r
  q_lora_rankkv_lora_rankqk_nope_head_dimqk_rope_head_dimqk_head_dim
v_head_dim	kv_b_projr   indexerq_pad_num_headsr5   c                 C   r9   r(   r:   r)   r   rD   r   rC   r  r  rS   r	  rd   r
  r  r  r  r  r  r  r  r  r  r$   r$   r%   r*        zMLAAttentionImpl.__init__qkv_c_normedk_pekv_c_and_k_pe_cacher   k_scaler  c                 C   r9   )zMHA-style prefill forward pass.r:   )r)   r  r  r  r   r   r!  r  r$   r$   r%   forward_mha  rn   zMLAAttentionImpl.forward_mhar  c                 C   r9   zMQA-style decode forward pass.r:   r)   r  r   r   r  r$   r$   r%   forward_mqa  rG   zMLAAttentionImpl.forward_mqaNN)r   r   r   r   r   r,   r   rz   r|   objectr*   rx   r   r   r"  r~   r   r%  r$   r$   r$   r%   r    s    
	
	r  c                *   @   s   e Zd ZdZe		d dededededee dB dedB d	ed
edB dededB dedB dedededededdde	dB dedB ddf(ddZ
edejeejejf B dejdededeejejdB f f
ddZdS )!SparseMLAAttentionImplzSparse MLA attention implementation with only forward_mqa method.

    Sparse MLA implementations only support decode (MQA-style) attention.
    They do not support prefill (MHA-style) attention.
    Nr   rD   r   rC   r  r  rS   r	  rd   r
  r  r  r  r  r  r  r  r   r  r  r5   c                 C   r9   r(   r:   r  r$   r$   r%   r*     r  zSparseMLAAttentionImpl.__init__r  r   r   r  c                 C   r9   r#  r:   r$  r$   r$   r%   r%  /  rG   z"SparseMLAAttentionImpl.forward_mqar&  )r   r   r   r   r   r,   r   rz   r|   r'  r*   rx   r   r~   r   r   r%  r$   r$   r$   r%   r(    st    
	
r(  rS   r5   c                 C   s
   |  dS )Nfp8)
startswith)rS   r$   r$   r%   is_quantized_kv_cache;  r+   r+  name_prefixattention_backend_clsbuilder_clsc                    s$   | |j  }t||fd fddiS )zN
    Return a new subclass where `get_builder_cls` returns `builder_cls`.
    r@   c                      s    S r(   r$   r$   r.  r$   r%   r   J  s    z,subclass_attention_backend.<locals>.<lambda>r   r}   )r,  r-  r.  namer$   r/  r%   subclass_attention_backend?  s   
r2  	overridesc                 C   s   | |j  }t||f|S r(   r0  )r,  r-  r3  r1  r$   r$   r%   )subclass_attention_backend_with_overridesN  s   
r4  )8abcr   r   dataclassesr   r   enumr   typingr   r   r	   r
   r   r   r   numpyr   rx   typing_extensionsr   vllm.configr   rW   r   !vllm.model_executor.layers.linearr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platforms.interfacer    vllm.v1.attention.backends.utilsr   vllm.v1.kv_cache_interfacer   r|   r   r&   r.   r   r   r   r   r   r   r   r>   r  r  r(  rw   r+  r}   r2  dictr4  r$   r$   r$   r%   <module>   sf   $ j~ O.7-


