o
    .ii                  	   @   s  d dl mZmZ d dlmZmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ e	rbd dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" G dd de#eZ$G dd dZ%G dd deZ&G dd dZ'ede'dZ(eG dd dZ)edZ*G dd deZ+G dd deee* Z,G dd  d eZ-G d!d" d"eee( Z.G d#d$ d$e.e( ee( Z/d%e#d&e0fd'd(Z1d)e#d*e2e& d+e2e,e*  d&e2e& fd,d-Z3d)e#d*e2e& d.e4e#e
f d&e2e& fd/d0Z5dS )1    )ABCabstractmethod)	dataclassreplace)Enum)TYPE_CHECKINGAnyClassVarGenericProtocolTypeVarget_argsN)
deprecated)
VllmConfig)
CacheDType)ColumnParallelLinear)QuantKey)DeviceCapability)KVCacheLayoutType)AttentionSpecc                   @   &   e Zd ZdZdZ	 dZ	 dZ	 dZdS )AttentionTypezO
    Attention type.
    Use string to be compatible with `torch.compile`.
    decoderencoderencoder_onlyencoder_decoderN)__name__
__module____qualname____doc__DECODERENCODERENCODER_ONLYENCODER_DECODER r$   r$   V/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/attention/backend.pyr      s    r   c                   @   s$   e Zd ZU eed< defddZdS )
MultipleOfbasec                 C   s
   || _ d S N)r'   )selfr'   r$   r$   r%   __init__*      
zMultipleOf.__init__N)r   r   r   int__annotations__r*   r$   r$   r$   r%   r&   '   s   
 r&   c                   @   s  e Zd ZU dZdZeed< ejej	gZ
eeej  ed< ddgZeed  ed< d	Zeed
< edeeeB  fddZeedefddZeeded fddZeedd Zee	dMdedededededeedf fddZe	dNdedeedf fddZedeeef fd d!Zedee fd"d#Zededefd$d%Z ed&ejdefd'd(Z!ed)d*defd+d,Z"eded-B defd.d/Z#edefd0d1Z$edefd2d3Z%edefd4d5Z&edefd6d7Z'edefd8d9Z(ed:edefd;d<Z)ed=d>defd?d@Z*eded&ejd)d*dedAedBedCedDd>ded-B fdEdFZ+eded&ejd)d*dedAedBedCedGedDd>d:edee fdHdIZ,edOdKdLZ-d-S )PAttentionBackendz&Abstract class for attention backends.Faccept_output_buffersupported_dtypesautobfloat16r   supported_kv_cache_dtypesT forward_includes_kv_cache_updatereturnc                   C   s
   t dgS N   )r&   r$   r$   r$   r%    get_supported_kernel_block_sizes;      
z1AttentionBackend.get_supported_kernel_block_sizesc                   C      t r(   NotImplementedErrorr$   r$   r$   r%   get_name?      zAttentionBackend.get_nameAttentionImplc                   C   r:   r(   r;   r$   r$   r$   r%   get_impl_clsD   r>   zAttentionBackend.get_impl_clsc                   C   r:   r(   r;   r$   r$   r$   r%   get_builder_clsI   r>   z AttentionBackend.get_builder_cls
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 C   r:   r(   r;   )rB   rC   rD   rE   rF   r$   r$   r%   get_kv_cache_shapeN      	z#AttentionBackend.get_kv_cache_shapeinclude_num_layers_dimensionc                 C   r:   )av  
        Get the physical (memory layout) ordering of the kv cache dimensions.
        e.g. if the KV cache shape is
        [2, num_blocks, block_size, num_heads, head_size],
        and get_kv_cache_stride_order returns (1, 3, 0, 2, 4) then the physical
        ordering of dimensions is
        [num_blocks, num_heads, 2, block_size, head_size].

        If this function is unimplemented / raises NotImplementedError,
        the physical layout of the KV cache will match the logical shape.

        Args:
            include_num_layers_dimension: if True, includes an additional
                num_layers dimension, which is assumed to be prepended
                to the logical KV cache shape.
                With the above example, a return value (2, 4, 0, 1, 3, 5)
                corresponds to
                [num_blocks, num_heads, num_layers, 2, block_size, head_size].

                If an additional dimension is NOT included in the returned
                tuple, the physical layout will not include a layers dimension.

        Returns:
            A tuple of ints which is a permutation of range(len(shape)).
        r;   )rI   r$   r$   r%   get_kv_cache_stride_orderY   s   z*AttentionBackend.get_kv_cache_stride_orderc                 C   s   | j | jfS r(   )r   r   clsr$   r$   r%   full_cls_namex   s   zAttentionBackend.full_cls_namec                 C   s   g S r(   r$   rK   r$   r$   r%   get_supported_head_sizes|      z)AttentionBackend.get_supported_head_sizesc                 C   s   |   }| p
||v S r(   )rN   )rL   rE   supported_head_sizesr$   r$   r%   supports_head_size   s   z#AttentionBackend.supports_head_sizedtypec                 C   s
   || j v S r(   )r0   )rL   rR   r$   r$   r%   supports_dtype   r9   zAttentionBackend.supports_dtypekv_cache_dtypezCacheDType | Nonec                 C   s   |d u rdS | j  p|| j v S NT)r3   )rL   rT   r$   r$   r%   supports_kv_cache_dtype   s
   z(AttentionBackend.supports_kv_cache_dtypeNc                 C   sl   ddl m} |d u rdS t|}||vrdS |  }|sdS |D ]}t|tr*|j}|| dkr3 dS q dS )Nr   )	BlockSizeTF)vllm.config.cacherW   r   r8   
isinstancer&   r'   )rL   rC   rW   valid_sizessupported_kernel_block_sizessupported_sizer$   r$   r%   supports_block_size   s    
z$AttentionBackend.supports_block_sizec                 C      dS NFr$   rK   r$   r$   r%   is_mla   rO   zAttentionBackend.is_mlac                 C   r^   r_   r$   rK   r$   r$   r%   supports_sink   rO   zAttentionBackend.supports_sinkc                 C   r^   r_   r$   rK   r$   r$   r%   supports_alibi_sqrt   rO   z$AttentionBackend.supports_alibi_sqrtc                 C   r^   r_   r$   rK   r$   r$   r%   supports_mm_prefix   rO   z#AttentionBackend.supports_mm_prefixc                 C   r^   r_   r$   rK   r$   r$   r%   	is_sparse   rO   zAttentionBackend.is_sparse	attn_typec                 C   s
   |t jkS )zCheck if backend supports a given attention type.

        By default, only supports decoder attention.
        Backends should override this to support other attention types.
        )r   r    )rL   re   r$   r$   r%   supports_attn_type   s   
z#AttentionBackend.supports_attn_type
capabilityr   c                 C   r^   rU   r$   )rL   rg   r$   r$   r%   supports_compute_capability   rO   z,AttentionBackend.supports_compute_capabilityuse_mlahas_sink
use_sparsedevice_capabilityc	           	      C      d S r(   r$   )	rL   rE   rR   rT   rC   ri   rj   rk   rl   r$   r$   r%   supports_combination      z%AttentionBackend.supports_combinationuse_mm_prefixc              
   C   s*  g }|  |s|d | |s|d | |s |d | |s*|d |r5|  s5|d ||  krH|rC|d n|d |rS|  sS|d ||  krf|ra|d	 n|d
 | 	|	sp|d | 
|
s~|d|
 d | ||||||||	}|d ur|| |S )Nzhead_size not supportedzdtype not supportedzkv_cache_dtype not supportedzblock_size not supportedz5partial multimodal token full attention not supportedzMLA not supportedznon-MLA not supportedzsink setting not supportedzsparse not supportedznon-sparse not supportedz compute capability not supportedzattention type z not supported)rQ   appendrS   rV   r]   rc   r`   ra   rd   rh   rf   rn   )rL   rE   rR   rT   rC   ri   rj   rk   rp   rl   re   invalid_reasonscombination_reasonr$   r$   r%   validate_configuration   sP   















z'AttentionBackend.validate_configurationKVCacheLayoutType | Nonec                 C   rm   r(   r$   rK   r$   r$   r%   get_required_kv_cache_layout  rO   z-AttentionBackend.get_required_kv_cache_layout)r1   F)r5   ru   ).r   r   r   r   r/   boolr-   torchfloat16r2   r0   r	   listrR   r3   r4   staticmethodr,   r&   r8   r   strr=   typer@   rA   tuplerG   rJ   classmethodrM   rN   rQ   rS   rV   r]   r`   ra   rb   rc   rd   rf   rh   rn   rt   rv   r$   r$   r$   r%   r.   .   s   
 
	
	
	
8r.   c                   @   s   e Zd ZdS )AttentionMetadataN)r   r   r   r$   r$   r$   r%   r     s    r   T)boundc                   @   s  e Zd ZU dZejed< ejed< 	 ejed< 	 eed< 	 eed< 	 eed< 	 eed< 	 ejed	< ejed
< dZe	ed< dZ
ejdB ed< dZedB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< 	 dZejdB ed< dZejdB ed< dZejdB ed< defddZdejfddZd)ddZeeddejfdd Zeed!dejfd"d#Zdejfd$d%Zded&edd fd'd(ZdS )*CommonAttentionMetadataz
    Per-batch attention metadata, shared across layers and backends.
    AttentionMetadataBuilder instances use it to construct per-layer metadata.

    For many of the tensors we keep both GPU and CPU versions.
    query_start_locquery_start_loc_cpuseq_lensnum_reqsnum_actual_tokensmax_query_lenmax_seq_lenblock_table_tensorslot_mappingTcausalNlogits_indices_paddednum_logits_indicesencoder_seq_lensencoder_seq_lens_cpudcp_local_seq_lensdcp_local_seq_lens_cpu_seq_lens_cpu_num_computed_tokens_cpu_num_computed_tokens_cacher5   c                 C   s   | j jd S )Nr   )r   shaper)   r$   r$   r%   
batch_sizeO  s   z"CommonAttentionMetadata.batch_sizec                 C   s   | j dd | j dd  S )zENaive because it assumes that query ends where the next query starts.r7   N)r   r   r$   r$   r%   naive_query_lensR  s   z(CommonAttentionMetadata.naive_query_lensc                 K   s   t | fi |S r(   )r   )r)   kwargsr$   r$   r%   r   V  s   zCommonAttentionMetadata.replacez
    Prefer using device seq_lens directly to avoid implicit H<>D sync.
    If a CPU copy is needed, use `seq_lens.cpu()` instead.
    Will be removed in a future release (v0.15.0)
    c                 C   s   | j d u r| jd| _ | j S )Ncpu)r   r   tor   r$   r$   r%   seq_lens_cpuY  s   
	z$CommonAttentionMetadata.seq_lens_cpuz
    Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
    async scheduling. If a CPU copy is needed, it can be derived from 
    query_start_loc_cpu and seq_lens.
    Will be removed in a future release (v0.15.0)
    c                 C   s8   | j d u r| jdd  | jd d  }| j| | _ | j S )Nr7   r   )r   r   r   )r)   query_seq_lensr$   r$   r%   num_computed_tokens_cpuf  s
   

z/CommonAttentionMetadata.num_computed_tokens_cpuc                 C   s8   | j du r| jdd | jdd  }| j| | _ | j S )z>Compute num_computed_tokens on device (seq_lens - query_lens).Nr7   r   )r   r   r   )r)   
query_lensr$   r$   r%   compute_num_computed_tokensw  s   
z3CommonAttentionMetadata.compute_num_computed_tokensnum_actual_reqsc                    s   fdd}t di d| jd  d  d| jd  d  d| jd   d| jd ur2| jd   nd d| jd urA| jd   nFd d	 d
|d| jd| jd| jd   d| j	d | d| j
d| jd| jd|| jd|| jd|| jd|| jS d	 d
|d| jd| jd| jd   d| j	d | d| j
d| jd| jd|| jd|| jd|| jd|| jS )Nc                    s   | d ur
| d   S d S r(   r$   )xr   r$   r%   <lambda>  s    z2CommonAttentionMetadata.unpadded.<locals>.<lambda>r   r7   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r$   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r)   r   r   maybe_slice_reqsr$   r   r%   unpadded  s   











z CommonAttentionMetadata.unpadded)r5   r   )r   r   r   r   ry   Tensorr-   r,   r   rx   r   r   r   r   npndarrayr   r   r   r   r   r   r   r   propertyr   r   r   r   r   r$   r$   r$   r%   r     sb   
 





r   Mc                   @   r   )AttentionCGSupportzConstants for the cudagraph support of the attention backend
    Here we do not consider the cascade attention, as currently
    it is never cudagraph supported.      r7   r   N)r   r   r   r   ALWAYSUNIFORM_BATCHUNIFORM_SINGLE_TOKEN_DECODENEVERr$   r$   r$   r%   r     s    r   c                   @   sD  e Zd ZU ejZee ed< dZe	dB ed< dZ
eed< edddee d	d
dejfddZeded  d	d
dddefddZ			d0de	dB dededdfddZe	d1de	dededefddZdedejdejdefdd Zdedefd!d"Zded#e	defd$d%Zde	d&ejd'e	d(e	d)ed*ed+ed,e	d-e	defd.d/ZdS )2AttentionMetadataBuilder_cudagraph_supportNreorder_batch_thresholdFsupports_update_block_tablekv_cache_specr   layer_namesvllm_configr   devicec                 C   s   || _ || _|| _|| _d S r(   )r   r   r   r   )r)   r   r   r   r   r$   r$   r%   r*     s   
z!AttentionMetadataBuilder.__init__rL   r5   c                 C   s   | j S )z6Get the cudagraph support level of this builder class.)r   )rL   r   r   r$   r$   r%   get_cudagraph_support  s   z.AttentionMetadataBuilder.get_cudagraph_supportr7   supports_spec_as_decodesupports_dcp_with_varlenc                 C   sf   || _ | j d ur!|r!| jj}|d ur!|jd ur!t| j d|j | _ | jjjdkr/|s1d| _ d S d S d S r6   )r   r   speculative_confignum_speculative_tokensmaxparallel_configdecode_context_parallel_size)r)   r   r   r   r   r$   r$   r%   _init_reorder_batch_threshold  s   

z6AttentionMetadataBuilder._init_reorder_batch_thresholdcommon_prefix_lencommon_attn_metadata
fast_buildc                 C   r:   )a  
        Central method that builds attention metadata.
        Some builders (MLA) require reorder_batch to be called prior to build.

        Args:
            common_prefix_len: The length of the common prefix of the batch.
            common_attn_metadata: The common attention metadata.
            fast_build: The meta-data will prioritize speed of building over
                then speed at execution. Can be used for spec-decode where the
                result of a build call may only be used for few layers/iters.
        r;   )r)   r   r   r   r$   r$   r%   build  s   zAttentionMetadataBuilder.buildmetadata	blk_tabler   c                 C   r:   )a  
        Update the block table for the attention metadata.
        Faster when theres multiple kv-cache groups that create virtually the
        same metadata but just with different block tables.

        Only needs to be implemented if supports_update_block_table is True.
        r;   )r)   r   r   r   r$   r$   r%   update_block_table     z+AttentionMetadataBuilder.update_block_tablec                 C   s   | j d|dS )z
        Build attention metadata for CUDA graph capture. Uses build by default.
        Subclasses that override this method should call self.build or
        super().build_for_cudagraph_capture.
        r   )r   r   r   )r)   r   r$   r$   r%   build_for_cudagraph_capture  s   z4AttentionMetadataBuilder.build_for_cudagraph_capturedraft_indexc                 C   s   | j d|ddS )a  
        Build attention metadata for draft model. Uses build by default.

        Args:
            common_attn_metadata: The common attention metadata.
            draft_index: The index of the current draft operation.
                When speculating a chain of tokens, this index refers to the
                draft attempt for the i-th token.
                For tree-based attention, this index instead refers to the
                draft attempt for the i-th level in the tree of tokens.
        r   T)r   r   r   r   )r)   r   r   r$   r$   r%   build_for_drafting  s
   z+AttentionMetadataBuilder.build_for_draftingr   num_query_headsrD   	use_alibiuse_sliding_windowuse_local_attentionnum_smsdcp_world_sizec
           
      C   r^   r_   r$   )
r)   r   r   r   rD   r   r   r   r   r   r$   r$   r%   use_cascade_attention3  ro   z.AttentionMetadataBuilder.use_cascade_attention)r7   FFrw   ) r   r   r   r   r   r   r	   r-   r   r,   r   rx   r   r{   r}   ry   r   r*   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r$   r$   r$   r%   r     s   
 




	
r   c                   @   sz   e Zd ZU ejed< ejed< ejed< eed< eed< eed< ejed< dejd	ejd
ejdejdedejfddZdS )AttentionLayer_q_scale_k_scale_v_scale_q_scale_float_k_scale_float_v_scale_float_prob_scalequerykeyvaluekv_cacheattn_metadatar5   c                 C   rm   r(   r$   )r)   r   r   r   r   r   r$   r$   r%   forwardK  s   zAttentionLayer.forwardN)	r   r   r   ry   r   r-   floatr   r   r$   r$   r$   r%   r   B  s*   
 



r   c                       s  e Zd ZU eed< eed< eed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< eed< eed< eed< eed< eed< eed<  fddZedddddejdfdededededB dee dB dedB dededB dededB ddfddZe			d1ded ejd!ejd"ejd#ejd$ed%ejdB d&ejdB d'ejdB dejfd(d)Zd2d,d-Zd.ejfd/d0Z  ZS )3r?   	num_headsrE   scaleFcan_return_lse_for_decodesupports_pcp0supports_mtp_with_cp_non_trivial_interleave_sizeneed_to_return_lse_for_decodesupports_quant_query_inputsupports_per_head_quant_scalesr   dcp_rankpcp_world_sizepcp_ranktotal_cp_world_sizetotal_cp_rankc                    s   t  | }zddlm} | j|_| j|_W n ty'   d|_d|_Y nw zddlm	} | j|_
| j|_W n tyI   d|_
d|_Y nw |j
|j |_|j|j |j |_|jdkob|j|_|S )Nr   )get_dcp_groupr7   )get_pcp_group)super__new__vllm.distributed.parallel_stater   
world_sizer   rank_in_groupr   AssertionErrorr   r   r   r   r   r   r   )rL   argsr   r)   r   r   	__class__r$   r%   r   }  s,   



zAttentionImpl.__new__Nr1   rD   alibi_slopessliding_windowrT   logits_soft_capre   kv_sharing_target_layer_namer5   c                 C   r:   r(   r;   )r)   r   rE   r   rD   r  r  rT   r  re   r  r$   r$   r%   r*     s   zAttentionImpl.__init__layerr   r   r   r   r   outputoutput_scaleoutput_block_scalec
           
      C   r:   r(   r;   )
r)   r  r   r   r   r   r   r  r  r	  r$   r$   r%   r     r   zAttentionImpl.forward	quant_keyr   c                 C   r^   )ab  
        Does this attention implementation support fused output quantization.
        This is used by the AttnFusionPass to only fuse output quantization
        onto implementations that support it.

        :param quant_key: QuantKey object that describes the quantization op
        :return: is fusion supported for this type of quantization
        Fr$   )r)   r
  r$   r$   r%   fused_output_quant_supported  rH   z*AttentionImpl.fused_output_quant_supported	act_dtypec                 C   rm   r(   r$   )r)   r  r$   r$   r%   process_weights_after_loading  s   z+AttentionImpl.process_weights_after_loadingNNN)r
  r   )r   r   r   r,   r-   r   r   rx   r   r   r   r   r   r   r   r   r    r{   r}   r*   r   ry   r   r   r   r  rR   r  __classcell__r$   r$   r   r%   r?   U  s   
 

	
		

r?   c                (   @   s   e Zd Ze	d#dededededee dB dedB ded	edB d
ededB dedB dedededededddedB ddf&ddZ	e			d$de
dejdejdejdejdedejdB dejdB d ejdB dejfd!d"ZdS )%MLAAttentionImplNr   rE   r   rD   r  r  rT   r  re   r  q_lora_rankkv_lora_rankqk_nope_head_dimqk_rope_head_dimqk_head_dim
v_head_dim	kv_b_projr   indexerr5   c                 C   r:   r(   r;   )r)   r   rE   r   rD   r  r  rT   r  re   r  r  r  r  r  r  r  r  r  r$   r$   r%   r*     s   zMLAAttentionImpl.__init__r  hidden_states_or_cqkv_c_normedk_per   r   r  r  r	  c
           
      C   r:   r(   r;   )
r)   r  r  r  r  r   r   r  r  r	  r$   r$   r%   r     r   zMLAAttentionImpl.forwardr(   r  )r   r   r   r   r,   r   r{   r}   objectr*   r   ry   r   r   r   r$   r$   r$   r%   r    s    
	
		
r  rT   r5   c                 C   s
   |  dS )Nfp8)
startswith)rT   r$   r$   r%   is_quantized_kv_cache  r+   r  name_prefixattention_backend_clsbuilder_clsc                    s$   | |j  }t||fd fddiS )zN
    Return a new subclass where `get_builder_cls` returns `builder_cls`.
    rA   c                      s    S r(   r$   r$   r"  r$   r%   r      s    z,subclass_attention_backend.<locals>.<lambda>r   r~   )r   r!  r"  namer$   r#  r%   subclass_attention_backend  s   
r&  	overridesc                 C   s   | |j  }t||f|S r(   r$  )r   r!  r'  r%  r$   r$   r%   )subclass_attention_backend_with_overrides  s   
r(  )6abcr   r   dataclassesr   r   enumr   typingr   r   r	   r
   r   r   r   numpyr   ry   typing_extensionsr   vllm.configr   rX   r   !vllm.model_executor.layers.linearr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platforms.interfacer    vllm.v1.attention.backends.utilsr   vllm.v1.kv_cache_interfacer   r}   r   r&   r.   r   r   r   r   r   r   r   r?   r  rx   r  r~   r&  dictr(  r$   r$   r$   r%   <module>   sb   $ j~ r*


