o
    i                     @   st   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 eG dd dZed	G d
d deZdS )    )	dataclassN)CacheConfig)PluggableLayer)MLAAttention)QuantizationConfigc                   @   s   e Zd ZU dZejjed< ejjed< ejjed< ejjed< ejjdB ed< ejjdB ed< ejjdB ed	< ejjdB ed
< ejjdB ed< ejjdB ed< eed< ej	dB ed< dZ
ejjdB ed< dS )
MLAModuleszModules used in MLA.kv_a_layernorm	kv_b_proj
rotary_embo_projNfused_qkv_a_projkv_a_proj_with_mqaq_a_layernormq_b_projq_projindexer	is_sparsetopk_indices_bufferindexer_rotary_emb)__name__
__module____qualname____doc__torchnnModule__annotations__boolTensorr    r   r   T/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/layers/mla.pyr      s   
 r   multi_head_latent_attentionc                       s   e Zd ZdZ			ddededededed	ed
edB dedededB dedB de	ddf fddZ
	ddejdejdejdB dejfddZ  ZS )MultiHeadLatentAttentionWrappera  Pluggable MLA layer which allows OOT backends to add
    custom implementations of the outer MLA layer (including rope & o_proj).
    Note that currently oot platforms can still use CustomOp.register_oot to
    replace MLA layer entirly, although we use PluggableLayer to register
    this layer now.

    This class takes positions and hidden_states as input.
    The input tensors can either contain prefill tokens or decode tokens.
    The class does the following:

    1. MLA Preprocess.
    2. Perform multi-head attention to prefill tokens and
       multi-query attention to decode tokens separately.
    3. Return the output tensor.
    N hidden_size	num_headsscaleqk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rankmla_modulescache_configquant_configprefixreturnc                    s  t    || _|| _|| _|| | _|| _|| _|| _|| _	|	j
| _
|	j| _|	j| _|	j| _|	j| _|	j| _|	j| _|	j| _|	j| _|	j| _|	j| _|	j| _| jd uret| jds\J | jj| _|	j| _t| j	|| j| j| j| j| j|
|| d| j| j| jd| _|| _d S )Ntopk_tokensz.attn)r%   r&   r'   r(   r)   r*   r+   r-   r.   r/   r	   
use_sparser   )super__init__r$   r'   r(   qk_head_dimr)   r*   r+   r%   r   r   r   r   r   r   r	   r
   r   r   r   indexer_rope_embr   hasattrr1   r   r   mla_attnr/   )selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   	__class__r   r    r4   5   sR   




z(MultiHeadLatentAttentionWrapper.__init__	positionshidden_statesllama_4_scalingc                 C   s  d }d }| j d urH| jd usJ d| jd usJ d| jd us$J d| |d }|j| j | j| j gdd\}}| |}| |d }n | jd usQJ d| jd usZJ d| |d }| |d }|j| j| jgdd\}}	| 	|}
|
d| j| j}|	d	}	| jd ur| ||d
| jd f |	\|d
| jd f< }	| jr| jr| |||| j}|d ur||9 }| j||
|	|jd | j| j fd}| |d S )Nz9fused_qkv_a_proj is required when q_lora_rank is not Nonez6q_a_layernorm is required when q_lora_rank is not Nonez1q_b_proj is required when q_lora_rank is not Noner   )dimz7kv_a_proj_with_mqa is required when q_lora_rank is Nonez+q_proj is required when q_lora_rank is None   .)output_shape)r*   r   r   r   splitr+   r(   r   r   r   viewr%   r5   	unsqueezer
   r'   r   r   r6   r8   shaper)   r   )r9   r<   r=   r>   q_ckv_loraqkv_loraqkv_ck_pekv_c_normed_topk_indicesattn_outr   r   r    forwardq   s`   






z'MultiHeadLatentAttentionWrapper.forward)NNr#   )N)r   r   r   r   intfloatr   r   r   strr4   r   r   rP   __classcell__r   r   r:   r    r"   !   sT    	
@r"   )dataclassesr   r   vllm.configr   vllm.model_executor.custom_opr   $vllm.model_executor.layers.attentionr   'vllm.model_executor.layers.quantizationr   r   registerr"   r   r   r   r    <module>   s   