o
    .i'G                     @   sd  d dl mZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZmZmZmZmZmZ d d	lmZ d d
lmZmZ eeZejejfZ G dd deZ!eG dd dZ"G dd dee" Z#G dd deZ$dej%dej&dej%de'ej% fddZ(dej%de)de)dej&de'ej% f
ddZ*	d!dej&de)de)dB de+fdd Z,dS )"    )	dataclass)ClassVarN)_custom_ops)
VllmConfig)init_logger)CpuArchEnumcurrent_platform)AttentionBackendAttentionImplAttentionLayerAttentionMetadataBuilderAttentionTypeCommonAttentionMetadatais_quantized_kv_cache)split_decodes_and_prefills)AttentionSpecCrossAttentionSpecc                   @   s  e Zd ZU dZeed< ejejej	gZ
eeej  ed< edeej fddZedee fddZedefd	d
ZededefddZeded fddZeded fddZe	d dedededededeedf fddZedefddZdS )!CPUAttentionBackendTaccept_output_buffersupported_dtypesreturnc                 C   s   t jt jt jgS N)torchfloat16bfloat16float32cls r   `/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/attention/backends/cpu_attn.pyget_supported_dtypes'   s   z(CPUAttentionBackend.get_supported_dtypesc                 C   s   g dS )N)
    @   P   `   p                  r   r   r   r   r   get_supported_head_sizes+   s   z,CPUAttentionBackend.get_supported_head_sizesc                   C      dS )NCPU_ATTNr   r   r   r   r   get_name/      zCPUAttentionBackend.get_name	attn_typec                 C   s   |t jt jt jt jfv S )zSCPU attention supports decoder,
        encoder-only and encoder-decoder attention.)r   DECODERENCODERENCODER_ONLYENCODER_DECODER)r   r0   r   r   r   supports_attn_type3   s   z&CPUAttentionBackend.supports_attn_typeCPUAttentionBackendImplc                   C      t S r   )r6   r   r   r   r   get_impl_cls>   r/   z CPUAttentionBackend.get_impl_clsCPUAttentionMetadataBuilderc                   C   r7   r   )r9   r   r   r   r   get_builder_clsB   r/   z#CPUAttentionBackend.get_builder_clsauto
num_blocks
block_sizenum_kv_heads	head_sizecache_dtype_str.c                 C   s   d| |||fS )N   r   )r<   r=   r>   r?   r@   r   r   r   get_kv_cache_shapeF   s   z&CPUAttentionBackend.get_kv_cache_shapec                  O   r,   )NFr   )argskwargsr   r   r   use_cascade_attentionP   r/   z)CPUAttentionBackend.use_cascade_attentionN)r;   )__name__
__module____qualname__r   bool__annotations__r   r   r   r   r   r   listdtypeclassmethodr    intr+   staticmethodstrr.   r5   typer8   r:   tuplerB   rE   r   r   r   r   r      sF   
 

	r   c                   @   s   e Zd ZU eed< eed< eed< ejed< eed< ejed< ejed< ejed< ejd	B ed
< dZe	ed< dZ
e	ed< dZeed< d	Zeejd	B  d	B ed< d	Zejd	B ed< d	S )CPUAttentionMetadataisanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappingNscheduler_metadataTcausalFuse_sdpa_prefillr   num_decode_tokenssdpa_attn_maskssdpa_start_loc)rF   rG   rH   rP   rJ   rN   r   Tensorr]   rI   r^   r_   r`   rK   ra   r   r   r   r   rS   U   s   
 



rS   c                
       sV   e Zd Zdedee dedejddf
 fddZ			dd
e
dededefddZ  ZS )r9   kv_cache_speclayer_namesvllm_configdevicer   Nc                    s   t  |||| d| _d }t tvrd}d| _| |d || _|| _|j	}|j
|| _|j
|| _|j| _|j
j| _t|dd| _| jd u rNd| _|jj| _t| j| j| j| _t|t| _d S )NF   Tsliding_window)super__init__r^   r   get_cpu_architecture_CPU_ARCH_PREFER_MIXED_BATCH_init_reorder_batch_thresholdrc   re   parallel_configmodel_configget_num_kv_headsr>   get_num_attention_heads	num_headsr?   head_dimrL   getattrwindow_sizecache_configr=   _get_attn_isarT   
isinstancer   is_cross_attention)selfrc   rd   re   rf   reorder_batch_thresholdro   	__class__r   r   rk   j   s,   


z$CPUAttentionMetadataBuilder.__init__Fcommon_prefix_lencommon_attn_metadata
fast_buildc                 C   s  |j }|j}|j}|j}|j}|j}	|j}
|j}| jrdn|j	}|}d}| j
rX|rX| js.J t|| jdd\}}}}|}||d  | }|	d | }	|d |d  }|
d | }
tj|| j| j| j|	| j||| j| jdd}t| j|||||	|
|||| j
||d}|S )NFr   T)decode_thresholdrequire_uniformrg   )num_reqsrs   r>   rt   rY   rL   rW   r]   sliding_window_sizerT   enable_kv_split)rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   ra   )r   rU   rV   rX   rW   rY   block_table_tensorr[   rz   r]   r^   r|   r   opscpu_attn_get_scheduler_metadatars   r>   rt   rL   rv   rT   rS   )r{   r   r   r   r   rU   rV   rX   rW   rY   r   r[   r]   ra   r_   num_decodesnum_prefillsnum_prefill_tokenssheduler_metadataattn_metadatar   r   r   build   sj   


z!CPUAttentionMetadataBuilder.build)F)rF   rG   rH   r   rK   rP   r   r   rf   rk   rN   r   rI   rS   r   __classcell__r   r   r}   r   r9   i   s,    )r9   c                   @   s   e Zd Zdejddfdededededee dB dedB ded	edB d
ededB de	j
dB ddfddZ			ddede	j
de	j
de	j
de	j
dedB de	j
dB de	j
dB de	j
dB de	j
fddZde	j
de	j
de	j
de	j
ded
ede	j
fddZdS )r6   Nrs   r?   scaler>   alibi_slopesrh   kv_cache_dtypelogits_soft_capr0   kv_sharing_target_layer_namesinksr   c                 C   s
  |
| _ || _|| _t|| _|d ur|	tjtjfv rt	d |d u r%d}|| _
|| _|d ur7tj|tjd}|| _|d u rBd| _n|	tjkrQ|d |d f| _n|d df| _|| _| j| j | _t|rjtd|	| _|| _| jd ur| jjd |ksJ dd S d S )NzbCPU_ATTN does not support logits softcap for ENCODER and ENCODER_ONLY, outputs may be slightly offr   rL   )ri   ri   rg   z'FP8 KV cache is unsupported in CPU_ATTNzLSinks must have the same number of heads as the number of heads in the layer)r   rs   r?   floatr   r   r2   r3   loggerwarning_oncer   r>   r   tensorr   r   rh   r   num_queries_per_kvr   NotImplementedErrorr0   r   shape)r{   rs   r?   r   r>   r   rh   r   r   r0   r   r   r   r   r   rk      sF   



z CPUAttentionBackendImpl.__init__layerquerykeyvaluekv_cacher   outputoutput_scaleoutput_block_scalec
                 C   sp  |dusJ d|dus|	durt d|du r|S |j}
| jtjtjfv rA| |d|
 |d|
 |d|
 |d|
 || jS |d\}}| jdu ra|dura|durat	
|||||j|j |jr| jdu smJ d|j}| |||
 |||
 |||
 |||
 || j |}
|
dkrt	j|d|
 |||d|
 |j|j| j|j| j| j|j| j|j| jd |S )a  Forward pass for CPU attention backend.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, num_kv_heads, block_size, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zJfused output quantization is not yet supported for CPUAttentionBackendImplr   z-Attention sink is unsupported in SDPA prefill)r   	key_cachevalue_cacher   rW   rY   r   r]   r   rh   rZ   softcapr\   s_aux)r   rU   r0   r   r3   r2   _run_sdpa_forwardunbindr   r   cpu_attn_reshape_and_cacher[   rT   r^   r   r_   cpu_attention_with_kv_cacherW   rY   r   r]   r   rh   rZ   r   r\   )r{   r   r   r   r   r   r   r   r   r   rU   r   r   r_   r   r   r   forward	  sv   




	





zCPUAttentionBackendImpl.forwardc              
   C   s  |j }|d u rI| jd urt| j|j|j}n0| jd dks$| jd dkr;|jd us+J t|j| jd | jd |j}nd g|jdd  }||_ |	d|
 d }|	d|
 d }|	d|
 d }| j| jkr}|j| jdd}|j| jdd}|tjk}|j }	tt|D ]_}
||
 }|	|
 }|	|
d  }tjjj|d d d ||d d f |d d d ||d d f |d d d ||d d f |d|o|d u | jdd	|
 d d}||||d d d d f< q|S )	Nr   ri   rg   rA   )dimg        )	attn_mask	dropout_p	is_causalr   )r`   r   _make_alibi_biasrL   ra   rh   rY   _make_sliding_window_biassizemovedimr   r>   rs   repeat_interleaver   r   r1   numpyrangelenr   nn
functionalscaled_dot_product_attentionr   squeeze)r{   r   r   r   r   r   r0   
attn_maskscausal_attnra   imaskstart_qend_qsub_outr   r   r   r   p  sX   	



	z)CPUAttentionBackendImpl._run_sdpa_forward)NNN)rF   rG   rH   r   r1   rN   r   rK   rP   r   rb   rk   r   rS   r   r   r   r   r   r   r6      s    

	

>	

gr6   r   rL   ra   r   c           
      C   s   g }| dd }| }t|D ]c}||d  ||  }tj||d}|d d d f |d d d f  }| jd }|d d d f |ddf}|| d d d d f d tj	d||f|j
dtj jdd}	|||	 | q|S )Nr   rg   r   diagonal)r   r   r   r   aranger   repeatmul_
unsqueeze_emptyrL   fill_inftriu_appendto)
r   rL   ra   attn_biasesseq_numr   seq_lenbiasrs   inf_maskr   r   r   r     s"    

r   left_window_sizeright_window_sizec           	      C   s   g }|  dd }|  } t|D ]8}| |d  | |  }tjd||fd|d}|dkr3tj||d}|dkr?tj|| d}t|}|| q|S )Nr   rg   )
fill_valuerL   ri   r   )	r   r   r   r   fulltriltriulogr   )	ra   r   r   rL   r   r   r   r   r   r   r   r   r     s"   
r   r=   r?   c                 C   sv   |d ur|d dkr|d dkrdS t jj }|r(| t jfv r(|d dkr(dS |d dkr9t tjkr7dS dS dS )Nr!   r      vec16amxneonvec)	r   _C_cpu_is_amx_tile_supportedr   r   rl   r   ARM)rL   r=   r?   supports_amxr   r   r   rx     s    rx   r   )-dataclassesr   typingr   r   vllmr   r   vllm.configr   vllm.loggerr   vllm.platformsr   r   vllm.v1.attention.backendr	   r
   r   r   r   r   r    vllm.v1.attention.backends.utilsr   vllm.v1.kv_cache_interfacer   r   rF   r   X86r   rm   r   rS   r9   r6   rb   rL   rK   r   rN   r   rP   rx   r   r   r   r   <module>   s`   $	6i \

