o
    .i';                     @   s
  d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZmZmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  e	e!Z"eG dd dZ#G dd dee# Z$G dd deZ%G dd deZ&dS )z>Attention layer with PagedAttention and Triton prefix prefill.    )	dataclass)ClassVarN)
VllmConfig)init_logger)QuantKeykFp8StaticTensorSym)current_platform)AttentionBackendAttentionCGSupportAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)FlashAttentionMetadata)chunked_prefill_paged_decode)PagedAttention)triton_reshape_and_cache_flash)AttentionSpecc                   @   s   e Zd ZU eed< eed< ejed< eed< ejed< ejed< ejed< eed< eed	< ejd
B ed< ejd
B ed< ejd
B ed< d
Zejd
B ed< d
Z	ejd
B ed< d
S )RocmAttentionMetadatanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappinguse_cascadecommon_prefix_lenNcu_prefix_query_lensprefix_kv_lenssuffix_kv_lensscheduler_metadataprefix_scheduler_metadata)
__name__
__module____qualname__int__annotations__torchTensorboolr"   r#    r,   r,   a/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/attention/backends/rocm_attn.pyr   '   s   
 




r   c                	       sx   e Zd ZU ejZee ed< dede	e
 dedejf fddZded	efd
dZ	ddededed	efddZ  ZS )RocmAttentionMetadataBuilder_cudagraph_supportkv_cache_speclayer_namesvllm_configdevicec                    sJ   t  |||| |j| _|j}||j| _||j| _|	 | _
d S N)super__init__
block_sizemodel_configget_num_attention_headsparallel_confignum_heads_qget_num_kv_headsnum_heads_kvget_head_sizeheaddim)selfr0   r1   r2   r3   r8   	__class__r,   r-   r6   H   s   z%RocmAttentionMetadataBuilder.__init__common_attn_metadatareturnc                 C   s0   |  d|}|jd |j  |j  |S )Nr      )buildr   fill_r   zero_query_start_loc_cpu)r@   rC   attn_metadatar,   r,   r-   build_for_cudagraph_captureZ   s
   

z8RocmAttentionMetadataBuilder.build_for_cudagraph_captureFr   
fast_buildc                 C   s   |j }|j}|j}|j}|j}|j}	|j}
|dk}|r@tjd|gtj	| j
d}tj|gtj	| j
d}|j | }|| j
}nd }d }d }d }t||||||	|
||||||d}|S )Nr   )dtyper3   )r   r   r   r   r   r   r   r   r   r   r    r!   r#   )r   r   r   r   r   block_table_tensorr   r)   tensorint32r3   cputor   )r@   r   rC   rL   r   r   r   r   r   rN   r   r   r   r    r!   r#   rJ   r,   r,   r-   rF   k   sJ   z"RocmAttentionMetadataBuilder.build)F)r$   r%   r&   r
   ALWAYSr/   r   r(   r   liststrr   r)   r3   r6   r   r   rK   r'   r+   rF   __classcell__r,   r,   rA   r-   r.   E   s4   
 
r.   c                   @   s  e Zd ZU dZeed< ejejej	gZ
eeej  ed< edeeeB  fddZedee fddZed	edd
fddZedefddZeded fddZe	ddededed	ededeedf fddZedefddZeded fddZd
S ) RocmAttentionBackendTaccept_output_buffersupported_dtypesrD   c                   C      g dS )N)       i   r,   r,   r,   r,   r-    get_supported_kernel_block_sizes   s   z5RocmAttentionBackend.get_supported_kernel_block_sizesc                 C   rZ   )N)r\   @   `                  r,   )clsr,   r,   r-   get_supported_head_sizes   s   z-RocmAttentionBackend.get_supported_head_sizes	head_sizeNc                 C   s:   |  |s| jd}td| d| d|   dd S )NBackendz
Head size z is not supported by z. Supported head sizes are: zd. Set --attention-backend=FLEX_ATTENTION to use FlexAttention backend which supports all head sizes.)supports_head_sizer$   removesuffix
ValueErrorrf   )re   rg   	attn_typer,   r,   r-   validate_head_size   s   
z'RocmAttentionBackend.validate_head_sizec                   C      dS )N	ROCM_ATTNr,   r,   r,   r,   r-   get_name      zRocmAttentionBackend.get_nameRocmAttentionImplc                   C      t S r4   )rr   r,   r,   r,   r-   get_impl_cls   rq   z!RocmAttentionBackend.get_impl_clsauto
num_blocksr7   num_kv_headscache_dtype_str.c                 C   s"   |d dkr
t dd| |||fS )Nr[   r   z$Block size must be a multiple of 16.   )rk   )rv   r7   rw   rg   rx   r,   r,   r-   get_kv_cache_shape   s   z'RocmAttentionBackend.get_kv_cache_shapec                  O   rn   )NFr,   )argskwargsr,   r,   r-   use_cascade_attention   rq   z*RocmAttentionBackend.use_cascade_attentionr.   c                   C   rs   r4   )r.   r,   r,   r,   r-   get_builder_cls   rq   z$RocmAttentionBackend.get_builder_cls)ru   )r$   r%   r&   rX   r+   r(   r)   float16bfloat16float32rY   r   rT   rM   staticmethodr'   r   r]   classmethodrf   rm   rU   rp   typert   tuplerz   r}   r~   r,   r,   r,   r-   rW      sF   
 

rW   c                   @   s   e Zd ZdefddZdejddfdedededed	e	e dB d
edB de
dedB dededB dejdB ddfddZ			ddejjdejdejdejdejdedejdB dejdB dejdB dejfddZdS )rr   	quant_keyc                 C   s   |t kS r4   )r   )r@   r   r,   r,   r-   fused_output_quant_supported   s   z.RocmAttentionImpl.fused_output_quant_supportedN	num_headsrg   scalerw   alibi_slopessliding_windowkv_cache_dtypelogits_soft_caprl   kv_sharing_target_layer_namesinksrD   c                 C   s   || _ || _t|| _|| _|d urtj|tjd}|| _|d u r%d| _	n|d df| _	|| _
|d u r5d}|| _|
| _| j | j | _t| |	tjtjfvrStdt | _|| _|d urr|jd |kstJ d|j d| dd S d S )	N)rM   )r   rE   r   z?Encoder self-attention is not implemented for RocmAttentionImplz[Sinks must have the same number of heads as the number of heads in the layer. Sinks shape: z, num_heads: .)r   rg   floatr   rw   r)   rO   r   r   r   r   r   r   num_queries_per_kvrW   rm   r   DECODERENCODER_DECODERNotImplementedErrorr   	fp8_dtyper   shape)r@   r   rg   r   rw   r   r   r   r   rl   r   r   r,   r,   r-   r6      sB   


zRocmAttentionImpl.__init__layerquerykeyvaluekv_cacherJ   outputoutput_scaleoutput_block_scalec
              
   C   s  |dusJ d|	durt d|du r|dS |jdu s J |j}
t|| j| j\}}| jdu rf|j	d }|dkoC||d @ dk}|rWt
|||||j| j|j|j nt|||||j| j|j|j | jdr|| j}|| j}|jd	ksJ d
|j}|j}|j}|j}|j}tdi d|d|
 d|d|
 d|d|
 d|d|
 d| jd|d|d|d|d|d|d|d|jd|jd| jd| jd d| jd|d| j |S )a  Forward pass with FlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zPfused block_scale output quantization is not yet supported for RocmAttentionImplr   F   rE   fp8g      ?z-A non 1.0 q_scale is not currently supported.r   r   r   r   r   	key_cachevalue_cacher   r   r   r   r   k_scalev_scaler   r   sm_scaler   r   r,   )r   rG   r   r   r   split_kv_cacherw   rg   r   r   write_to_paged_cacher   r   _k_scale_v_scaler   
startswithviewr   _q_scale_floatr   r   r   r   r   r   r   r   r   r   )r@   r   r   r   r   r   rJ   r   r   r   r   r   r   r7   is_pow2cu_seqlens_q	seqused_kmax_seqlen_qmax_seqlen_kr   r,   r,   r-   forward  s   



	

zRocmAttentionImpl.forward)NNN)r$   r%   r&   r   r   r   r   r'   r   rT   rU   r)   r*   r6   nnModuler   r   r,   r,   r,   r-   rr      sn    
	

;	
rr   )'__doc__dataclassesr   typingr   r)   vllm.configr   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   vllm.platformsr   vllm.v1.attention.backendr	   r
   r   r   r   r   r   %vllm.v1.attention.backends.flash_attnr   2vllm.v1.attention.ops.chunked_prefill_paged_decoder    vllm.v1.attention.ops.paged_attnr   4vllm.v1.attention.ops.triton_reshape_and_cache_flashr   vllm.v1.kv_cache_interfacer   r$   loggerr   r.   rW   rr   r,   r,   r,   r-   <module>   s(   $	XD