"""Attention layer with PagedAttention and Triton prefix prefill."""

from dataclasses import dataclass
from typing import ClassVar

import torch

from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    QuantKey, kFp8StaticTensorSym)
from vllm.platforms import current_platform
from vllm.v1.attention.backend import (
    AttentionBackend, AttentionCGSupport, AttentionImpl, AttentionLayer,
    AttentionMetadataBuilder, AttentionType, CommonAttentionMetadata,
    MultipleOf)
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.v1.attention.ops.chunked_prefill_paged_decode import (
    chunked_prefill_paged_decode)
from vllm.v1.attention.ops.paged_attn import PagedAttention
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
    triton_reshape_and_cache_flash)
from vllm.v1.kv_cache_interface import AttentionSpec

logger = init_logger(__name__)


@dataclass
class RocmAttentionMetadata:
    # Number of tokens actually used in this batch (excluding padding).
    num_actual_tokens: int
    max_query_len: int
    query_start_loc: torch.Tensor
    max_seq_len: int
    seq_lens: torch.Tensor
    block_table: torch.Tensor
    slot_mapping: torch.Tensor

    # Cascade attention bookkeeping.
    use_cascade: bool
    common_prefix_len: int
    cu_prefix_query_lens: torch.Tensor | None
    prefix_kv_lens: torch.Tensor | None
    suffix_kv_lens: torch.Tensor | None

    # Optional ahead-of-time scheduler metadata.
    scheduler_metadata: torch.Tensor | None = None
    prefix_scheduler_metadata: torch.Tensor | None = None
class RocmAttentionMetadataBuilder(AttentionMetadataBuilder[RocmAttentionMetadata]):
    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS

    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                 vllm_config: VllmConfig, device: torch.device):
        super().__init__(kv_cache_spec, layer_names, vllm_config, device)

        self.block_size = kv_cache_spec.block_size

        model_config = vllm_config.model_config
        self.num_heads_q = model_config.get_num_attention_heads(
            vllm_config.parallel_config)
        self.num_heads_kv = model_config.get_num_kv_heads(
            vllm_config.parallel_config)
        self.headdim = model_config.get_head_size()

    def build_for_cudagraph_capture(
            self, common_attn_metadata: CommonAttentionMetadata
    ) -> RocmAttentionMetadata:
        attn_metadata = self.build(0, common_attn_metadata)
        # When doing full CUDA graph capture, leaving seq_lens at
        # max_model_len makes capture extremely slow, so use 1 instead.
        attn_metadata.seq_lens.fill_(1)
        return attn_metadata

    def build(self,
              common_prefix_len: int,
              common_attn_metadata: CommonAttentionMetadata,
              fast_build: bool = False) -> RocmAttentionMetadata:
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        max_query_len = common_attn_metadata.max_query_len
        max_seq_len = common_attn_metadata.max_seq_len
        query_start_loc = common_attn_metadata.query_start_loc
        seq_lens = common_attn_metadata.seq_lens
        block_table_tensor = common_attn_metadata.block_table_tensor
        slot_mapping = common_attn_metadata.slot_mapping

        use_cascade = common_prefix_len > 0
        prefix_scheduler_metadata = None
        if use_cascade:
            cu_prefix_query_lens = torch.tensor([0, num_actual_tokens],
                                                dtype=torch.int32,
                                                device=self.device)
            prefix_kv_lens = torch.tensor([common_prefix_len],
                                          dtype=torch.int32,
                                          device=self.device)
            suffix_kv_lens = (common_attn_metadata.seq_lens_cpu -
                              common_prefix_len)
            suffix_kv_lens = suffix_kv_lens.to(self.device)
        else:
            cu_prefix_query_lens = None
            prefix_kv_lens = None
            suffix_kv_lens = None

        attn_metadata = RocmAttentionMetadata(
            num_actual_tokens=num_actual_tokens, max_query_len=max_query_len,
            query_start_loc=query_start_loc, max_seq_len=max_seq_len,
            seq_lens=seq_lens, block_table=block_table_tensor,
            slot_mapping=slot_mapping, use_cascade=use_cascade,
            common_prefix_len=common_prefix_len,
            cu_prefix_query_lens=cu_prefix_query_lens,
            prefix_kv_lens=prefix_kv_lens, suffix_kv_lens=suffix_kv_lens,
            prefix_scheduler_metadata=prefix_scheduler_metadata)
        return attn_metadata
class RocmAttentionBackend(AttentionBackend):
    accept_output_buffer: bool = True
    supported_dtypes: ClassVar[list[torch.dtype]] = [
        torch.float16, torch.bfloat16, torch.float32]

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        return [16, 32, 64]

    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:
        return [32, 64, 96, 128, 160, 192, 224, 256]

    @classmethod
    def validate_head_size(cls, head_size: int) -> None:
        if not cls.supports_head_size(head_size):
            attn_type = cls.__name__.removesuffix("Backend")
            raise ValueError(
                f"Head size {head_size} is not supported by {attn_type}. "
                f"Supported head sizes are: {cls.get_supported_head_sizes()}. "
                "Set --attention-backend=FLEX_ATTENTION to use "
                "FlexAttention backend which supports all head sizes.")

    forward_includes_kv_cache_update: bool = False

    @staticmethod
    def get_name() -> str:
        return "ROCM_ATTN"

    @staticmethod
    def get_impl_cls() -> type["RocmAttentionImpl"]:
        return RocmAttentionImpl

    @staticmethod
    def get_kv_cache_shape(num_blocks: int, block_size: int,
                           num_kv_heads: int, head_size: int,
                           cache_dtype_str: str = "auto") -> tuple[int, ...]:
        if block_size % 16 != 0:
            raise ValueError("Block size must be a multiple of 16.")
        return (2, num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
    def use_cascade_attention(*args, **kwargs) -> bool:
        return False

    @staticmethod
    def get_builder_cls() -> type["RocmAttentionMetadataBuilder"]:
        return RocmAttentionMetadataBuilder
class RocmAttentionImpl(AttentionImpl):
    def fused_output_quant_supported(self, quant_key: QuantKey):
        return quant_key == kFp8StaticTensorSym

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: list[float] | None,
        sliding_window: int | None,
        kv_cache_dtype: str,
        logits_soft_cap: float | None = None,
        attn_type: AttentionType = AttentionType.DECODER,
        kv_sharing_target_layer_name: str | None = None,
        sinks: torch.Tensor | None = None,
    ) -> None:
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_kv_heads
        if alibi_slopes is not None:
            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
        self.alibi_slopes = alibi_slopes
        if sliding_window is None:
            self.sliding_window = (-1, -1)
        else:
            self.sliding_window = (sliding_window - 1, 0)
        self.kv_cache_dtype = kv_cache_dtype
        if logits_soft_cap is None:
            # Setting logits_soft_cap to 0 means no soft cap.
            logits_soft_cap = 0
        self.logits_soft_cap = logits_soft_cap
        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name

        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        RocmAttentionBackend.validate_head_size(head_size)

        if attn_type not in (AttentionType.DECODER,
                             AttentionType.ENCODER_DECODER):
            raise NotImplementedError(
                "Encoder self-attention is not implemented for "
                "RocmAttentionImpl")

        self.fp8_dtype = current_platform.fp8_dtype()

        self.sinks = sinks
        if sinks is not None:
            assert sinks.shape[0] == num_heads, (
                "Sinks must have the same number of heads as the number of "
                f"heads in the layer. Sinks shape: {sinks.shape}, "
                f"num_heads: {num_heads}.")
    def forward(
        self,
        layer: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: FlashAttentionMetadata,
        output: torch.Tensor | None = None,
        output_scale: torch.Tensor | None = None,
        output_block_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Forward pass with FlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
        assert output is not None, "Output tensor must be provided."

        if output_block_scale is not None:
            raise NotImplementedError(
                "fused block_scale output quantization is not yet supported"
                " for RocmAttentionImpl")

        if attn_metadata is None:
            # Profiling run.
            return output

        assert attn_metadata.use_cascade is False

        num_actual_tokens = attn_metadata.num_actual_tokens

        key_cache, value_cache = PagedAttention.split_kv_cache(
            kv_cache, self.num_kv_heads, self.head_size)

        if self.kv_cache_dtype.startswith("fp8"):
            key_cache = key_cache.view(self.fp8_dtype)
            value_cache = value_cache.view(self.fp8_dtype)
            assert layer._q_scale_float == 1.0, (
                "A non 1.0 q_scale is not currently supported.")

        cu_seqlens_q = attn_metadata.query_start_loc
        seqused_k = attn_metadata.seq_lens
        max_seqlen_q = attn_metadata.max_query_len
        max_seqlen_k = attn_metadata.max_seq_len
        block_table = attn_metadata.block_table

        chunked_prefill_paged_decode(
            query=query[:num_actual_tokens],
            key=key[:num_actual_tokens] if key is not None else None,
            value=value[:num_actual_tokens] if value is not None else None,
            output=output[:num_actual_tokens],
            kv_cache_dtype=self.kv_cache_dtype,
            key_cache=key_cache,
            value_cache=value_cache,
            block_table=block_table,
            query_start_loc=cu_seqlens_q,
            seq_lens=seqused_k,
            max_seq_len=max_seqlen_k,
            max_query_len=max_seqlen_q,
            k_scale=layer._k_scale,
            v_scale=layer._v_scale,
            alibi_slopes=self.alibi_slopes,
            sliding_window=self.sliding_window[0],
            sm_scale=self.scale,
            output_scale=output_scale,
            sinks=self.sinks)

        return output

    def do_kv_cache_update(self, layer: AttentionLayer, key: torch.Tensor,
                           value: torch.Tensor, kv_cache: torch.Tensor,
                           slot_mapping: torch.Tensor) -> None:
        key_cache, value_cache = PagedAttention.split_kv_cache(
            kv_cache, self.num_kv_heads, self.head_size)

        block_size = key_cache.shape[3]
        is_pow2 = block_size > 0 and (block_size & (block_size - 1)) == 0
        if is_pow2:
            # Power-of-two block sizes go through the paged-attention
            # cache-write op.
            PagedAttention.write_to_paged_cache(
                key, value, key_cache, value_cache, slot_mapping,
                self.kv_cache_dtype, layer._k_scale, layer._v_scale)
            return
        # Fall back to the Triton reshape-and-cache kernel otherwise.
        triton_reshape_and_cache_flash(
            key, value, key_cache, value_cache, slot_mapping,
            self.kv_cache_dtype, layer._k_scale, layer._v_scale)