o
    
۾i                     @   st  d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ ddlm Z  ddl!m"Z" dZ#dZ$e% rddl&m'Z'm(Z( dd Z)dd Z*e(j+de'j,de'j,de'j,de'j,fddZ-dej.dej.dej.d ej.d!ej.d"ej.d#ej.d$ej.d%ej.d&ej.d'e/d(e0d)e1fd*d+Z2e(j+de'j,d,e'j,d-e'j,fd.d/Z3dej.d ej.dej.dej.d0ej.d1e0d"ej.d#ej.fd2d3Z4ee5Z6eG d4d5 d5Z7eG d6d7 d7Z8eG d8d9 d9Z9eG d:d; d;Z:eG d<d= d=Z;eG d>d? d?Z<G d@dA dAee< Z=G dBdC dCeZ>G dDdE dEeZ?dS )Fz)Attention layer with AiterFlashAttention.    )	dataclass)ClassVarN)rocm_aiter_ops)
VllmConfigget_layers_from_vllm_config)init_logger)	Attention)current_platform)cdiv)get_cu_count)AttentionBackendAttentionCGSupportAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)"split_decodes_prefills_and_extends)merge_attn_states)AttentionSpec   i   )tltritonc                 C   s   t d|   t|S )Ni   )minelement_sizer   next_power_of_2)xhead_dim r   \/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/backends/rocm_aiter_fa.py
block_size%   s   r    c                 C   s   t | t S N)r   r   )total_tokensr   r   r   num_programs(   s   r#   DEQUANT	PAGE_SIZECACHE_FORMAT
BLOCK_SIZEc           (      C   s  t d}t d}t d|}||| |
  ||  }||| |
  ||  }t || }t || }t || }|| | }|| }t |||  | t j}|| }|dkr| ||
 | |  ||
 |  ||  }|||
 | |  ||
 |  ||  }t || } t || }!|rt |}"t |	}#| j}$|!j}%| t j|" |$} |!t j|# |%}!t || |  t || |! d S |dkrJ| ||
 | |  || |  ||  }|||
 | |  || |  || | |  ||  }|| | | ||  }&|| }'t ||& } t ||' }!|r8d}"d}#| t j|" } |!t j|# }!t || |  t || |! d S d S )Nr      NHDSHUFFLE      ?)	r   
program_idarangeloadtoint64dtypefloat32store)(key_cache_ptrvalue_cache_ptrkey_ptr	value_ptrblock_table_ptrcu_seqlens_kv_ptrtoken_to_batch_ptrseq_start_ptrk_scale_ptrv_scale_ptr	num_heads	head_sizer   max_block_numr$   r%   r&   r'   token_idhead_idcol_offsetskey_ptr_offsetvalue_ptr_offset	batch_idxbatch_starttoken_startbatch_offsetblock_offsetblock_idslot_idkey_cache_ptr_offsetvalue_cache_ptr_offsetk_regv_regk_scalev_scalek_dtypev_dtypek_reg_offsetv_reg_offsetr   r   r   cp_mha_gather_cache_kernel+   s   








rW   	key_cachevalue_cachekeyvalueblock_tablesk_scalesv_scalescu_seqlens_kvtoken_to_batch
seq_startsdequantkv_cache_layoutr"   c                    s   |dv sJ d|j d }d|   }|| j d ksJ d| j d }| j d   fdd	}t| | |||||||	|| |||d|
|||d
 d S )N)r)   r*   z)kv_cache_layout only support NHD, SHUFFLE         zaWe assume your kv cache layout is [num_blocks, page_size, num_heads, head_dim], but got otherwiser(   c                    s    fS r!   r   )metar>   r"   r   r   <lambda>   s    z%cp_mha_gather_cache.<locals>.<lambda>)r$   r%   r&   r'   )shaper   rW   size)rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   r"   r   r   	page_sizegridr   rh   r   cp_mha_gather_cache   s>   




rn   QUANTIS_FNUZc           !      C   s^  t d}t d}t d|}|| ||  }||	 ||  }t || }|dk r-d S ||
 }||
 }|| | |
 || |
  }||| |
 |  ||  ||  }||| | |  ||  ||  }t | | | }t || | }|rd}d}|jj}|jj} |t j| |}|t j| | }t || | t || | d S )Nr   r(   r+   )	r   r,   r-   r.   type
element_tyr/   r2   r3   )!r6   r7   r4   r5   slot_mapping_ptrr<   r=   r   	k_stride0	v_stride0r    r?   num_kv_headsr'   ro   rp   tidrB   offsetsrc_offset_ksrc_offset_vrL   rK   rJ   
dst_offsetdst_k_shuffle_offsetdst_v_shuffle_offsetk_valv_valrQ   rR   rS   rT   r   r   r    reshape_and_cache_shuffle_kernel   sF   


"r   slot_mappingkv_cache_dtypec                 C   s   |j d }| j \}	}
}|j \}}}	}	d|  }tj||
|| ||g|jdd}tj||
|| ||g|jdd}||}||}d}|drKd}||
f}t| | |||||||| d|d|||
||t	
 tjkd d S )	Nr   re   rg   r1   deviceFfp8T)r'   ro   rp   )rj   r   torchemptyr1   view_as
startswithr   strider	   	fp8_dtypefloat8_e4m3fnuz)rZ   r[   rX   rY   r   r   r]   r^   
num_tokens_rv   r?   
num_blocksr    r   k_cache_templatev_cache_templatenew_key_cachenew_value_cachero   rm   r   r   r    reshape_and_cache_shuffle_triton   sP   





r   c                   @   0   e Zd ZU eed< eed< eed< ejed< dS )!AiterFlashAttentionDecodeMetadatamax_query_lenmin_query_lenmax_seq_lenquery_start_locN__name__
__module____qualname__int__annotations__r   Tensorr   r   r   r   r   /  
   
 r   c                   @   r   )"AiterFlashAttentionPrefillMetadatar   r   r   r   Nr   r   r   r   r   r   7  r   r   c                   @   sP   e Zd ZU ejed< ejed< ejed< ejed< eed< eed< ejed< dS )	AiterChunkSlidingWindowMetadataswa_seqlensswa_cu_seqlensswa_seq_startsswa_token_to_batchswa_max_seqlensswa_total_tokensswa_workspaceN)r   r   r   r   r   r   r   r   r   r   r   r   ?  s   
 



r   c                   @   sx   e Zd ZU ejed< ejed< ejed< ejed< ee ed< ee ed< ejed< eed< ee ed	< ed
B ed< d
S )AiterChunkContextMetadata	workspacecu_seq_lens_chunkchunk_startsr`   seq_totmax_seq_lensseq_lens
num_chunkstotal_token_per_batchNswa_metadata)	r   r   r   r   r   r   listr   r   r   r   r   r   r   J  s   
 




r   c                   @   s8   e Zd ZU eed< eed< eed< ejed< eed< dS )'AiterFlashAttentionChunkPrefillMetadatar   r   r   r   chunk_context_metadataN)r   r   r   r   r   r   r   r   r   r   r   r   r   X  s   
 
r   c                   @   s   e Zd ZU eed< eed< eed< ejed< eed< ejed< ejed< ejed< eed	< eed
< eed< eed< eed< eed< edB ed< edB ed< e	dB ed< e
ed< eed< eed< eeejf dB ed< eeejf dB ed< dS )AiterFlashAttentionMetadatanum_actual_tokensnum_actual_kv_tokensr   r   r   r   r   block_tablenum_decodesnum_decode_tokensnum_prefillsnum_prefill_tokensnum_extendsnum_extend_tokensNdecode_metadataprefill_metadataextend_metadatause_cascadecommon_prefix_lenr"   rQ   rR   )r   r   r   r   r   r   r   r   r   r   booldictstrr   r   r   r   r   a  s.   
 




r   c                	       s   e Zd ZU ejZdZeed< de	de
e dedejf fddZd	efd
dZ	dded	ededdfddZdefddZ  ZS )"AiterFlashAttentionMetadataBuilderr(   reorder_batch_thresholdkv_cache_speclayer_namesvllm_configr   c           	         s2  t  |||| |j| _|j| _|j| _| j| j| _| j| j| _| j	 | _
|j| _d | _d| _t }t| jt}| D ]}t|jtsLJ ||jj qBt|dkrz| }|d urt|d dkrt| jd u sqJ d|| _t|dksZtjdt| j| j
g| jj|d| _tjdgtj | j!d| _"d S )Nr   z@Aiter Flash ATTENTION can only support one valid sliding window!rd   r   r+   )#super__init__model_configparallel_configcache_configget_num_attention_headsnum_heads_qget_num_kv_headsnum_heads_kvget_head_sizeheaddimr    aot_sliding_windowr"   setr   r   r   values
isinstanceimplAiterFlashAttentionImpladdsliding_windowlenpopr   r   _CP_TOKENS_PER_ITER_ROCMr1   extend_workspacetensorfloatr   scale)	selfr   r   r   r   sliding_window_configslayerslayersliding_window_config	__class__r   r   r     s>   z+AiterFlashAttentionMetadataBuilder.__init__common_attn_metadatac                 C   s,   | j j| jjj | _| jd|d}d| _|S )Nr   )r   r   )r   max_model_lenr   scheduler_configmax_num_partial_prefillsr"   build)r   r   resr   r   r   build_for_cudagraph_capture  s   z>AiterFlashAttentionMetadataBuilder.build_for_cudagraph_captureFr   
fast_buildreturnr   c           3      C   s  t || jd}t rH| j dkrH| jjj	drHt
| jt}dd |D d }| jjj| jd j}|d }tj|| j| jgtj| jd| _|\}	}
}}}}|j}|j }|dd  |d d  }d }|	dkrt|d |	   |d |	   |d |	   |jd |	d  d	}d }|dkr||	|
 d  }|j|	|
 d  }t|  |  ||	|
 d    ||d  d	}d }|
dkrst|	|	|
 }|| }|| }|| }d }| j d urkt!||| j d  d }tj"|
d tj#|jd}tj$|d|j%|dd  d
 tj&d|
tj#|jd}t'||}|d  }tj(d|| j| j)f| jj*j%| jd} || }!|  }"|d  }#t+|j,| jdd|j,| jdd|!j,| jdd|j,| jdd|"|#| d}t-|
 }$t.|  |$}%tj&|%tj#d/d0d|
|$ }&t|/d|&|$ }'|'|& j1dd}(tj"|%|
d gtj#dd})tj$|(d|)d d dd f tj#d |)d d df   }*tj&|*tj#dd d d d f }+|+|)d d dd f d d d d d f k},|,j2dd},tj$|,dd}-t3| j4|)j,| jdd|&j,| jdd|(j2dd5 |(jddj65 |(|-j,| jdd|%|)d d df 5 |d
}.|j|	|	|
 d  }|j| }/tj"|
d tj#|/jd}tj$|/d|j%|dd  d
 t7|  |  ||   ||d  |.d}t2| }0|dk}1t8d,i d|j9d|0d|j:d|jd|j;d|jd|j<d|j=d|	d|d |d!|d"|
d#|d$|d%|d&|d'|1d(|d)| j>d*| jd+| j}2|2S )-N)decode_thresholdr(   r   c                 S   s   g | ]}|qS r   r   ).0kr   r   r   
<listcomp>  s    z<AiterFlashAttentionMetadataBuilder.build.<locals>.<listcomp>r   r   r   )r   r   r   r   )dimr1   outrd   T)non_blocking)r   r   r   r   r   r   r   r1   )r   )r1   
pin_memory)r   r   r1   )r   )
r   r   r   r   r   r   r`   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   rQ   rR   r   )?r   r   r   is_shuffle_kv_cache_enabledr   numelr   r   cache_dtyper   r   r   compilation_configstatic_forward_contextkv_cacherj   r   onesr   r    r2   r   query_start_loc_cpur   cpur   maxitemr   r   r   slicer   minimumzerosint32cumsumr1   r-   repeat_interleaver   r   r   r   r/   r   r
   	unsqueezeexpandclampsumr   r   tolistr   r   r   r   r   r   block_table_tensorr   r"   )3r   r   r   r   	split_retr   first_layer_namekv_cache_shaper   r   r   r   r   r   r   r  r   query_lens_cpur   r   query_lens_for_prefillquery_start_loc_devicer   num_extends_slicequery_lens_for_extendseq_lens_for_extendcomputed_kv_lensr   swa_seqlen_for_extendcu_seq_lenstoken_to_seqfetched_shaper   ra   max_seqlen_kr"   max_context_chunkr   r   
chunk_endschunk_seq_lenscu_seq_lens_cpumax_cum_tokens	range_idxidx_to_batch_tensortoken_to_batch_tensorr   seq_lens_devicer   r   attn_metadatar   r   r   r     s  






*



	
z(AiterFlashAttentionMetadataBuilder.buildc                 O      dS )NFr   )r   argskwargsr   r   r   use_cascade_attention  s   z8AiterFlashAttentionMetadataBuilder.use_cascade_attention)F)r   r   r   r   UNIFORM_SINGLE_TOKEN_DECODE_cudagraph_supportr   r   r   r   r   r   r   r   r   r   r   r   r   r   r7  __classcell__r   r   r   r   r     s6   
 -

 Zr   c                   @   s   e Zd ZU dZeed< ejejgZ	e
eej  ed< edeeeB  fddZedee fddZedefd	d
Zeded fddZeded fddZe	ddedededededeedf fddZdS )AiterFlashAttentionBackendTaccept_output_buffersupported_dtypesr   c                   C   s   ddgS )Nre       r   r   r   r   r    get_supported_kernel_block_sizes     z;AiterFlashAttentionBackend.get_supported_kernel_block_sizesc                 C   s   g dS )N)@      r   r   )clsr   r   r   get_supported_head_sizes  r@  z3AiterFlashAttentionBackend.get_supported_head_sizesc                   C   r4  )N
FLASH_ATTNr   r   r   r   r   get_name     z#AiterFlashAttentionBackend.get_namer   c                   C      t S r!   )r   r   r   r   r   get_impl_cls  rG  z'AiterFlashAttentionBackend.get_impl_clsr   c                   C   rH  r!   )r   r   r   r   r   get_builder_cls  rG  z*AiterFlashAttentionBackend.get_builder_clsautor   r    rv   r?   cache_dtype_str.c                 C   s"   |d dkr
t dd| |||fS )Nre   r   z$Block size must be a multiple of 16.rd   )
ValueError)r   r    rv   r?   rL  r   r   r   get_kv_cache_shape  s   z-AiterFlashAttentionBackend.get_kv_cache_shapeN)rK  )r   r   r   r<  r   r   r   float16bfloat16r=  r   r   r1   staticmethodr   r   r?  classmethodrD  r   rF  rq   rI  rJ  tuplerN  r   r   r   r   r;    s8   
 
r;  c                   @   sH  e Zd Zdejdfdededededee dB dedB ded	edB d
ededB ddfddZ	de
dejdejdejdedejdedefddZde
dejdejdejdejdejdejdejdedededejdejdejdejfd d!Z			d(d"ejjdejdejdejd#ejde
dejdB d$ejdB d%ejdB dejfd&d'ZdS ))r   Nr>   r?   r   rv   alibi_slopesr   r   logits_soft_cap	attn_typekv_sharing_target_layer_namer   c                 C   s   || _ || _t|| _|| _|d urtj|tjd}|| _|d u r%d| _	n|d df| _	|| _
|d u r5d}|| _|
| _| j | j dksEJ | j | j | _|	tjtjfvrXtdd S )Nr  )r   r   r(   r           z@Encoder self-attention is not implemented for FlashAttentionImpl)r>   r?   r   r   rv   r   r   r2   rT  r   r   rU  rW  num_queries_per_kvr   DECODERENCODER_DECODERNotImplementedError)r   r>   r?   r   rv   rT  r   r   rU  rV  rW  r   r   r   r     s,   
z AiterFlashAttentionImpl.__init__r3  queryoutputcu_seqlens_qmax_seqlen_qr   rQ   rR   c                 C   s   |j d usJ |j jd usJ |j j}|j}|d usJ |j}|j}|j}|j}|j}|jd |jd }}t	||||||	|
|||| j
dd|d tj|||||||dd| jd| j| jd|d	 d S )
Nr   r(   r   r)   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   r"   rX  TF)qr   vr_  cu_seqlens_kr`  r)  min_seqlen_q	dropout_psoftmax_scalecausalwindow_sizerT  
return_lser   )r   r   r   r   r   r   r   r   r   rn   r   r   r   flash_attn_varlen_funcr   r   rT  )r   r3  r]  rX   rY   r^  r_  r`  r   rQ   rR   chunked_metadatar   r   r   r   r   r   key_fetchedvalue_fetchedr   r   r   extend_for_sliding_window  sZ   

z1AiterFlashAttentionImpl.extend_for_sliding_windowrZ   r[   rX   rY   r)  re  r   c           #      C   s  | j d dkr| |||||||	|||
 d S tj||||||	|	|d| jd| j | jdd\}}|jd us6J |jj}|j}|j	}|j
}|j}|j}|j}|j}|d |d }}d }d }t|D ]b}t||||||||| || || | jdt r~dnd	|| d
 tj|||||| |	|| |d| jd| j | jdd\}} |d u r|}| }q`t|}!t|}"t|!|"|||| d |!}|"}q`t|||||d d S )Nr   r   rX  T)rb  r   rc  r_  rd  r`  r)  re  rf  rg  rh  ri  rT  rj  r(   r   r*   r)   ra  F)r^  
output_lseprefix_output
prefix_lsesuffix_output
suffix_lse)r^  rq  rr  rs  rt  )r   ro  r   rk  r   rT  r   r   r   r   r   r   r   r`   r   rangern   r   r   r  r   
empty_liker   )#r   r3  r]  rZ   r[   rX   rY   r^  r_  r`  r)  re  r   r   rQ   rR   r   lser   r   r   r_   max_seqlensr   r`   r   rm  rn  chunked_outputchunked_lse	chunk_idxsuf_outsuf_lse
tmp_outputtmp_lser   r   r   extend_forward0  s   





z&AiterFlashAttentionImpl.extend_forwardr   r	  output_scaleoutput_block_scalec
           /      C   s  |dusJ d|dus|	durt d|du r|dS |j}
|d\}}| jdr;|t }|t }| j	du rn|durn|durnt
 r\t|||||j| j|j|j ntjj|||||j| j|j|j |d|
 }|dur~|d|
 }|dur|d|
 }|d|
 }|j}|j}|j}|j}|j}|js|dkr|jdusJ ||| d }||| d }||| d }t
j||||jj|jj|jj|jj dd| j!d| j"| j#||| d d	 |dkrD|j$dusJ t%||| }|| }|| }|| }|| }|j}|j}t
 r|j}|j}| j&||||||||j$j|j$j|j$j d|j'|||  |j|||  ||d
 |dkr|j(dusQJ | j"d dkrt
 rbJ dddl)m*} |jd| j+d d |j+d f}|d(i d|d| d|d|d|d| d|jd| ddd|j,d| d|j d| j!ddd| j#d| j"d|j'd| d| j-ddd|j.|d|j.| dS |j(dusJ t
 rG|j+\}} }!}"d |/  }#tj0||!|"|# | |#g|j1d!d"}$tj0||!| |# |"|#g|j1d!d"}%|2|$}&|2|%}'t
j3|d| |&|'|j'd| |j,d| |j'd| 4d|j|j|d| d#	 |S |j+\}(})}"t5|j1j6d$ }*|j,j+d }+|j t7 d t7 },tj0|+|) |, |" |* d|+|) |,  d%  tj8|j9d"}-ddl:}.tjj:;|d| |-|d| ||| j!|j'd| |jd| |j,d| |j | j#| jd&| j-|j|jdt7 |S t d'))a  Forward pass with AiterFlashAttention.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [2, num_blocks, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NOTE: FP8 quantization, flash-attn expect the size of
              {q,k,v}_descale to be (num_sequences, num_kv_heads).
              We use torch's .expand() to avoid duplicating values
        NzOutput tensor must be provided.zEfused output quantization is not yet supported for FlashAttentionImplr   r   r(   rX  T)rb  r   rc  r_  rd  r`  r)  re  rf  rg  rh  ri  rT  r   )r3  r]  rZ   r[   rX   rY   r^  r_  r`  r)  re  r   r   rQ   rR   r   z8Sliding window with shuffle layout is not supported yet.)unified_attentionrd   rb  r   rc  r   r_  r`  	seqused_kr)  rg  rh  rT  ri  r   softcap	q_descale	k_descale	v_descalere   rg   r   )	QKVr\   context_lensblock_tables_stride0K_QScaleV_QScaleout_      r)   z3Cascade attention is not implemented for ROCM AITERr   )<r\  fill_r   unbindr   r   viewr	   r   rW  r   r  r   r   rQ   rR   r   ops_C_cache_opsreshape_and_cache_flash_k_scale_v_scaler   r   r   r   r   r   r   rk  r   r   r   r   r   rT  r   r  r  r   r   "aiter.ops.triton.unified_attentionr  rj   r   rU  r  r   r   r1   r   
pa_fwd_asmr   finfobits_PARTITION_SIZE_ROCMuint8r   aiterpaged_attention_v1)/r   r   r]  rZ   r[   r	  r3  r^  r  r  r   rX   rY   output_actual_tokensr   r   r   r   r   prefill_queryprefill_keyprefill_valueextend_tokens_sliceextend_querysextend_keysextend_valuesextend_outputsrQ   rR   r  descale_shaper   r    rv   r?   r   r   r   r   r   r   r>   nbytes_per_qo_elemnum_seqsmax_num_partitionsworkspace_bufferr  r   r   r   forward  s  

	




	





:


zAiterFlashAttentionImpl.forward)NNN)r   r   r   r   rZ  r   r   r   r   r   r   r   r   ro  r  nnModuler  r   r   r   r   r     s    

	

'	

=	

 	
r   )@__doc__dataclassesr   typingr   r   vllm._aiter_opsr   vllm.configr   r   vllm.loggerr   $vllm.model_executor.layers.attentionr   vllm.platformsr	   vllm.utils.math_utilsr
   vllm.utils.platform_utilsr   vllm.v1.attention.backendr   r   r   r   r   r   r    vllm.v1.attention.backends.utilsr   'vllm.v1.attention.ops.merge_attn_statesr   vllm.v1.kv_cache_interfacer   r  r   is_rocmvllm.triton_utilsr   r   r    r#   jit	constexprrW   r   r   r   r   rn   r   r   r   loggerr   r   r   r   r   r   r   r;  r   r   r   r   r   <module>   s   $	a	

45
5

)  %