"""High-Performance Triton-only Attention layer."""

from dataclasses import dataclass
from typing import ClassVar

import torch

from vllm.config import CUDAGraphMode, VllmConfig
from vllm.config.cache import CacheDType
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    QuantKey,
    kFp8StaticTensorSym,
)
from vllm.platforms import current_platform
from vllm.platforms.interface import DeviceCapability
from vllm.utils.math_utils import next_power_of_2
from vllm.v1.attention.backend import (
    AttentionBackend,
    AttentionCGSupport,
    AttentionImpl,
    AttentionLayer,
    AttentionMetadataBuilder,
    AttentionType,
    CommonAttentionMetadata,
    MultipleOf,
)
from vllm.v1.attention.ops.triton_prefill_attention import context_attention_fwd
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
    triton_reshape_and_cache_flash,
)
from vllm.v1.attention.ops.triton_unified_attention import unified_attention
from vllm.v1.kv_cache_interface import AttentionSpec

logger = init_logger(__name__)
# Tuning knobs for the segmented-softmax ("3D") kernel launch.
# NOTE: the original numeric values of these two constants were not
# preserved in this copy of the module.
MIN_LAUNCH_GRID_SIZE_2D: int = ...
NUM_PAR_SOFTMAX_SEGMENTS: int = ...


@dataclass
class TritonAttentionMetadata:
    num_actual_tokens: int
    max_query_len: int
    query_start_loc: torch.Tensor
    max_seq_len: int
    seq_lens: torch.Tensor
    block_table: torch.Tensor
    slot_mapping: torch.Tensor
    seq_threshold_3D: int
    num_par_softmax_segments: int
    softmax_segm_output: torch.Tensor
    softmax_segm_max: torch.Tensor
    softmax_segm_expsum: torch.Tensor
    use_cascade: bool
    common_prefix_len: int
    cu_prefix_query_lens: torch.Tensor | None
    prefix_kv_lens: torch.Tensor | None
    suffix_kv_lens: torch.Tensor | None
    scheduler_metadata: torch.Tensor | None = None
    prefix_scheduler_metadata: torch.Tensor | None = None
    mm_prefix_range: dict[int, list[tuple[int, int]]] | None = None

    @property
    def mm_prefix_range_tensor(self) -> torch.Tensor | None:
        """Convert mm_prefix_range dict to padded tensor for Triton kernel.

        Returns shape: (num_seqs, max_ranges, 2) with 0-padding for empty ranges.
        Empty ranges have start==end==0, which kernel skips via is_valid check.
        """
        if self.mm_prefix_range is None:
            return None
        num_seqs = self.seq_lens.shape[0]
        device = self.seq_lens.device
        range_lists = [
            self.mm_prefix_range.get(i, [(0, 0)]) or [(0, 0)]
            for i in range(num_seqs)
        ]
        if all(r == [(0, 0)] for r in range_lists):
            return None
        range_tensors = [
            torch.tensor(r, dtype=torch.int32, device=device).view(-1, 2)
            for r in range_lists
        ]
        return torch.nested.nested_tensor(
            range_tensors, layout=torch.jagged
        ).to_padded_tensor(0)
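# A minimal sketch of the padding behaviour above, with hypothetical values.
# Sequence 1 has no multimodal prefix, so its rows pad to (0, 0) and the
# kernel skips them via the is_valid check:
#
#     >>> meta.mm_prefix_range = {0: [(2, 6)], 2: [(1, 3), (5, 9)]}
#     >>> meta.seq_lens = torch.tensor([10, 10, 12])
#     >>> meta.mm_prefix_range_tensor.shape
#     torch.Size([3, 2, 2])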
class TritonAttentionMetadataBuilder(
    AttentionMetadataBuilder[TritonAttentionMetadata]
):
    _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS

    def __init__(
        self,
        kv_cache_spec: AttentionSpec,
        layer_names: list[str],
        vllm_config: VllmConfig,
        device: torch.device,
    ) -> None:
        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
        self.block_size = kv_cache_spec.block_size

        model_config = vllm_config.model_config
        self.num_heads_q = model_config.get_num_attention_heads(
            vllm_config.parallel_config
        )
        self.num_heads_kv = model_config.get_num_kv_heads(
            vllm_config.parallel_config
        )
        self.headdim = model_config.get_head_size()

        self.decode_cudagraph_enabled = (
            vllm_config.compilation_config.cudagraph_mode
            in (
                CUDAGraphMode.FULL_AND_PIECEWISE,
                CUDAGraphMode.FULL_DECODE_ONLY,
                CUDAGraphMode.FULL,
            )
        )
        # Below this many sequences the plain 2D launch grid is too small to
        # fill the GPU, so the segmented-softmax ("3D") kernel is used.
        self.seq_threshold_3D = MIN_LAUNCH_GRID_SIZE_2D // self.num_heads_q
        if self.decode_cudagraph_enabled:
            capture_sizes = vllm_config.compilation_config.cudagraph_capture_sizes
            assert capture_sizes, (
                "CUDA Graphs enabled but no capture sizes specified."
            )
            # Snap the threshold to the nearest captured batch size so the
            # 2D/3D decision is stable under CUDA graph replay.
            self.seq_threshold_3D = min(
                capture_sizes, key=lambda x: abs(x - self.seq_threshold_3D)
            )
        self.num_par_softmax_segments = NUM_PAR_SOFTMAX_SEGMENTS

        # Persistent scratch buffers for the segmented softmax.
        headdim_padded = next_power_of_2(self.headdim)
        self.softmax_segm_output = torch.empty(
            (
                self.seq_threshold_3D,
                self.num_heads_q,
                self.num_par_softmax_segments,
                headdim_padded,
            ),
            dtype=torch.float32,
            device=device,
        )
        self.softmax_segm_max = torch.empty(
            (self.seq_threshold_3D, self.num_heads_q, self.num_par_softmax_segments),
            dtype=torch.float32,
            device=device,
        )
        self.softmax_segm_expsum = torch.empty(
            (self.seq_threshold_3D, self.num_heads_q, self.num_par_softmax_segments),
            dtype=torch.float32,
            device=device,
        )

    def build_for_cudagraph_capture(
        self, common_attn_metadata: CommonAttentionMetadata
    ) -> TritonAttentionMetadata:
        attn_metadata = self.build(0, common_attn_metadata)
        # Setting seq_lens to max_model_len during full graph capture makes
        # capture extremely slow, so fill with 1 instead.
        attn_metadata.seq_lens.fill_(1)
        return attn_metadata
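# Example of the capture-size snap in __init__ above, with hypothetical
# numbers: an initial threshold of 20 and capture sizes [1, 2, 4, 8, 16, 32]
# resolve to 16, the captured batch size closest to the threshold:
#
#     >>> min([1, 2, 4, 8, 16, 32], key=lambda x: abs(x - 20))
#     16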
    def build(
        self,
        common_prefix_len: int,
        common_attn_metadata: CommonAttentionMetadata,
        fast_build: bool = False,
    ) -> TritonAttentionMetadata:
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        max_query_len = common_attn_metadata.max_query_len
        max_seq_len = common_attn_metadata.max_seq_len
        query_start_loc = common_attn_metadata.query_start_loc
        seq_lens = common_attn_metadata.seq_lens
        block_table_tensor = common_attn_metadata.block_table_tensor
        slot_mapping = common_attn_metadata.slot_mapping

        use_cascade = common_prefix_len > 0
        prefix_scheduler_metadata = None
        if use_cascade:
            cu_prefix_query_lens = torch.tensor(
                [0, num_actual_tokens], dtype=torch.int32, device=self.device
            )
            prefix_kv_lens = torch.tensor(
                [common_prefix_len], dtype=torch.int32, device=self.device
            )
            suffix_kv_lens = seq_lens.cpu() - common_prefix_len
            suffix_kv_lens = suffix_kv_lens.to(self.device)
        else:
            cu_prefix_query_lens = None
            prefix_kv_lens = None
            suffix_kv_lens = None

        attn_metadata = TritonAttentionMetadata(
            num_actual_tokens=num_actual_tokens,
            max_query_len=max_query_len,
            query_start_loc=query_start_loc,
            max_seq_len=max_seq_len,
            seq_lens=seq_lens,
            block_table=block_table_tensor,
            slot_mapping=slot_mapping,
            use_cascade=use_cascade,
            common_prefix_len=common_prefix_len,
            cu_prefix_query_lens=cu_prefix_query_lens,
            prefix_kv_lens=prefix_kv_lens,
            suffix_kv_lens=suffix_kv_lens,
            prefix_scheduler_metadata=prefix_scheduler_metadata,
            seq_threshold_3D=self.seq_threshold_3D,
            num_par_softmax_segments=self.num_par_softmax_segments,
            softmax_segm_output=self.softmax_segm_output,
            softmax_segm_max=self.softmax_segm_max,
            softmax_segm_expsum=self.softmax_segm_expsum,
        )
        return attn_metadata
class TritonAttentionBackend(AttentionBackend):
    accept_output_buffer: bool = True
    forward_includes_kv_cache_update: bool = False
    supported_dtypes: ClassVar[list[torch.dtype]] = [
        torch.float16,
        torch.bfloat16,
        torch.float32,
    ]
    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
        "auto",
        "bfloat16",
        "fp8",
        "fp8_e4m3",
        "fp8_e5m2",
    ]

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
        return [MultipleOf(16)]

    @staticmethod
    def get_name() -> str:
        return "TRITON_ATTN"

    @staticmethod
    def get_impl_cls() -> type["TritonAttentionImpl"]:
        return TritonAttentionImpl

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
        cache_dtype_str: str = "auto",
    ) -> tuple[int, ...]:
        if block_size % 16 != 0:
            raise ValueError("Block size must be a multiple of 16.")
        return (num_blocks, 2, block_size, num_kv_heads, head_size)

    @staticmethod
    def get_kv_cache_stride_order(
        include_num_layers_dimension: bool = False,
    ) -> tuple[int, ...]:
        # Contiguous layout, optionally with a leading per-layer dimension.
        if include_num_layers_dimension:
            return (0, 1, 2, 3, 4, 5)
        return (0, 1, 2, 3, 4)

    @staticmethod
    def use_cascade_attention(*args, **kwargs) -> bool:
        return False

    @staticmethod
    def get_builder_cls() -> type["TritonAttentionMetadataBuilder"]:
        return TritonAttentionMetadataBuilder

    @classmethod
    def supports_head_size(cls, head_size: int) -> bool:
        return head_size >= 32

    @classmethod
    def supports_mm_prefix(cls) -> bool:
        return True

    @classmethod
    def supports_sink(cls) -> bool:
        return True

    @classmethod
    def supports_attn_type(cls, attn_type: str) -> bool:
        """TritonAttention supports all attention types."""
        return attn_type in (
            AttentionType.DECODER,
            AttentionType.ENCODER,
            AttentionType.ENCODER_ONLY,
            AttentionType.ENCODER_DECODER,
        )

    @classmethod
    def supports_alibi_sqrt(cls) -> bool:
        return True

    @classmethod
    def supports_compute_capability(cls, capability: DeviceCapability) -> bool:
        return True
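# Example: 1024 blocks of 16 slots for 8 KV heads with head size 128 yield
# the paged layout documented in TritonAttentionImpl.forward() below, with
# K and V stacked along dim 1:
#
#     >>> TritonAttentionBackend.get_kv_cache_shape(1024, 16, 8, 128)
#     (1024, 2, 16, 8, 128)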
class TritonAttentionImpl(AttentionImpl):
    def fused_output_quant_supported(self, quant_key: QuantKey) -> bool:
        return quant_key == kFp8StaticTensorSym

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: list[float] | None,
        sliding_window: int | None,
        kv_cache_dtype: str,
        logits_soft_cap: float | None = None,
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: str | None = None,
        sinks: torch.Tensor | None = None,
        use_alibi_sqrt: bool = False,
    ) -> None:
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_kv_heads
        if alibi_slopes is not None:
            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
        self.alibi_slopes = alibi_slopes
        if sliding_window is None:
            self.sliding_window = (-1, -1)
        elif attn_type in (AttentionType.ENCODER, AttentionType.ENCODER_ONLY):
            # Bidirectional attention: the window extends on both sides.
            self.sliding_window = (sliding_window - 1, sliding_window - 1)
        else:
            # Causal attention: only look back within the window.
            self.sliding_window = (sliding_window - 1, 0)
        self.kv_cache_dtype = kv_cache_dtype
        if logits_soft_cap is None:
            # Setting logits_soft_cap to 0 means no soft cap.
            logits_soft_cap = 0
        self.logits_soft_cap = logits_soft_cap
        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name

        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        self.attn_type = attn_type
        self.fp8_dtype = current_platform.fp8_dtype()
        self.use_alibi_sqrt = use_alibi_sqrt

        if sinks is not None:
            assert sinks.shape[0] == num_heads, (
                "Sinks must have the same number of heads as the number of "
                f"heads in the layer. Sinks shape: {sinks.shape}, "
                f"num_heads: {num_heads}."
            )
        self.sinks = sinks

        self.supports_quant_query_input = current_platform.is_cuda()
    def forward(
        self,
        layer: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: TritonAttentionMetadata,
        output: torch.Tensor | None = None,
        output_scale: torch.Tensor | None = None,
        output_block_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Forward pass with Paged Attention impl. in Triton.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [num_blocks, 2, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
        assert output is not None, "Output tensor must be provided."

        if output_block_scale is not None:
            raise NotImplementedError(
                "fused block_scale output quantization is not yet supported"
                " for TritonAttentionImpl"
            )

        if attn_metadata is None:
            # Profiling run.
            return output

        assert attn_metadata.use_cascade is False

        num_actual_tokens = attn_metadata.num_actual_tokens

        if self.attn_type in (AttentionType.ENCODER, AttentionType.ENCODER_ONLY):
            # Encoder attention runs without the KV cache.
            return self._forward_encoder_attention(
                query[:num_actual_tokens],
                key[:num_actual_tokens],
                value[:num_actual_tokens],
                output[:num_actual_tokens],
                attn_metadata,
                layer,
            )

        key_cache, value_cache = kv_cache.unbind(1)

        if self.kv_cache_dtype.startswith("fp8"):
            if key_cache.dtype != self.fp8_dtype:
                key_cache = key_cache.view(self.fp8_dtype)
                value_cache = value_cache.view(self.fp8_dtype)
            assert layer._q_scale_float == 1.0, (
                "A non 1.0 q_scale is not currently supported."
            )

        cu_seqlens_q = attn_metadata.query_start_loc
        seqused_k = attn_metadata.seq_lens
        max_seqlen_q = attn_metadata.max_query_len
        max_seqlen_k = attn_metadata.max_seq_len
        block_table = attn_metadata.block_table
        seq_threshold_3D = attn_metadata.seq_threshold_3D
        num_par_softmax_segments = attn_metadata.num_par_softmax_segments
        softmax_segm_output = attn_metadata.softmax_segm_output
        softmax_segm_max = attn_metadata.softmax_segm_max
        softmax_segm_expsum = attn_metadata.softmax_segm_expsum

        descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])

        unified_attention(
            q=query[:num_actual_tokens],
            k=key_cache,
            v=value_cache,
            out=output[:num_actual_tokens],
            cu_seqlens_q=cu_seqlens_q,
            max_seqlen_q=max_seqlen_q,
            seqused_k=seqused_k,
            max_seqlen_k=max_seqlen_k,
            softmax_scale=self.scale,
            causal=True,
            alibi_slopes=self.alibi_slopes,
            use_alibi_sqrt=self.use_alibi_sqrt,
            window_size=self.sliding_window,
            block_table=block_table,
            softcap=self.logits_soft_cap,
            q_descale=None,  # Not supported
            k_descale=layer._k_scale.expand(descale_shape),
            v_descale=layer._v_scale.expand(descale_shape),
            seq_threshold_3D=seq_threshold_3D,
            num_par_softmax_segments=num_par_softmax_segments,
            softmax_segm_output=softmax_segm_output,
            softmax_segm_max=softmax_segm_max,
            softmax_segm_expsum=softmax_segm_expsum,
            sinks=self.sinks,
            output_scale=output_scale,
            mm_prefix_range=attn_metadata.mm_prefix_range_tensor,
        )

        return output
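# The per-layer K/V scales are scalar tensors; expand() broadcasts them to
# descale_shape == (num_seqs, num_kv_heads) without copying, one scale per
# (sequence, KV head). A toy sketch with hypothetical values:
#
#     >>> torch.tensor([0.02]).expand(4, 8).shape
#     torch.Size([4, 8])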
    def _forward_encoder_attention(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        output: torch.Tensor,
        attn_metadata: TritonAttentionMetadata,
        layer: AttentionLayer,
    ) -> torch.Tensor:
        """Forward pass for encoder attention without KV cache.

        Args:
            query: shape = [num_encoder_tokens, num_heads, head_size]
            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
            output: shape = [num_encoder_tokens, num_heads, head_size]
            attn_metadata: Encoder attention metadata
            layer: The attention layer
        """
        if self.kv_cache_dtype.startswith("fp8"):
            raise NotImplementedError(
                "quantization is not supported for encoder attention"
            )

        cu_seqlens_q = attn_metadata.query_start_loc
        seqused_k = attn_metadata.seq_lens
        max_seqlen_q = attn_metadata.max_query_len

        context_attention_fwd(
            query,
            key,
            value,
            o=output,
            b_start_loc=cu_seqlens_q,
            b_seq_len=seqused_k,
            max_input_len=max_seqlen_q,
            is_causal=False,
            alibi_slopes=self.alibi_slopes,
            sliding_window_q=self.sliding_window[0],
            sliding_window_k=self.sliding_window[1],
        )
        return output
    def do_kv_cache_update(
        self,
        layer: AttentionLayer,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
    ) -> None:
        if self.attn_type in (AttentionType.ENCODER, AttentionType.ENCODER_ONLY):
            # Encoder attention does not write to the KV cache.
            return
        key_cache, value_cache = kv_cache.unbind(1)
        if self.kv_cache_dtype.startswith("fp8"):
            key_cache = key_cache.view(self.fp8_dtype)
            value_cache = value_cache.view(self.fp8_dtype)
        triton_reshape_and_cache_flash(
            key,
            value,
            key_cache,
            value_cache,
            slot_mapping,
            self.kv_cache_dtype,
            layer._k_scale,
            layer._v_scale,
        )
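# Because forward_includes_kv_cache_update is False for this backend, the
# engine writes new keys/values into the paged cache first and only then
# attends. A hedged sketch of that call order (`impl`, `layer` and the
# tensors below are hypothetical):
#
#     impl.do_kv_cache_update(layer, key, value, kv_cache, slot_mapping)
#     impl.forward(layer, query, key, value, kv_cache, attn_metadata,
#                  output=output)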