o
    .i\U                     @   s.  d Z ddlmZ ddlmZ ddlZddlmZmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ee&Z'dZ(dZ)eG dd dZ*G dd dee* Z+G dd deZ,G dd deZ-dS )z-High-Performance Triton-only Attention layer.    )	dataclass)ClassVarN)CUDAGraphMode
VllmConfig)
CacheDType)init_logger)QuantKeykFp8StaticTensorSym)current_platform)DeviceCapability)next_power_of_2)AttentionBackendAttentionCGSupportAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)context_attention_fwd)triton_reshape_and_cache_flash)unified_attention)AttentionSpec      c                   @   s  e Zd ZU eed< eed< ejed< eed< ejed< ejed< ejed< eed< eed	< ejed
< ejed< ejed< eed< eed< ejdB ed< ejdB ed< ejdB ed< dZejdB ed< dZ	ejdB ed< dZ
eeeeeef  f dB ed< edejdB fddZdS )TritonAttentionMetadatanum_actual_tokensmax_query_lenquery_start_locmax_seq_lenseq_lensblock_tableslot_mappingseq_threshold_3Dnum_par_softmax_segmentssoftmax_segm_outputsoftmax_segm_maxsoftmax_segm_expsumuse_cascadecommon_prefix_lenNcu_prefix_query_lensprefix_kv_lenssuffix_kv_lensscheduler_metadataprefix_scheduler_metadatamm_prefix_rangereturnc                    sx   j du rdS jjd }jj fddt|D }tdd |D r'dS  fdd|D }tjj|tj	d
dS )	zConvert mm_prefix_range dict to padded tensor for Triton kernel.

        Returns shape: (num_seqs, max_ranges, 2) with 0-padding for empty ranges.
        Empty ranges have start==end==0, which kernel skips via is_valid check.
        Nr   c                    s"   g | ]} j |d gpd gqS )r   r   )r.   get).0iself c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/attention/backends/triton_attn.py
<listcomp>`   s    zBTritonAttentionMetadata.mm_prefix_range_tensor.<locals>.<listcomp>c                 s   s    | ]}|d gkV  qdS )r0   Nr6   r2   rr6   r6   r7   	<genexpr>e   s    zATritonAttentionMetadata.mm_prefix_range_tensor.<locals>.<genexpr>c                    s&   g | ]}t j|t j d ddqS )dtypedevice   )torchtensorint32viewr9   )r>   r6   r7   r8   i   s    )layout)r.   r   shaper>   rangeallrA   nestednested_tensorjaggedto_padded_tensor)r5   num_seqsrange_listsrange_tensorsr6   )r>   r5   r7   mm_prefix_range_tensorP   s"   
	

z.TritonAttentionMetadata.mm_prefix_range_tensor)__name__
__module____qualname__int__annotations__rA   Tensorboolr,   r-   r.   dictlisttuplepropertyrP   r6   r6   r6   r7   r   ,   s.   
 







$r   c                	       sx   e Zd ZU ejZee ed< dede	e
 dedejf fddZded	efd
dZ	ddededed	efddZ  ZS )TritonAttentionMetadataBuilder_cudagraph_supportkv_cache_speclayer_namesvllm_configr>   c                    s  t  |||| |j _|j}||j _||j _|	  _
 jjjtjtjtjfv  _t j  _ jrP jjj}|sEJ dt| fddd _t _t j
}tj j j j|ftj|d _tj j j jftj|d _tj j j jftj|d _d S )Nz3CUDA Graphs enabled but no capture sizes specified.c                    s   t |  j S N)absr"   )xr4   r6   r7   <lambda>   s    z9TritonAttentionMetadataBuilder.__init__.<locals>.<lambda>)keyr<   ) super__init__
block_sizemodel_configget_num_attention_headsparallel_confignum_heads_qget_num_kv_headsnum_heads_kvget_head_sizeheaddimr`   compilation_configcudagraph_moder   FULL_AND_PIECEWISEFULL_DECODE_ONLYFULLdecode_cudagraph_enabledMIN_LAUNCH_GRID_SIZE_2Dr"   cudagraph_capture_sizesminNUM_PAR_SOFTMAX_SEGMENTSr#   r   rA   emptyfloat32r$   r%   r&   )r5   r^   r_   r`   r>   ri   capture_sizesheaddim_padded	__class__r4   r7   rg   v   sX   




z'TritonAttentionMetadataBuilder.__init__common_attn_metadatar/   c                 C   s   |  d|}|jd |S )Nr      )buildr   fill_)r5   r   attn_metadatar6   r6   r7   build_for_cudagraph_capture   s   z:TritonAttentionMetadataBuilder.build_for_cudagraph_captureFr(   
fast_buildc                 C   s  |j }|j}|j}|j}|j}|j}	|j}
|dk}|r@tjd|gtj	| j
d}tj|gtj	| j
d}|j | }|| j
}nd }d }d }d }tdi d|d|d|d|d|d|	d	|
d
|d|d|d|d|d|d| jd| jd| jd| jd| j}|S )Nr   r<   r   r   r   r   r   r    r!   r'   r(   r)   r*   r+   r-   r"   r#   r$   r%   r&   r6   )r   r   r   r   r   block_table_tensorr!   rA   rB   rC   r>   cputor   r"   r#   r$   r%   r&   )r5   r(   r   r   r   r   r   r   r   r   r!   r'   r)   r*   r+   r-   r   r6   r6   r7   r      sv   	
z$TritonAttentionMetadataBuilder.buildF)rQ   rR   rS   r   ALWAYSr]   r   rU   r   rY   strr   rA   r>   rg   r   r   r   rT   rW   r   __classcell__r6   r6   r   r7   r\   s   s4   
 H
r\   c                   @   s  e Zd ZU dZeed< ejejej	gZ
eeej  ed< g dZeee  ed< edeeeB  fddZedefd	d
Zeded fddZe	d/dedededededeedf fddZe	d0dedeedf fddZedefddZeded fddZededefd d!Zedefd"d#Zedefd$d%Z ed&edefd'd(Z!edefd)d*Z"ed+e#defd,d-Z$d.S )1TritonAttentionBackendTaccept_output_buffersupported_dtypes)autobfloat16fp8fp8_e4m3fp8_e5m2supported_kv_cache_dtypesr/   c                   C   s
   t dgS )Nr   )r   r6   r6   r6   r7    get_supported_kernel_block_sizes  s   
z7TritonAttentionBackend.get_supported_kernel_block_sizesc                   C      dS )NTRITON_ATTNr6   r6   r6   r6   r7   get_name     zTritonAttentionBackend.get_nameTritonAttentionImplc                   C      t S ra   )r   r6   r6   r6   r7   get_impl_cls  r   z#TritonAttentionBackend.get_impl_clsr   
num_blocksrh   num_kv_heads	head_sizecache_dtype_str.c                 C   s"   |d dkr
t d| d|||fS )Nr   r   z$Block size must be a multiple of 16.r@   )
ValueError)r   rh   r   r   r   r6   r6   r7   get_kv_cache_shape  s   z)TritonAttentionBackend.get_kv_cache_shapeFinclude_num_layers_dimensionc                 C   s   | rdS dS )N)r   r   r@            )r   r   r@   r   r   r6   )r   r6   r6   r7   get_kv_cache_stride_order&  s   z0TritonAttentionBackend.get_kv_cache_stride_orderc                  O   r   )NFr6   )argskwargsr6   r6   r7   use_cascade_attention3  r   z,TritonAttentionBackend.use_cascade_attentionr\   c                   C   r   ra   )r\   r6   r6   r6   r7   get_builder_cls7  r   z&TritonAttentionBackend.get_builder_clsc                 C   s   |dkS )N    r6   )clsr   r6   r6   r7   supports_head_size;  s   z)TritonAttentionBackend.supports_head_sizec                 C   r   NTr6   r   r6   r6   r7   supports_mm_prefix?  r   z)TritonAttentionBackend.supports_mm_prefixc                 C   r   r   r6   r   r6   r6   r7   supports_sinkC  r   z$TritonAttentionBackend.supports_sink	attn_typec                 C   s   |t jt jt jt jfv S )z-TritonAttention supports all attention types.)r   DECODERENCODERENCODER_ONLYENCODER_DECODER)r   r   r6   r6   r7   supports_attn_typeG  s   z)TritonAttentionBackend.supports_attn_typec                 C   r   r   r6   r   r6   r6   r7   supports_alibi_sqrtQ  r   z*TritonAttentionBackend.supports_alibi_sqrt
capabilityc                 C   r   r   r6   )r   r   r6   r6   r7   supports_compute_capabilityU  r   z2TritonAttentionBackend.supports_compute_capabilityN)r   r   )%rQ   rR   rS   r   rW   rU   rA   float16r   r|   r   r   rY   r=   r   r   staticmethodrT   r   r   r   r   typer   rZ   r   r   r   r   classmethodr   r   r   r   r   r   r   r6   r6   r6   r7   r      sf   
 

	r   c                   @   s  e Zd ZdefddZdejdddfdededed	ed
e	e dB dedB de
dedB dededB dejdB deddfddZ			d"dejjdejdejdejdejdedejdB dejdB dejdB dejfddZdejdejdejdejdedejjdejfd d!ZdS )#r   	quant_keyc                 C   s   |t kS ra   )r	   )r5   r   r6   r6   r7   fused_output_quant_supported[  s   z0TritonAttentionImpl.fused_output_quant_supportedNF	num_headsr   scaler   alibi_slopessliding_windowkv_cache_dtypelogits_soft_capr   kv_sharing_target_layer_namesinksuse_alibi_sqrtr/   c                 C   s   || _ || _t|| _|| _|d urtj|tjd}|| _|d u r%d| _	n|	t
jt
jfv r7|d |d f| _	n|d df| _	|| _|d u rGd}|| _|
| _| j | j | _|	| _t | _|| _|d urv|jd |ksvJ d|j d| d|| _t | _d S )N)r=   )r?   r?   r   r   z[Sinks must have the same number of heads as the number of heads in the layer. Sinks shape: z, num_heads: .)r   r   floatr   r   rA   rB   r|   r   r   r   r   r   r   r   r   num_queries_per_kvr   r
   	fp8_dtyper   rF   r   is_cudasupports_quant_query_input)r5   r   r   r   r   r   r   r   r   r   r   r   r   r6   r6   r7   rg   ^  s>   

zTritonAttentionImpl.__init__layerqueryre   valuekv_cacher   outputoutput_scaleoutput_block_scalec
              	   C   sh  |dusJ d|	durt d|du r|dS |jdu s J |j}
| jtjtjfv rF| |d|
 |d|
 |d|
 |d|
 ||S |	d\}}| j
du r{|dur{|dur{| jdrl|| j}|| j}t|||||j| j|j|j | jdr|j| jkr|| j}|| j}|jdksJ d	|j}|j}|j}|j}|j}|j}|j}|j}|j}|j}|jd d |jd
 f}|j }t!d&i d|d|
 d|d|d|d|
 d|d|d|d|d| j"ddd| j#d| j$d| j%d|d| j&ddd|j'|d|j'|d|d|d |d!|d"|d#| j(d$|d%| |S )'a  Forward pass with Paged Attention impl. in Triton.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: shape =
                [num_blocks, 2, block_size, num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.zRfused block_scale output quantization is not yet supported for TritonAttentionImplr   Fr   r   g      ?z-A non 1.0 q_scale is not currently supported.r@   qkvoutcu_seqlens_qmax_seqlen_q	seqused_kmax_seqlen_ksoftmax_scalecausalTr   r   window_sizer    softcap	q_descale	k_descale	v_descaler"   r#   r$   r%   r&   r   r   r.   r6   ))NotImplementedErrorr   r'   r   r   r   r   r   _forward_encoder_attentionunbindr   r   
startswithrD   r   r   r!   _k_scale_v_scaler=   _q_scale_floatr   r   r   r   r    r"   r#   r$   r%   r&   rF   rP   r   r   r   r   r   r   expandr   )r5   r   r   re   r   r   r   r   r   r   r   	key_cachevalue_cacher   r   r   r   r    r"   r#   r$   r%   r&   descale_shaperP   r6   r6   r7   forward  s   






	
zTritonAttentionImpl.forwardc           
      C   sV   | j dr
td|j}|j}|j}	t|||||||	d| j| jd | jd d |S )a  Forward pass for encoder attention without KV cache.

        Args:
            query: shape = [num_encoder_tokens, num_heads, head_size]
            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
            output: shape = [num_encoder_tokens, num_heads, head_size]
            attn_metadata: Encoder attention metadata
            layer: The attention layer
        r   z3quantization is not supported for encoder attentionFr   r   )r   r   r   ob_start_loc	b_seq_lenmax_input_len	is_causalr   sliding_window_qsliding_window_k)	r   r   r   r   r   r   r   r   r   )
r5   r   re   r   r   r   r   r   r   r   r6   r6   r7   r     s*   z.TritonAttentionImpl._forward_encoder_attention)NNN)rQ   rR   rS   r   r   r   r   rT   r   rY   r   rA   rV   rW   rg   nnModuler   r   r   r6   r6   r6   r7   r   Z  s    
	

:	

 r   ).__doc__dataclassesr   typingr   rA   vllm.configr   r   vllm.config.cacher   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r	   vllm.platformsr
   vllm.platforms.interfacer   vllm.utils.math_utilsr   vllm.v1.attention.backendr   r   r   r   r   r   r   .vllm.v1.attention.ops.triton_prefill_attentionr   4vllm.v1.attention.ops.triton_reshape_and_cache_flashr   .vllm.v1.attention.ops.triton_unified_attentionr   vllm.v1.kv_cache_interfacer   rQ   loggerrw   rz   r   r\   r   r   r6   r6   r6   r7   <module>   s2   $	F [