[Compiled CPython 3.10 bytecode (.pyc) of
/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/backends/flashinfer.py.
The binary payload is not recoverable as source; the summary below keeps only what
survives in the file's string and constant tables.]

Module docstring: "Attention layer with FlashInfer."

Imports referenced by the module: dataclasses, typing, numpy, torch; flashinfer
(BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper,
BatchPrefillWithRaggedKVCacheWrapper, MultiLevelCascadeAttentionWrapper,
trtllm_batch_decode_with_kv_cache, trtllm_batch_context_with_kv_cache, FP4Tensor);
and vllm internals (envs, config, distributed parallel_state, logger, batch_invariant,
quant_utils, platforms, triton_utils, utils.flashinfer, v1 attention
backend/utils/ops, kv_cache_interface, CpuGpuBuffer).

Recoverable module-level structure:

- _get_trtllm_gen_workspace_buffer(): lazily allocates a CUDA uint8 workspace buffer
  sized by envs.VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE.
- _trtllm_prefill_attn_kvfp8_dequant (Triton kernel) and
  trtllm_prefill_attn_kvfp8_dequant(kv_cache, block_tables_prefill, k_scale, v_scale,
  dequant_dtype): appear to dequantize an FP8 KV cache into a temporary bf16/fp16
  cache plus a mock block table for the TRT-LLM prefill path.
- class BatchDCPPrefillWrapper: pairs a paged-KV prefill wrapper (_context) with a
  ragged-KV prefill wrapper (_new_tokens) for decode-context-parallel (DCP) prefill;
  plan() docstring: "Plan the prefill operation with given parameters."
- class FlashInferBackend(AttentionBackend): get_name() returns "FLASHINFER";
  exposes supported dtypes, KV-cache dtypes (including FP8 variants mapped to
  float8_e4m3fn / float8_e5m2), kernel block sizes, head sizes, NHD/HND stride
  orders, and compute-capability checks. supports_sink() docstring: "FlashInfer
  supports sinks when TRTLLM attention is available (SM100)."
- dataclasses FIPrefill / FIDecode: "Metadata for the native FlashInfer
  prefill/decode pathway (non-TRTLLM)."
- dataclasses TRTLLMPrefill / TRTLLMDecode: "Metadata for the TRTLLM prefill/decode
  pathway" (fields block_tables, seq_lens, cum_seq_lens_q, cum_seq_lens_kv,
  max_q_len, max_seq_len).
- dataclass FlashInferMetadata: num_actual_tokens, slot_mapping, the decode/prefill
  split counts, optional prefill/decode sub-metadata, use_cascade, cascade_wrapper.
- class FlashInferMetadataBuilder(AttentionMetadataBuilder): builds per-step
  metadata, chooses between the native FlashInfer and TRT-LLM pathways, and manages
  CUDA-graph decode wrappers. get_cudagraph_support() docstring: support "depends on
  whether we can use TRTLLM attention for decodes, since we can only do
  UNIFORM_SINGLE_TOKEN_DECODE if it is unavailable"; UNIFORM_BATCH is returned only
  if all KV cache specs support TRT-LLM attention. _compute_flashinfer_kv_metadata()
  docstring: "Compute paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len for
  FlashInfer attention. ... Returns paged_kv_indices, a GPU tensor with shape
  [num_actual_pages]." Recoverable diagnostics: TRT-LLM attention is disabled when
  KV transfer (P/D disaggregation) is enabled because it requires strictly
  contiguous KV cache tensors; all layers must share window_left, logits_soft_cap,
  and sm_scale; attention sinks require TRT-LLM on Blackwell (or FlashAttention on
  earlier GPUs); block_size 16 with head size 256 is a known-bad combination
  ("pass --block-size 32 or --block-size 64").
- class FlashInferImpl(AttentionImpl): forward() docstring gives the shapes:
  query [num_tokens, num_heads, head_size], key/value [num_tokens, num_kv_heads,
  head_size], kv_cache either NHD [num_blocks, 2, block_size, num_kv_heads,
  head_size] or HND [num_blocks, 2, num_kv_heads, block_size, head_size], and an
  output of shape [num_tokens, num_heads * head_size]. Encoder self-attention and
  encoder/decoder cross-attention raise NotImplementedError; sinks must have the
  same number of heads as the layer.
- fast_plan_decode(): "A faster version of BatchDecodeWithPagedKVCacheWrapper::plan
  used for cudagraph capture/replay"; it performs only host-to-device copies of the
  indptr and last_page_len buffers and avoids the device-to-device copy of the
  indices buffer, taking inspiration from FlashInfer's original plan and SGLang's
  fast_decode_plan.
- _copy_page_indices_kernel (Triton kernel): gathers each request's page ids from
  the block table into the flat paged_kv_indices buffer.
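The recoverable _compute_flashinfer_kv_metadata docstring names the three paged-KV
buffers that FlashInfer plans against. As a rough illustration of what those buffers
contain, here is a minimal standalone sketch; the function name and the plain Python
loop are illustrative assumptions, not vLLM's implementation, which fills
preallocated CPU/GPU buffers and uses a Triton gather kernel instead.

```python
import torch


def build_paged_kv_metadata(
    block_table: torch.Tensor,  # [num_reqs, max_blocks_per_req], page ids per request
    seq_lens: torch.Tensor,     # [num_reqs], tokens currently held in the KV cache
    page_size: int,
):
    """Hypothetical helper: build paged_kv_indptr / indices / last_page_len."""
    num_reqs = block_table.shape[0]
    # Number of pages each request actually uses (ceil division).
    num_pages = (seq_lens + page_size - 1) // page_size

    # indptr: exclusive prefix sum over per-request page counts, shape [num_reqs + 1].
    paged_kv_indptr = torch.zeros(num_reqs + 1, dtype=torch.int32)
    paged_kv_indptr[1:] = torch.cumsum(num_pages, dim=0)

    # indices: the used pages of every request, concatenated in request order.
    paged_kv_indices = torch.cat(
        [block_table[i, : int(num_pages[i])] for i in range(num_reqs)]
    ).to(torch.int32)

    # last_page_len: tokens in each request's final page
    # (equals page_size when the sequence length is an exact multiple).
    last = seq_lens - (num_pages - 1) * page_size
    paged_kv_last_page_len = torch.where(
        seq_lens > 0, last, torch.zeros_like(last)
    ).to(torch.int32)

    return paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len
```

The layout mirrors a CSR matrix: indptr delimits each request's slice of the flat
indices array, and last_page_len records how full the final page is.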
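For context on the non-TRTLLM decode pathway the module wraps, a minimal usage
sketch of FlashInfer's BatchDecodeWithPagedKVCacheWrapper follows. It assumes a
recent FlashInfer release with the plan()/run() API (keyword names vary between
versions) and uses made-up shapes and page-table values; vLLM instead sizes the
workspace via VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE and drives plan() from its
metadata builder.

```python
import torch
import flashinfer

num_qo_heads, num_kv_heads, head_dim, page_size = 32, 8, 128, 16

# Planning/scheduling scratch space required by the wrapper.
workspace = torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(workspace, "NHD")

# Paged-KV metadata for two requests: request 0 uses pages 0..2 (last page holds
# 7 tokens), request 1 uses pages 3..4 (last page full).
paged_kv_indptr = torch.tensor([0, 3, 5], dtype=torch.int32, device="cuda")
paged_kv_indices = torch.tensor([0, 1, 2, 3, 4], dtype=torch.int32, device="cuda")
paged_kv_last_page_len = torch.tensor([7, 16], dtype=torch.int32, device="cuda")

decode_wrapper.plan(
    paged_kv_indptr,
    paged_kv_indices,
    paged_kv_last_page_len,
    num_qo_heads,
    num_kv_heads,
    head_dim,
    page_size,
    q_data_type=torch.bfloat16,
)

# Decode: one query token per request; KV cache in NHD layout,
# [num_pages, 2, page_size, num_kv_heads, head_dim].
q = torch.randn(2, num_qo_heads, head_dim, dtype=torch.bfloat16, device="cuda")
kv_cache = torch.randn(
    5, 2, page_size, num_kv_heads, head_dim, dtype=torch.bfloat16, device="cuda"
)
output = decode_wrapper.run(q, kv_cache)  # [2, num_qo_heads, head_dim]
```

The prefill side is analogous: BatchPrefillWithPagedKVCacheWrapper additionally
takes a qo_indptr describing how many query tokens each request contributes.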