o
    .iz                 -   @   s  d Z ddlmZ ddlmZ ddlZddlZddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZCmDZDmEZEmFZF ddlGmHZH ddlImJZJ ddlKmLZL ddlMmNZN dZOe)P ZQejRZSe!eTZUdaVdd  ZWe.jXd!e-jYd"e-jYfd#d$ZZd%ej[d&ej[d'ej[d(ej[d)ej\d*e]ej[ej[f fd+d,Z^G d-d. d.Z_G d/d0 d0e9Z`eG d1d2 d2ZaeG d3d4 d4ZbeG d5d6 d6ZceG d7d8 d8ZdeG d9d: d:ZeG d;d< d<e<ee ZfG d=d> d>e;Zg	?	@		A							B	@	Cd^dDej[dEej[dFej[dGej[dHehdIehdJehdKehdLeidMehdNejdB dOeiej\B dB dPeiej\B dB dQeiej\B dB dReiej\B dB dSejdB dTejdB dUejdB dVekdWehdXekd*df,dYdZZle.jXd[e-jYfd\d]ZmdS )_z Attention layer with FlashInfer.    )	dataclass)ClassVarN)"BatchDecodeWithPagedKVCacheWrapper#BatchPrefillWithPagedKVCacheWrapper$BatchPrefillWithRaggedKVCacheWrapper!MultiLevelCascadeAttentionWrapper)_get_range_buf!trtllm_batch_decode_with_kv_cache)"trtllm_batch_context_with_kv_cache)	FP4Tensor)override)envs)CUDAGraphMode
VllmConfigget_current_vllm_config)
CacheDType)get_dcp_group)init_logger)vllm_is_batch_invariant)QuantKeykFp8StaticTensorSymkNvfp4Dynamiccurrent_platformDeviceCapability)tltriton)can_use_trtllm_attentionuse_trtllm_attention)cdiv)is_pin_memory_available)is_strictly_contiguous)AttentionBackendAttentionCGSupportAttentionImplAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)KVCacheLayoutTypeget_dcp_local_seq_lensget_kv_cache_layoutget_per_layer_parametersinfer_global_hyperparameterssplit_decodes_and_prefills)cp_lse_ag_out_rs)merge_attn_states)AttentionSpec)CpuGpuBufferl        c                   C   s    t d u rtjtjtjdda t S )Ncudadtypedevice)trtllm_gen_workspace_buffertorchzerosr   %VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZEuint8 r=   r=   b/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/attention/backends/flashinfer.py _get_trtllm_gen_workspace_bufferL   s
   
r?   K_CACHE_STRIDEKV_CACHE_STRIDEc                 C   sN  t dt j}t dt j}	t |||  |	 t j}
|
dkr'd S |jj}t |}|
| t d| }t | | }|t j| }|| |	 d | t d| }||}t 	|| | t |}|
| | t d| }t | | }|t j| }|| |	 d | | t d| }||}t 	|| | d S )Nr      )
r   
program_idtoint64loadr6   
element_tyarangefloat32store)kv_cache_ptrblock_tables_prefill_ptrblock_table_stridemock_kv_cache_ptrk_scale_ptrv_scale_ptrr@   rA   	batch_idxmock_block_table_idxorig_page_numdequant_dtypek_scale_valoffsetfp8_valsdequantized_valsmock_cache_offsetv_scale_valr=   r=   r>   "_trtllm_prefill_attn_kvfp8_dequantU   sD   





r[   kv_cacheblock_tables_prefillk_scalev_scalerT   returnc              	   C   s   |j \}}| j }|d dksJ |tjtjfv sJ |d |d  |d  }||d  }	|| d |d |d |d |d f}
tj|
|| jd}tjd|| d tj|jd||}||f}t	| | |||||||	 ||fS )NrB            r5   )startendr6   r7   )
shaper9   bfloat16float16emptyr7   rH   int32reshaper[   )r\   r]   r^   r_   rT   
batch_sizenum_of_page_per_tokensk_cache_stridekv_cache_stridenew_smock_kv_cachemock_block_tablegridr=   r=   r>   !trtllm_prefill_attn_kvfp8_dequant   s8   
&

ru   c                    @   s   e Zd Z	ddejdB fddZdejdejdejdejd	ed
edededededededB dejdejdede	f ddZ
dejjdejdejdejdejdejfddZdS ) BatchDCPPrefillWrapperNworkspace_bufferc                 C   s    t |t | _t|t | _d S N)r   r,   _contextr   _new_tokensselfrw   r=   r=   r>   __init__   s   
zBatchDCPPrefillWrapper.__init__qo_indptr_cpupaged_kv_indptr_cpupaged_kv_indicespaged_kv_last_page_len_cpu	page_sizenum_qo_headsdcp_world_sizenum_kv_headshead_dimsm_scalewindow_leftlogits_soft_capq_data_typekv_cache_dtypeprefill_fixed_split_sizedisable_split_kvc                 C   sV   | j j|||||| ||	|d|
||||||d | jj|||||	|	d|
|||d dS )z1Plan the prefill operation with given parameters.F)causalr   r   r   r   kv_data_typefixed_split_sizer   T)	qo_indptr	kv_indptrr   r   head_dim_qkhead_dim_vor   r   r   r   r   N)ry   planrz   )r|   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r=   r=   r>   r      s>   
zBatchDCPPrefillWrapper.planlayerprefill_querykv_cache_permutekeyvalueoutc                 C   s   t  j| dd}| jj|||j|jdd\}}	t||	t  ddd\}
}|dd }| j	j|||dd\}}|dd }t
||
||| |S )	NrB   dimT)r^   r_   
return_lseF)r   is_lse_base_on_er   )r   )r   
all_gather
contiguousry   run_k_scale_float_v_scale_floatr0   	transposerz   r1   )r|   r   r   r   r   r   r   prefill_query_across_dcpoutput_context_tmplse_context_tmpoutput_contextlse_contextoutput_query	lse_queryr=   r=   r>   r      sB   	


zBatchDCPPrefillWrapper.runrx   )__name__
__module____qualname__r9   Tensorr}   intfloatr6   boolr   nnModuler   r=   r=   r=   r>   rv      sf    
	

4rv   c                   @   sh  e Zd ZU dZeed< ejejgZ	e
eej  ed< g dZe
ee  ed< edeeeB  fddZedefd	d
Zeded fddZeded fddZe	d+dedededededeedf fddZe	d,dedeedf fddZededejfdd Zedee fd!d"Zed#edefd$d%Zedefd&d'Z ede!d(B fd)d*Z"d(S )-FlashInferBackendTaccept_output_buffersupported_dtypes)autorg   fp8fp8_e4m3fp8_e5m2supported_kv_cache_dtypesr`   c                   C      g dS )N)       @   r=   r=   r=   r=   r>    get_supported_kernel_block_sizes"  s   z2FlashInferBackend.get_supported_kernel_block_sizesc                   C   s   dS )N
FLASHINFERr=   r=   r=   r=   r>   get_name(     zFlashInferBackend.get_nameFlashInferImplc                   C      t S rx   )r   r=   r=   r=   r>   get_impl_cls,  r   zFlashInferBackend.get_impl_clsFlashInferMetadataBuilderc                   C   r   rx   )r   r=   r=   r=   r>   get_builder_cls0  r   z!FlashInferBackend.get_builder_clsr   
num_blocks
block_sizer   	head_sizecache_dtype_str.c                 C   s   | d|||fS Nra   r=   )r   r   r   r   r   r=   r=   r>   get_kv_cache_shape4  s   z$FlashInferBackend.get_kv_cache_shapeFinclude_num_layers_dimensionc                 C   sV   t  }|dkr| rdS |dkrd}|S |dkr| rdS |dkr#d}|S td| d)	NNHD)rB   r   ra   rb   rc      )r   rB   ra   rb   rc   HND)rB   ra   rc   r   rb   r   )r   rB   rb   ra   rc   zUnknown cache layout format .)r,   
ValueError)r   cache_layoutstride_orderr=   r=   r>   get_kv_cache_stride_order>  s   z+FlashInferBackend.get_kv_cache_stride_orderr   c                 C   s*   | dv rt jS | dkrt jS td|  )N)r   r   r   zUnrecognized FP8 dtype: )r9   float8_e4m3fnfloat8_e5m2r   )r   r=   r=   r>   get_fp8_dtype_for_flashinferS  s
   z.FlashInferBackend.get_fp8_dtype_for_flashinferc                 C   r   )N)r         r=   )clsr=   r=   r>   get_supported_head_sizes\  s   z*FlashInferBackend.get_supported_head_sizes
capabilityc                 C   s   |t ddko|t ddkS )N   r      rB   r   )r   r   r=   r=   r>   supports_compute_capabilitya  s   z-FlashInferBackend.supports_compute_capabilityc                 C   s$   ddl m}m} | du rdS | S )zEFlashInfer supports sinks when TRTLLM attention is available (SM100).r   )force_use_trtllm_attentionsupports_trtllm_attentionF)vllm.utils.flashinferr   r   )r   r   r   r=   r=   r>   supports_sinkg  s   
zFlashInferBackend.supports_sinkNc                 C   s.   ddl m} | }|d ur|jdkrdS d S )Nr   r   
   r   )vllm.platformsr   get_device_capabilitymajor)r   r   r   r=   r=   r>   get_required_kv_cache_layoutw  s
   z.FlashInferBackend.get_required_kv_cache_layout)r   F)#r   r   r   r   r   __annotations__r9   rh   rg   r   r   listr6   r   r   staticmethodr   r)   r   strr   typer   r   tupler   r   r   classmethodr   r   r   r   r*   r   r=   r=   r=   r>   r     sX   
 
	
r   c                   @   s   e Zd ZU dZeeB ed< dS )	FIPrefillz@Metadata for the native FlashInfer prefill pathway (non-TRTLLM).wrapperN)r   r   r   __doc__r   rv   r   r=   r=   r=   r>   r     s   
 r   c                   @   s   e Zd ZU dZeed< dS )FIDecodez?Metadata for the native FlashInfer decode pathway (non-TRTLLM).r   N)r   r   r   r   r   r   r=   r=   r=   r>   r     s   
 r   c                   @   sP   e Zd ZU dZejed< 	 ejed< 	 ejed< ejed< eed< 	 eed< dS )	TRTLLMPrefillz(Metadata for the TRTLLM prefill pathway.block_tablesseq_lenscum_seq_lens_qcum_seq_lens_kv	max_q_lenmax_seq_lenNr   r   r   r   r9   r   r   r   r=   r=   r=   r>   r     s   
 



r   c                   @   s2   e Zd ZU dZejed< 	 ejed< 	 eed< dS )TRTLLMDecodez'Metadata for the TRTLLM decode pathway.r   r   r   Nr  r=   r=   r=   r>   r    s   
 

r  c                   @   s   e Zd ZU eed< 	 ejed< 	 ejed< eed< eed< eed< eed< ee	B dB ed	< 	 e
eB dB ed
< 	 eed< 	 edB ed< dS )FlashInferMetadatanum_actual_tokensslot_mappingr   num_decodesnum_decode_tokensnum_prefillsnum_prefill_tokensNprefilldecodeuse_cascadecascade_wrapper)r   r   r   r   r   r9   r   r6   r   r   r   r  r   r   r=   r=   r=   r>   r    s"   
 

r  c                       s*  e Zd ZU dZeed< dedee de	de
jf fddZe
jd	d
ee
jB de
jdefddZeeded  de	dedefddZdd Zde
jfddZdeeB fddZd.dedefddZdd Zd e j!d!e j!d"e
jd#ed$ede
jfd%d&Z"	d.d'ed(e#d)ede$fd*d+Z%defd,d-Z&  Z'S )/r   rB   reorder_batch_thresholdkv_cache_speclayer_namesvllm_configr7   c                    s  t  |||| |j| _|j| _|j| _d | _d | _d | _t r+d| _	d| _
d| _n	d| _	d| _
d| _|j| _t| jj| jj}|jj}|| }|j}|d urS|jnd}	| jj tjk| _| jr{i | _d|	 | | _| jjd ur{t| j| jj| _zt j| _ t j!| _"|j#j$| _$W n t%y   d| _ d| _"d| _$Y nw | j dk| _&| j'| j(j#| _)| jj*| _*| jj+| _,| jj| _-| jj.| _.| j./drt01| j.| _2n| jj3| jj3ksJ | jj3| _2t4| j)| j*}
|
r|jj5s| j2| _6n| jj3| _6|
| _7| j8d|
d	 d | _9t:t;||t<| _=| j=j>| _>| j=j?| _?| j=j@| _@| j=jA| _A| jAr+|
s+tBd
tCjD o2tE | _F| G|d | _HtIjJ| jHjK| jFd| _L| G|| _M| G|| _N| j,dkritOPdrk|jdksmJ dd S d S d S )Ni   i   TFr   rB   r   )supports_spec_as_decodeFlashInfer backend currently does not support attention sinks, please use trtllm on blackwell or flash attention on earlier GPUs.)
pin_memoryr   d   r   zThere is a bug in FlashInfer block_size 16 head size 256 support. Please avoid this combination by passing --block-size 32 or --block-size 64.)Qsuperr}   cache_configmodel_configattention_config_workspace_buffer_prefill_wrapper_decode_wrapperr   decode_fixed_split_sizer   r   compilation_configr    max_model_lenr  r   scheduler_configmax_num_seqsspeculative_confignum_speculative_tokenscudagraph_modedecode_moder   FULLenable_cuda_graph_decode_wrappers_cudagraph_decode_cudagraph_max_bsmax_cudagraph_capture_sizeminr   
world_sizer   rank_in_groupdcp_rankparallel_configdcp_kv_cache_interleave_sizeAssertionErroruse_dcpget_num_attention_headsr  r   r   r   r   r   cache_dtype
startswithr   r   r   r6   r   !disable_flashinfer_q_quantizationr   use_trtllm_decode_attention_init_reorder_batch_threshold_cascade_wrapperr.   r-   r   global_hyperparametersr   r   r   	has_sinksNotImplementedErrorr   VLLM_USE_V2_MODEL_RUNNERr!   r  _make_bufferpaged_kv_indptrr9   
zeros_likecpupaged_kv_indptr_cpu_bufferr   paged_kv_last_page_lenr   is_device_capability_family)r|   r  r  r  r7   max_num_pages_per_reqmax_num_reqsmax_num_pagesr#  num_spec_tokenscan_use_trtllm	__class__r=   r>   r}     s   
















z"FlashInferMetadataBuilder.__init__r6   sizer6   r`   c                G   s   t ||| j| jddS )NT)r6   r7   r  
with_numpy)r3   r7   r  )r|   r6   rN  r=   r=   r>   r?  u  s   z&FlashInferMetadataBuilder._make_bufferr   c                 C   s(   t |j|j|jd}|rtjS tjS )N)r   r   )r   r  r4  r0  r   r$   UNIFORM_BATCHUNIFORM_SINGLE_TOKEN_DECODE)r   r  r  has_trtllm_supportr=   r=   r>   get_cudagraph_support  s   z/FlashInferMetadataBuilder.get_cudagraph_supportc                 C   s6   | j d u rtj}t rt}tj|tj| jd| _ | j S )Nr5   )	r  r   r;   r   0FLASHINFER_WORKSPACE_BUFFER_SIZE_BATCH_INVARIANTr9   r:   r<   r7   )r|   buffer_sizer=   r=   r>   _get_workspace_buffer  s   

z/FlashInferMetadataBuilder._get_workspace_bufferrw   c                 C   s
   || _ d S rx   )r  r{   r=   r=   r>   set_workspace_buffer  s   
z.FlashInferMetadataBuilder.set_workspace_bufferc                 C   sH   | j d u r| jrt|  d| _ n	t|  t | _ | j d us!J | j S )N)rw   )r  r3  rv   rV  r   r,   r|   r=   r=   r>   _get_prefill_wrapper  s   


z.FlashInferMetadataBuilder._get_prefill_wrapperFrl   use_cudagraphc              	   C   s   |r
| j |d }n| j}|d u rJ|r*| jjd |d  }| jj}| jjd | }nd }d }d }t|  t	 ||||dd}|rG|| j |< |S || _|S )NrB   T)use_cuda_graphpaged_kv_indptr_bufferpaged_kv_indices_bufferpaged_kv_last_page_len_bufferuse_tensor_cores)
r)  getr  r@  gpur   rD  r   rV  r,   )r|   rl   rZ  decode_wrapperr@  r   rD  r=   r=   r>   _get_decode_wrapper  s2   
z-FlashInferMetadataBuilder._get_decode_wrapperc                 C   s$   | j d u rtd|  t | _ | j S r   )r:  r   rV  r,   rX  r=   r=   r>   _get_cascade_wrapper  s
   
z.FlashInferMetadataBuilder._get_cascade_wrappernum_blocks_npseq_lens_npblock_table_tensornum_reqsr   c           
      C   s   t j|t j| jj d|d  d | jjd|d  | jd|d < | jjd|d  }|j| jd|d  dd | jj | }| jjd| }t	|f |||
d|dd || }	t |	dk|dk@ ||	| jj d|< |S )	a=  
        Compute paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len for FlashInfer
        attention.

        Results are stored in self.paged_kv_indptr,
        self.paged_kv_indices, self.paged_kv_last_page_len buffers.

        Returns paged_kv_indices, a GPU tensor with shape [num_actual_pages].
        rB   )r6   r   NTnon_blockingr   i   )
BLOCK_SIZE)npcumsumrj   r@  rB  rC  ra  copy_r   _copy_page_indices_kernelstridewhererD  )
r|   re  rf  rg  rh  r   r@  num_actual_pagesr   paged_kv_last_page_len_npr=   r=   r>   _compute_flashinfer_kv_metadata  s8   
	z9FlashInferMetadataBuilder._compute_flashinfer_kv_metadatacommon_prefix_lencommon_attn_metadata
fast_buildc           1      C   sn  |j }|j}t|| jdd\}}}}	| j}
|j}|j}|j}|j}|j	}|dk}| jdk}t
| j| j|	|| j| j| jd| jj
| j|d}| joK| jdk}|dksR|oW|dkpW|}|dkoa|dkoa|}|s| jrktd| jjsstd| jjs{J d| jj| _t||j| j||||	|d d d d	}| jp|p| }|r|j nd }|d ur| nd }|d ur||
d  |
 nd }| jr|d usJ |dkr||d  ||  }|dd  |d d
  }||d  | ||d < t|| j| j | j!}|r|d usJ ||
 dksJ ||
 }||8 }|p| }|r.|d usJ |d us$J | "|||||
}nd }|r||
 }t#j$d|gt#j%dd} t#j$d|gt#j%dd}!|dd |f }"t#j$|
gt#j%dd}#|d d |d f }||8 }|d ustJ | j&jd d|  }$| j'jd | }%| ( |_)|j)j*| |g|!|$g|"|g|#|%g| j| j| j+| jd| j,| j-| j.| j| j/d |S |dkr|}&||&d  ||&  }|j0d |d ksJ |r||&d  ||&  }'| j&j1|&|d  }(|dd  |d d
  }t2|3 4 })t5||&d  ||&d  |'|(|)|d|_6n| 7 }*| j'j|&| }+|+j0d |ks$J | j&j|&|d  },|,j0d |d ks:J | jrt8|*t9sFJ |*j*d%i d|d|,d|d|+d| jd| jd| jd| jd| j+d| j,d| j-d| j.d| jd| j/d| j:d| j; n+t8|*t<sJ |*j*||,||+| j| j| j+| jd| j,| j-| j.| j| j/| jj| j:| j;d t=|*d |_6|dkr5|r|| dksJ d!t>|d | |d | |d"|_?|S |dk}-| j@o|-o|| jAk}.|}/| B|/|.}0tC|0| j&jd |/d  || j'jd |/ |d |/ | j| j | j| j+| jd#| j,| j-| j.| j| j/| jj| jD| j;d$ tE|0d |_?|S )&NT)decode_thresholdrequire_uniformr   rB   )
is_prefillforce_use_trtllmr<  has_specr  zcWindow left is not the same for all layers. One potential fix is to set disable_sliding_window=TruezFlashInfer backend currently only supports models in which all layers share the same values for the following hyperparameters: `window_left`, `logits_soft_cap`, `sm_scale`.)r  r  r   r  r  r  r	  r  r
  r  r  r  rB  r5   )r   r   r   r   r   r   )r   r   r   r   r   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )	r   r   r   r   r   r   o_data_typer   r   )r   z9TRTLLM decode requires uniform query lengths per request.)r   r   r   NONE)	pos_encoding_moder   r   r   r   r   r}  r   r   r=   )Frh  r  r/   r  r   r   r   rg  query_start_locquery_start_loc_cpur   r   r   r   r5  r   r  r<  r8  r=  r;  has_same_window_leftsr   has_same_all_paramsr  r6   r  r  r3  rB  numpyr+   r/  r1  rt  r9   tensorrj   r@  rD  rd  r  r   r   r   r   r   r   rf   ra  r   maxitemr   r
  rY  
isinstancerv   r   r   r   r   r  r  r(  r*  rc  fast_plan_decoder  r   )1r|   ru  rv  rw  rh  r  r  r  r  r	  r   r   r   rg  r   r~   r  uses_spec_reorderprefill_use_trtllmdecode_use_trtllmall_uses_trtllmis_only_trtllm_decodeattn_metadataneeds_seq_lens_cpuseq_lens_cpurf  re  qo_indptr_prefill_cpuquery_lens_prefill_cpunum_common_kv_blocksneeds_paged_kv_indicesr   shared_qo_indptr_cpushared_kv_page_indptr_cpushared_kv_page_indices_cpushared_kv_last_page_len_cpur   r   prefill_startqo_indptr_prefill_gpupaged_kv_indptr_prefill_gpumax_q_len_prefillprefill_wrapper"paged_kv_last_page_len_prefill_cpupaged_kv_indptr_prefill_cpupure_decoderZ  num_input_tokensrb  r=   r=   r>   build  s  





	







	

	




*

zFlashInferMetadataBuilder.buildc                 O   s   | j j| jjjkrdS dS )NF)r  r6   r  r  )r|   argskwargsr=   r=   r>   use_cascade_attention\  s   z/FlashInferMetadataBuilder.use_cascade_attentionr   )(r   r   r   r  r   r   r2   r   r   r   r9   r7   r}   rj   SymIntr6   r3   r?  r   r   r   r$   rS  rV  r   rW  r   rv   rY  r   rc  rd  rl  ndarrayrt  r(   r  r  r  __classcell__r=   r=   rK  r>   r     s   
  


$
:
  Or   c                   @   s   e Zd ZU dZeed< dejddfdedede	dede
e	 dB d	edB d
ede	dB dededB dejdB ddfddZdefddZdejfddZ			d#dejjdejdejdejdejdedejdB dejdB d ejdB dejfd!d"ZdS )$r   Tcan_return_lse_for_decodeN	num_headsr   scaler   alibi_slopessliding_windowr   r   	attn_typekv_sharing_target_layer_namesinksr`   c                 C   s  || _ || _t|| _|| _|d urtj|tjd}|| _|d u r%d| _	n|d df| _	| j	d ur6| j	d nd| _
|| _|| _|
| _| j | j | _|	tjkrRtdd | _|d urq|jd |krntd| d|jd  d	|| _t||| _t }| jo|jj | _d | _d | _d | _d S )
NrM  )r  r  rB   r   r  zaEncoder self-attention and encoder/decoder cross-attention are not implemented for FlashInferImplzWSinks must have the same number of heads as the number of heads in the layer. Expected z
, but got r   )r  r   r   r  r   r9   r  rI   r  r  r   r   r   r  num_queries_per_kvr'   DECODERr=  r  rf   r   r   support_trtllm_attnr   r  r7  supports_quant_query_input
bmm1_scale
bmm2_scale
o_sf_scale)r|   r  r   r  r   r  r  r   r   r  r  r  r  r=   r=   r>   r}   i  sN   


zFlashInferImpl.__init__	quant_keyc                 C   s   | j o| jdo|ttfv S )Nr   )r  r   r6  r   r   )r|   r  r=   r=   r>   fused_output_quant_supported  s
   

z+FlashInferImpl.fused_output_quant_supported	act_dtypec                 C   s4   | j d ur| j jtjkr| j tj| _ d S d S d S rx   )r  r6   r9   rI   rD   )r|   r  r=   r=   r>   process_weights_after_loading  s   z,FlashInferImpl.process_weights_after_loadingr   queryr   r   r\   r  outputoutput_scaleoutput_block_scalec
           #      C   s4  |dusJ d|du r| dS |j|jks#J d|j d|j | jdu r2|j|j | j | _| jdu r;|j| _t	|j
t}
t	|jt}|du rT|	du sSJ dna|jtks]J d|jdksd|
rk|jdkso|soJ d|jtkr}|	du s|J d	n|jtkr|	dusJ d
ntd|j |jdu r|  |_|jtkr| j|j | _n	|jtkr|j| _|j}| jdu rtjj|||dddf |dddf |j| j|j|j  | j!drt"#| j}|$|}|d| }|d| }|d| }|}|d| }|j%r|j&dusJ |'|j&(|| |S |j)}|j*}t"+ }|j,| }| j-dk}|dkr||d }|j.d |ksGJ |
st	|j
t/sSJ |j
j0}|dus^J |rt	|t1siJ |j2j3| j4kssJ |j2j5| j6p{dksJ |j2j7| jksJ |j2j8rJ |j9j3| j4ksJ |j9j5| j6pdksJ |j9j7| jksJ |j9j8sJ |j(|||||d ||d ||d d nt	|t:sJ |j3| j4ksJ |j5| j6pdksJ |j7| jksJ |j8sJ |j(|||j|j||d d nt	|j
tsJ |; <|j.}t= }|j
j>}|j
j?}t@ dks3J tA|s:J tA|sAJ tA|sHJ tA|sOJ tA|sVJ |jtkrr| jdusdJ tB||d |	||j.d}n| jdu szJ ||d }|jtkr| j!drtC|||j|j |j\}}n|}|}tDd+i d|d|d|d|d|d|j
jEd|j
jFd| jd| jd|jd|j
jGd|j
jHd| j4d | jId!| jd"| |dkr|d| }|j.d |ksJ |st	|jtJsJ |jj0}|dusJ |j3| j4ksJ |j5| j6pdks#J |j7| jks,J |rmtK jL|; d#d$}tM|}tjN|Od|OdftjP|jQd%}|j(|||j|j||d&d' tR||tK d(d)|d|< |S |j(|||j|j|d| d |S t	|jtsJ |; <|j.}t= }|jj>} |jj?}!t@ dksJ tA|sJ tA|sJ tA|sJ tA| sJ tA|!sJ |jtkr| jdusJ tB|d| |	d|j.d}n| jdu sJ |d| }||j dkrd}"n||j }"tS|||| |!|jjF| j| j| j4| jI| j||"d* |S ),aM  Forward pass with FlashInfer.

        Args:
            query: shape = [num_tokens, num_heads, head_size]
            key: shape = [num_tokens, num_kv_heads, head_size]
            value: shape = [num_tokens, num_kv_heads, head_size]
            kv_cache: KV cache tensor with different possible shapes:
                - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
                - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [num_tokens, num_heads * head_size]
        NzOutput tensor must be provided.r   zQuery dtype mismatch: expected z, got z@output_block_scale is not supported when fusion has not happenedz2Query must be FP8 when attn+quant fusion happened.zMust use TRT-LLM attnz8output_block_scale should not be provided for fp8 outputz/output_block_scale is required for nvfp4 outputzUnsupported output dtype: rB   r           )r   )r^   r_   r   r   )datar  scale_start_indexoriginal_shaper  r\   rw   r   r   r   
max_kv_lenr  r  rl   r   r   r   r  r  r   r   r5   T)r^   r_   r   lser   F)r   )r  r\   rw   r   r   r   r  r  r   r  r  r   q_len_per_reqr=   )Tfill_r   r6   r  _q_scale_floatr   r  r  r   r  r
  r   r  r  	FP8_DTYPEr  r  	FP4_DTYPEr   _o_scale_floatrB  r  r  r  r  r9   ops_C_cache_opsreshape_and_cache_flashr  r   _k_scale_v_scaler6  r   r   viewr  r  rn  r   r  r	  r   permuter   rf   r   r   rv   ry   _window_leftr   _logits_soft_capr   	_sm_scale_causalrz   r   r   rk   r?   r   r   r,   r"   r   ru   r
   r   r   r   r   r  r   r   r   
empty_likeri   rN  rI   r7   r0   r	   )#r|   r   r  r   r   r\   r  r  r  r  r  r  r  torch_dtypeoutput_paddedr  r	  r   r   r3  r   r  rw   r]   seq_lens_prefillr   rr   rs   decode_queryrb  
output_tmpr  block_tables_decodeseq_lens_decoder  r=   r=   r>   forward  s  
























	






	


	C
<

zFlashInferImpl.forward)NNN)r   r   r   r  r   r   r'   r  r   r   r   r   r9   r   r}   r   r  r6   r  r   r   r  r  r=   r=   r=   r>   r   f  sr   
 
	

>	
r   r~  r  rh   TF
indptr_cpuindiceslast_page_len_cpur  r   r   r   r   r  r   r   r   r   r}  	data_typer   
rope_scale
rope_thetarj  r   r   c                 C   s  | j r	t| ddr(| ||||||||	|
|||||||||dd|| d| _dS | j s/J dt|}|du r9d}|durJ|du rC|}|du rI|}n|du rPd}|du rV|}t|tr`tt|n|}t|trltt|n|}|| jkr|t	d
|| jt|t| jkrt	d	| jj|dd
 | jj|dd
 t|d d}z3| j| j| j||||||||| j ||d|
g}| jdkr|| || |d | jj| | _W n ty } ztd| |d}~ww |	| _|
| _|| _|| _|| _|| _dS )ag  
    A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for
    cudagraph capture/replay, while the no cudagraph version turns back
    to the original plan.
    using original plan after passing host-side buffers:
    - only host-to-device copy of indptr and last_page_len buffers
    Modifications for cudagraph:
    - only host-to-device copy of indptr and last_page_len buffers.
    - avoid device-to-device copy of indices buffer.

    Part of the code get inspiration from the original plan from FlashInfer repo
    and the implementation of fast_decode_plan for FlashInfer in SGlang repo.
    vllm_first_callTNFzShould be cudagraph only herer  rh   zThe batch size should be fixed in cudagraph mode, the runtime batch size {} mismatches the batch size set during initialization {}zHThe size of indices should be less than or equal to the allocated bufferri  rB   rB  fa2r   zError in tensor core plan: )is_cuda_graph_enabledgetattrr   r  lenr  r   r9   _fixed_batch_sizer   format_paged_kv_indices_buf_paged_kv_indptr_bufrn  _paged_kv_last_page_len_bufr   _float_workspace_buffer_int_workspace_buffer _pin_memory_int_workspace_buffer_backendappend_cached_module
_plan_info	ExceptionRuntimeError_pos_encoding_moder  r  r  _rope_scale_rope_theta)r|   r  r  r  r  r   r   r   r   r  r   r   r   r   r}  r  r   r  r  rj  r   r   rl   qo_indptr_hostr  er=   r=   r>   r    s   (







r  rk  c                 C   s   t d}|||  }t || }t || d }|| }	t d|}
t d|	|D ]#}t j|| |
 ||
 |	k d}t j| | | |
 |||
 |	k d q,d S )Nr   rB   )mask)r   rC   rF   rH   rangerJ   )page_indicesblock_tablerM   cu_num_blocksrk  req_idxrow_ptr	start_idxend_idxr   rV   i	block_idsr=   r=   r>   ro    s   

ro  )r~  r  Nrh   NNNNNNTr  F)nr   dataclassesr   typingr   r  rl  r9   
flashinferr   r   r   r   flashinfer.decoder   r	   flashinfer.prefillr
   flashinfer.utilsr   typing_extensionsr   vllmr   vllm.configr   r   r   vllm.config.cacher   vllm.distributed.parallel_stater   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   r   r   r   r   vllm.platforms.interfacer   vllm.triton_utilsr   r   r   r   r   vllm.utils.math_utilsr    vllm.utils.platform_utilsr!   vllm.utils.torch_utilsr"   vllm.v1.attention.backendr#   r$   r%   r&   r'   r(   r)    vllm.v1.attention.backends.utilsr*   r+   r,   r-   r.   r/   vllm.v1.attention.ops.commonr0   'vllm.v1.attention.ops.merge_attn_statesr1   vllm.v1.kv_cache_interfacer2   vllm.v1.utilsr3   rT  	fp8_dtyper  r<   r  r   loggerr8   r?   jit	constexprr[   r   r6   r   ru   rv   r   r   r   r   r  r  r   r   r   r   r   r   r  ro  r=   r=   r=   r>   <module>   s  $ 		.
%nj%        C	

 