o
    پiG                    @  s  d dl mZ 	 d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlZd dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z) erd dlm*Z* d dl+m,Z, e-e.Z/ej01 rej2j3ej4d dej5j6_7e' rd dl8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> G dd deZ?eG dd dZ@eG dd dZAeG dd dZBdaCdaDG dd  d eZEG d!d" d"ZFG d#d$ d$ZGG d%d& d&ZHd0d.d/ZIdS )1    )annotationsN)	dataclass)Enumauto)partial)TYPE_CHECKINGCallableListOptionalUnion)
DllmConfig)envs)AttentionBackend)#create_flashinfer_kv_indices_triton)get_attention_tp_size)AttentionType)SWATokenToKVPoolAllocator)ForwardBatchForwardMode)	SpecInput)get_int_env_varis_flashinfer_availableis_sm100_supportednext_power_of_2)RadixAttention)ModelRunner)dynamoT)"BatchDecodeWithPagedKVCacheWrapper#BatchPrefillWithPagedKVCacheWrapper$BatchPrefillWithRaggedKVCacheWrapperfast_decode_plan)merge_statec                   @  s   e Zd Ze Ze ZdS )WrapperDispatchN)__name__
__module____qualname__r   SLIDING_WINDOWCROSS_ATTENTION r(   r(   b/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/flashinfer_backend.pyr"   8   s    
r"   c                   @  sL   e Zd ZU dZdZded< dZded< dZded< dZded	< dddZ	dS )MultiItemScoringParamsa  Parameters for multi-item scoring in attention computation.

    Used when processing sequences with multiple items separated by delimiters,
    where each item needs specific attention patterns that respect item boundaries.

    Attributes:
        prefix_len_ptr: A uint32 1D tensor indicating the prefix length of each prompt.
                       The tensor size is equal to the batch size.
        token_pos_in_items_ptr: A uint16 1D tensor indicating the token position of each item
                               starting from 0 (delimiter) for each item. For batch size > 1,
                               sequences are concatenated with zero padding to ensure same length.
        token_pos_in_items_len: Zero padding length for token_pos_in_items_ptr to handle
                               batch_size > 1 case. Defines the padded length for each sequence.
        max_item_len_ptr: A uint16 tensor containing the max token length of all items
                         for each prompt in the batch.

    NOptional[torch.Tensor]prefix_len_ptrtoken_pos_in_items_ptrr   inttoken_pos_in_items_lenmax_item_len_ptrreturnboolc                 C  s
   | j duS )z'Check if multi-item scoring is enabled.N)r,   selfr(   r(   r)   
is_enabledV   s   
z!MultiItemScoringParams.is_enabled)r1   r2   )
r#   r$   r%   __doc__r,   __annotations__r-   r/   r0   r5   r(   r(   r(   r)   r*   =   s   
 r*   c                   @  s   e Zd ZU ded< dS )DecodeMetadata(List[BatchDecodeWithPagedKVCacheWrapper]decode_wrappersN)r#   r$   r%   r7   r(   r(   r(   r)   r8   [   s   
 r8   c                   @  s2   e Zd ZU ded< ded< ded< dZded< dS )	PrefillMetadata)List[BatchPrefillWithPagedKVCacheWrapper]prefill_wrappersr2   
use_raggedextend_no_prefixN Optional[MultiItemScoringParams]multi_item_params)r#   r$   r%   r7   rA   r(   r(   r(   r)   r;   `   s
   
 r;   c                      s   e Zd ZdZ				d:d; fddZd<ddZd=ddZ	d>d?ddZd@d&d'ZdAd*d+Z	d,d- Z
	.dBdCd4d5Z	.dBdCd6d7ZdDd8d9Z  ZS )EFlashInferAttnBackendzFlashinfer attention kernels.FNmodel_runnerr   skip_prefillr2   kv_indptr_bufr+   kv_last_page_len_bufinit_new_workspacec           	   	     sV  t    d| _d| _jj| _tj| _| jd u| _	t
jjjt  jt d| _jj| _|| _jj| _jd urJjjrJJ djd urWd| _tj| _njjrcd| _tj| _nd| _d | _djjjv sdjjjv sdjjjv sd	jjjv sd
jjjv rtj d jj!| _"d | _#d | _$d| _%| j"rd| _t&dd| _#t&dd| _$d| _%tj d t'd u rtj( }t)j*|t)j+j,da'|rt)j*tj( t)j+j,d| _-nt'| _-j.j/ |d u r fddt0| jD | _1n| jdksJ |g| _1|d u rt)j2 ft)j3j,d| _4n| jdksJ || _4| js4 fddt0| jD | _5d}t6 rGjj7rEt89d nd}t:| j-d|d| _;g | _<g | _=g | _>t0| jD ]/}|s}| j<?t@| j-d| jd | j=?t@| j-d| jd | j>?tA| j-d| j| jd q^|stB| | _CtD| | _Ed | _Fi | _Gi | _Hi | _Id S )Nfa2)kv_cache_dtypenum_attention_headsnum_kv_headsz=Sliding window and cross attention are not supported together      Qwen2ForCausalLMQwen3ForCausalLMMiMoForCausalLMQwen3VLForConditionalGeneration"Qwen3VLMoeForConditionalGenerationi    FT)SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZEi   (SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZEi   l        dtypedevicec                   &   g | ]}t j d  ft jjdqS rM   rU   torchzerosint32rW   .0_max_bsrC   r(   r)   
<listcomp>       z2FlashInferAttnBackend.__init__.<locals>.<listcomp>c                   rX   rY   rZ   r^   ra   r(   r)   rc      rd   r   zCUTLASS backend is disabled when piecewise cuda graph is enabled due to TMA descriptor initialization issues on B200. Using auto backend instead for stability.cutlassNHD)backend)rg   use_tensor_cores)Jsuper__init__prefill_backenddecode_backendserver_argsmulti_item_scoring_delimiterr   from_server_argsdllm_configis_dllm_modelshould_use_tensor_corerI   model_configrJ   r   get_num_kv_headsdecode_use_tensor_corescontext_lenmax_context_lenrD   is_multimodalsliding_window_sizeis_encoder_decodernum_wrappersr"   r&   dispatch_reasonr'   	hf_configarchitecturesr    SGLANG_FLASHINFER_WORKSPACE_SIZEsetenable_deterministic_inferenceenable_deterministicprefill_split_tile_sizedecode_split_tile_sizedisable_cuda_graph_kv_splitr   global_workspace_buffergetr[   emptyuint8rW   workspace_bufferreq_to_token_poolsizerange	kv_indptronesr]   kv_last_page_len	qo_indptrr   enable_piecewise_cuda_graphloggerwarningr   prefill_wrapper_raggedprefill_wrappers_pagedprefill_wrappers_verifyr:   appendr   r   FlashInferIndicesUpdaterPrefillindices_updater_prefillFlashInferIndicesUpdaterDecodeindices_updater_decodeforward_metadatadecode_cuda_graph_metadataprefill_cuda_graph_metadata draft_extend_cuda_graph_metadata)	r4   rC   rD   rE   rF   rG   global_workspace_sizefmha_backendr`   	__class__ra   r)   rj   s   s  














zFlashInferAttnBackend.__init__forward_batchr   r1   r*   c                   s  | j }|du s|jtjkrt S |j|k}t|dd}t|dd}g g }}d|du s2t|dkr9|jdg}d}t	|D ]t\}	}
||
 }||| }|j
|| }tj|ddd }t|dkr|d }||durp||	 nd }|t|r~| n| ||d t||d dd  }|||  tj}|| |d ||d< ||j
||< |}q?|rtdd	 |D |jj  fd
d|D }|r|st S |jj ttj|tj dtj|ddd@ tjdd |D dddS )u  Process multi-item scoring tensors for FlashInfer attention.

        This method handles sequences containing multiple "items" separated by delimiter tokens,
        where each item needs specific attention patterns that respect item boundaries.

        The method produces four key tensors for FlashInfer:
        - prefix_len_ptr: uint32 tensor with prefix length for each prompt in batch
        - token_pos_in_items_ptr: uint16 tensor with token positions starting from 0 at delimiters
        - token_pos_in_items_len: padding length for batch processing
        - max_item_len_ptr: uint16 tensor with max item length for each prompt

        Args:
            forward_batch: The forward batch containing input sequences and delimiter info

        Returns:
            MultiItemScoringParams: The processed multi-item scoring parameters

        Examples:
            Following FlashInfer definition: for 3 items of length 3, 2, 4 respectively:
            token_pos_in_items_ptr = [0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0]

            Case 1: Single sequence
            Text: "What is the capital of France? <delim> London <delim> Paris <delim> Berlin <delim>"
            Tokens: [What, is, the, capital, of, France, ?, <delim>, London, <delim>, Paris, <delim>, Berlin, <delim>]
            Indices: [ 0,   1,  2,   3,      4,  5,     6,   7,     8,      9,     10,    11,    12,     13]
            - prefix_len_ptr: [7] (query length before first delimiter)
            - token_pos_in_items_ptr: [0, 1, 0, 1, 0, 1, 0] (delim=0, London=1, delim=0, Paris=1, delim=0, Berlin=1, delim=0)
            - token_pos_in_items_len: 7 (actual length)
            - max_item_len_ptr: [1] (max item length is 1 token - all options are single tokens)

            Case 2: Batch processing (batch_size=2)
            Sequence 1: 2 items of length 2, 1 → [0, 1, 2, 0, 1, 0] (6 elements)
            Sequence 2: 3 items of length 1, 3, 2 → [0, 1, 0, 1, 2, 3, 0, 1, 2, 0] (10 elements)
            After padding both to length 10:
            - token_pos_in_items_ptr: [0, 1, 2, 0, 1, 0, 0, 0, 0, 0,    0, 1, 0, 1, 2, 3, 0, 1, 2, 0]
            - token_pos_in_items_len: 10 (padded length for batch processing)
            - max_item_len_ptr: [2, 3] (max lengths per sequence)
        Nextend_prefix_lensextend_seq_lensr   rM   T)as_tuplec                 s  s    | ]}|  V  qd S N)numelr_   tr(   r(   r)   	<genexpr>  s    zDFlashInferAttnBackend._process_multi_item_scoring.<locals>.<genexpr>c              
     s0   g | ]}t |t j|  t j d gqS )rU   )r[   catr\   r   uint16r   rW   r/   r(   r)   rc     s    
zEFlashInferAttnBackend._process_multi_item_scoring.<locals>.<listcomp>rU   diml    c                 S  s$   g | ]}| tj  tjqS r(   )tor[   r]   maxr   r   r(   r(   r)   rc     s    )r,   r-   r/   r0   )rn   forward_moder   DECODEr*   	input_idsgetattrlenr   	enumerate	positionsr[   nonzeror   	is_tensoritemcummaxr   r   r   rW   tensoruint32r   stack)r4   r   	delimiterdelimiter_maskprefix_cache_lensr   r,   r-   	seq_startiseq_lenseq_endmaskposdelimiter_indicesfirst_delim
prefix_lendiff	token_posr(   r   r)   _process_multi_item_scoring.  sh   *

$
z1FlashInferAttnBackend._process_multi_item_scoringc                 C  sp  |j  r$| jj|j|j|j|j| j|j	|j
| jdd	 t| j| _d S |j  rI| jj|j|j|j|jd | jd|j	|j
d	 t| jdd| _d S |j  rn| jj|j|j|j|jd | jd|j	|j
d	 t| jdd| _d S |j}| jsy| jd ur~d}d}n
| j }t|j }t }| jd ur| |}| jj|j|j|j|j|| j||j	d | j|d t| j|||| _d S )NFr:   encoder_lens	spec_infofixed_split_sizedisable_split_kvprefix_lensr=   r>   r   r   )r=   r>   r   r   r   rA   )r   is_decode_or_idler   updatereq_pool_indicesseq_lensseq_lens_cpuseq_lens_sumr:   r   r   r   r8   r   is_draft_extendr   r   r;   is_target_verifyr   r   rx   rn   r   anyextend_prefix_lens_cpur*   r   r   )r4   r   r   r>   r?   rA   r(   r(   r)   init_forward_metadata  s   







z+FlashInferAttnBackend.init_forward_metadatarb   r.   max_num_tokenskv_indices_bufc                   s   |d u rt j|| j ft jdd n|  g fddt| jd D  | _t| jD ]}t| j| dkr=d| j| d< q+| jsbt j|| j t j	dd| _
dd | jD | _dd | jD | _d S d S )	NcudarU   c                   s   g | ]}   qS r(   cloner^   cuda_graph_kv_indicesr(   r)   rc     s    z?FlashInferAttnBackend.init_cuda_graph_state.<locals>.<listcomp>rM   r   c                 S     g | ]}|  qS r(   r   r_   xr(   r(   r)   rc         c                 S  r   r(   r   r   r(   r(   r)   rc     r   )r[   r\   rw   r]   r   r{   r   r   rD   r   cuda_graph_custom_maskr   cuda_graph_qk_indptrcuda_graph_qo_indptr)r4   rb   r   r   r   r(   r   r)   init_cuda_graph_state  s.   

z+FlashInferAttnBackend.init_cuda_graph_statebs
num_tokensr   torch.Tensorr   r   r   r   r   Optional[SpecInput]c                 C  s6  |  rgg }t| jD ]%}	|t| jd| jd| j| j|	 d |d  | j	|	 | j
d | d q|  }
| jj||| |
|||d | jd	 || j|< t|| _t| jD ]}	tt||	 ||	 _qXd S | rg }t| jD ]9}	|t| jdd| j| j|	 d |d  | j|	 d |d  | j	|	 | j
d | | j| j|	 d |d  d
 qr|  }
| jj||| |
d |d||d	 || j|< t|dd| _d S |  r0g }t| jD ]-}	|t| jd| jd| j|	 d |d  | j|	 d |d  | j	|	 | j
d | d	 q|  }
| jj||| |
d |d||d	 || j|< t|dd| _d S |! rg }t| jD ].}	|t| jd| jd| j|	 d |d  | j|	 d |d  | j	|	 | j
d | d	 q<|  }
| jj||| |
|| j"j# |d|d d	 || j|< t|dd| _d S t$d
|)Nrf   TrM   )rg   use_cuda_graphrh   paged_kv_indptr_bufferpaged_kv_indices_bufferpaged_kv_last_page_len_bufferr   )r   rg   qo_indptr_bufpaged_kv_indptr_bufpaged_kv_indices_bufpaged_kv_last_page_len_bufcustom_mask_bufmask_indptr_bufFr   )rg   r   r   r   r   r   zInvalid mode: forward_mode=)%r   r   r{   r   r   r   rl   ru   r   r   r   sumr   r   r   cpur   r   r8   r   r   r    begin_forwardr   r   rk   r   r   r   r   r   r;   r   is_dllm_extendrp   
block_size
ValueError)r4   r   r   r   r   r   r   r   r:   r   r   r=   r(   r(   r)   (init_forward_metadata_capture_cuda_graph!  s   








z>FlashInferAttnBackend.init_forward_metadata_capture_cuda_graphr   r   c	           	      C  s  |  r5| jj|d | |d | |d ur|d | nd || j| |d ur+|d | nd |d | jd	 d S | ri| jj|d | |d | |d urP|d | nd |d | j| d|d urb|d | nd |d	 d S | r| jj|d | |d | |d ur|d | nd |d | j| d|d ur|d | nd |d	 d S |	 r| jj|d | |d | |d ur|d | nd ||| j
j | j| d|d ur|d | nd d d	 d S td)Nr   Fr   TzInvalid forward mode)r   r   r   r   r   r   r   r   r   r  rp   r  r  )	r4   r   r   r   r   r   r   r   r   r(   r(   r)   'init_forward_metadata_replay_cuda_graph  sb   












z=FlashInferAttnBackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS NrM   r(   r3   r(   r(   r)   !get_cuda_graph_seq_len_fill_value  s   z7FlashInferAttnBackend.get_cuda_graph_seq_len_fill_valueTqkvlayerr   c              
   C  s@  | j j| | }|js|jn|j}|j}	| }| j jsb|d ur6|d us'J |r6|j	
|||||j|j |j|d|j|j|j	|j|j |j| j jrU| j j sX|jnd|	|j|jd}
n|d u r||d u r||j	|jd }|j	|jd }d}|js|jtjkrd}| js|jtjkrd}| j jr| jj|d|j|j|d|j|j|d|j |j||j|	d}
nK| jsd}| jj!|d|j|j|d|j|j|d|j |j||j|	d\}}|j!|d|j|j|j	|jd|j|	d\}}t"||||\}
}|r|j	
|||||j|j |
d|j|j S )N)causalsm_scalewindow_leftlogits_soft_capk_scalev_scaler   rM   TF)r  r  r  )#r   r=   _get_wrapper_idxis_cross_attentionout_cache_locencoder_out_cache_loc	logit_cap
contiguousr>   token_to_kv_poolset_kv_bufferr  r  forwardviewtp_q_head_numhead_dimget_kv_bufferlayer_idscalingrA   r5   ry   k_scale_floatv_scale_float	attn_typer   ENCODER_ONLYrq   r?   r   tp_k_head_numtp_v_head_numforward_return_lser!   )r4   r  r  r  r  r   save_kv_cacheprefill_wrapper_paged	cache_locr  or  o1s1o2s2r`   r(   r(   r)   forward_extend  s   	




z$FlashInferAttnBackend.forward_extendc           
      C  s   | j j| | }|js|jn|j}|d ur+|d usJ |r+|j|||||j|j	 |j
| d|j|j|j|j|j|j|j|jd}	|	d|j|j S )Nr  )r  r  r  r  )r   r:   r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r  r%  r&  )
r4   r  r  r  r  r   r,  decode_wrapperr.  r/  r(   r(   r)   forward_decode^  s,   	
z$FlashInferAttnBackend.forward_decodec                 C  sF   | j dkrdS | jtjkr|jdkS | jtjkr|jS td| j )NrM   r   r  zUnknown dispatch reason: )r{   r|   r"   r&   ry   r'   r  r  )r4   r  r(   r(   r)   r    s   

z&FlashInferAttnBackend._get_wrapper_idx)FNNF)
rC   r   rD   r2   rE   r+   rF   r+   rG   r2   )r   r   r1   r*   r   r   r   )rb   r.   r   r.   r   r+   )r   r.   r   r.   r   r   r   r   r   r+   r   r   r   r   )r   r.   r   r   r   r   r   r.   r   r+   r   r   r   r   r   r+   )T)
r  r   r  r   r  r   r  r   r   r   )r  r   )r#   r$   r%   r6   rj   r   r   r   r  r  r
  r4  r6  r  __classcell__r(   r(   r   r)   rB   p   s*     
<
z[
" 
>
y&rB   c                   @  sh   e Zd Zd+ddZ		d,d-ddZ		d,d-ddZ		d,d-ddZ		d,d-ddZ	 		d.d/d)d*ZdS )0r   rC   r   attn_backendrB   c                 C  s   |j jt  | _|j t | _|j j| _|j| _|j	| _
|j| _|| _|j| _|j| _|jj| _|j| _| jjtjkrB| j| _d S | jjtjkrO| j| _d S | jjdksWJ | j| _d S r	  )rs   rJ   r   num_qo_headsrt   rK   r!  rI   	data_typerV   q_data_typery   r9  r   r   r   req_to_tokentoken_to_kv_pool_allocatorr|   r"   r&   update_sliding_windowr   r'   update_cross_attentionr{   update_single_wrapperr4   rC   r9  r(   r(   r)   rj     s(   

z'FlashInferIndicesUpdaterDecode.__init__Nr   r   r   r   r+   r   r.   r:   r9   r   r   r   r   Optional[int]r   Optional[bool]c
           
      C     t  r   NotImplementedError
r4   r   r   r   r   r:   r   r   r   r   r(   r(   r)   r     s   z%FlashInferIndicesUpdaterDecode.updatec
           
      C  s6   |p| j }| j|d |||| jd d ||||	d
 d S )Nr   )r   r   )r:   call_begin_forwardr   rH  r(   r(   r)   rA    s   

z4FlashInferIndicesUpdaterDecode.update_single_wrapperc
                 C  s   | j d usJ tdD ]U}
|
dkr;tj|| j d d}|d ur0tj|| j d d}|  }n|  }|| }n|}|}|}d }|
dkoLt| jt}| j	||
 |||| j
|
 ||||d	 qd S )NrL   r   rM   )r   )r   use_sliding_window_kv_pool)ry   r   r[   clampr  r   
isinstancer>  r   rI  r   )r4   r   r   r   r   r:   r   r   r   r   
wrapper_idpaged_kernel_lens_tmpseq_lens_cpu_tmppaged_kernel_lens_sum_tmpkv_start_idx_tmprJ  r(   r(   r)   r?    s@   



z4FlashInferIndicesUpdaterDecode.update_sliding_windowc
                 C  sb   t dD ]*}
|
dkr|}|}n|}t|}|  }| j||
 |||| j|
 |||d qd S )NrL   r   )r   )r   r[   
zeros_liker  r   rI  r   )r4   r   r   r   r   r:   r   r   r   r   rM  paged_kernel_lenskv_start_idxr(   r(   r)   r@  	  s$   
z5FlashInferIndicesUpdaterDecode.update_cross_attentionFwrapperr   rS  paged_kernel_lens_sumr   rT  rJ  r2   c                 C  s  |d u r@t |}tj|dd|d|d < |d |d  }|jr$|j}n	tj|tjdd}t|f | j|||||| jj	d  n|j
|j}}|j	d d }|	rb|d }| j|d | |d |< d}|d urtd u rd}tj|d	d
adtd< tj|ddtd|d < t|jdo|jjtk}|r|j||| jd | | j| j| jd| j| jd|
|d ur|ndtd n!|j||| jd | | j| j| jd| j| jd|
|d ur|ndd |rd ad S d S )Nr   r   rM   r   rU   r  FTr  )rW   func)r;  r<  non_blockingr   r   global_override_indptr_cpu)r;  r<  rX  r   r   )r   r[   cumsumis_cuda_graph_enabled_paged_kv_indices_bufr   r]   r   r=  shaper   
kv_indicesr>  translate_loc_from_full_to_swarY  
empty_likehasattrr  rW  r    r   r:  rK   r!  r;  r<  )r4   rU  r   rS  rV  r   rT  r   r   rJ  r   r   r   r^  kv_last_indexlocally_overridewrapper_uses_fast_decode_planr(   r(   r)   rI  +  s   




z1FlashInferIndicesUpdaterDecode.call_begin_forwardrC   r   r9  rB   NN)r   r   r   r   r   r+   r   r.   r:   r9   r   r+   r   r   r   rC  r   rD  FNN)rU  r   r   r   rS  r   rV  r.   r   r   rT  r   r   r   r   r+   rJ  r2   r   rC  r   rD  	r#   r$   r%   rj   r   rA  r?  r@  rI  r(   r(   r(   r)   r     s$    
&#;,r   c                   @  sf   e Zd Zd0ddZ	d1d2ddZ		d3d4ddZ		d3d4dd Z		d3d4d!d"Z	#		d5d6d.d/ZdS )7r   rC   r   r9  rB   c                 C  s   |j jt  | _|j t | _|j j| _|j| _|j	| _
|j| _|| _|j| _|j| _|j| _|jj| _|j| _|j| _| jjtjkrJ| j| _d S | jjtjkrW| j| _d S | jjdks_J | j| _d S r	  )rs   rJ   r   r:  rt   rK   r!  rI   r;  rV   r<  ry   r9  r   r   r   r   r=  r>  r   r|   r"   r&   r?  r   r'   r@  r{   rA  rB  r(   r(   r)   rj     s,   

z(FlashInferIndicesUpdaterPrefill.__init__Nr   r   r   r   r+   r   r.   r   r=   r<   r>   r2   r   r   r   r   rC  c                 C  rE  r   rF  )r4   r   r   r   r   r   r=   r>   r   r   r   r(   r(   r)   r     s   z&FlashInferIndicesUpdaterPrefill.updaterA   r@   c                 C  sZ   |r|}|   }n|}|}| j| j|d |||||d | jd | jd ||	|
|d d S )Nr   )r   rA   )r  r   rI  r   r   r   )r4   r   r   r   r   r   r=   r>   r   r   r   rA   rS  rV  r(   r(   r)   rA    s*   
z5FlashInferIndicesUpdaterPrefill.update_single_wrapperc                 C  s   t dD ]I}|dkrt|t| j| | }|  }n|}|}|| }|dko0t| jt	}| j
| j|| ||||||| j| | j| ||	||d qd S )NrL   r   )rJ  rA   )r   r[   minimumr   ry   r  r   rL  r>  r   rI  r   r   r   )r4   r   r   r   r   r   r=   r>   r   r   r   rA   rM  rS  rV  rT  rJ  r(   r(   r)   r?    s<   
z5FlashInferIndicesUpdaterPrefill.update_sliding_windowc                 C  sx   t dD ]5}|dkr|}|}|}n|}t|}|  }| j| j|| ||||||| j| | j| ||	|d qd S )NrL   r   )rA   )	r   r[   rR  r  r   rI  r   r   r   )r4   r   r   r   r   r   r=   r>   r   r   r   rA   rM  rS  rT  rV  r(   r(   r)   r@    s0   
z6FlashInferIndicesUpdaterPrefill.update_cross_attentionFwrapper_raggedr   wrapper_pagedr   rS  rV  rT  r   r   rJ  c                 C  s  t |}|d u r_t |t |ksJ tj|dd|	d|d < |	d |d  }	tj|d tj|jd}t|f | j|||	||| jjd  tj|| dd|
d|d < |
d |d  }
d }nt	|t
sfJ ||||| j\}}	}
}|r|j|
|
| j| j| j| jd |r|	d }| j|d | |d |< |d ur| rd }|j}|j}|j}|j}n
|}d }d }d}d }|j|
|	|| jd | | j| j| jd| j| j|d|||||d	 d S )
Nr   r   rM      rU   )r<  r  T)	r<  kv_data_typecustom_maskrX  r   r,   r-   r/   r0   )r   r[   rZ  r   r]   rW   r   r=  r]  rL  r   generate_attn_arg_prefillr  r:  rK   r!  r<  r>  r_  r5   r,   r-   r/   r0   r   r;  )r4   rj  rk  r   rS  rV  r   r   rT  r   r   r>   r   rJ  r   rA   r   r^  rn  rb  use_custom_maskr,   r-   r/   r0   r(   r(   r)   rI  G  s   
	

	


z2FlashInferIndicesUpdaterPrefill.call_begin_forwardre  r   )r   r   r   r   r   r+   r   r.   r   r   r=   r<   r>   r2   r   r+   r   r   r   rC  rf  )r   r   r   r   r   r+   r   r.   r   r   r=   r<   r>   r2   r   r+   r   r   r   rC  rA   r@   rg  )rj  r   rk  r   r   r   rS  r   rV  r.   r   r   r   r   rT  r   r   r   r   r   r>   r2   r   r   rJ  r2   r   rC  rA   r@   rh  r(   r(   r(   r)   r     s"    
*3<8r   c                   @  sL   e Zd ZdZdddZdddZdddZd ddZdddZd!ddZ	dS )"FlashInferMultiStepDraftBackendzo
    Wrap multiple flashinfer attention backends as one for multiple consecutive
    draft decoding steps.
    rC   r   topkr.   speculative_num_stepsc              	   C  s   ddl m} || _|| _|| _|j| _|jj| j }tj| j|d ftj	|j
d| _tj|ftj	|j
d| _g | _t| jd D ]}| jt|d| j| | jd q@| jd j| _|jjjd | _d S )Nr   ) generate_draft_decode_kv_indicesrM   rU   T)rD   rE   rF   )!sglang.srt.speculative.spec_utilsrt  rr  rs  	page_sizer   r   r[   r\   r]   rW   r   r   r   attn_backendsr   r   rB   rw   r=  r]  pool_len)r4   rC   rr  rs  rt  rb   r   r(   r(   r)   rj     s8   	z(FlashInferMultiStepDraftBackend.__init__r   r   kv_indices_bufferr   call_fnr   c           	      C  s  |j }| j| }|j}| j| j|| jf |j|jj|j|| j	|j
| j|jd | j	jd t|t| jt|| j |jd us@J |j sGJ | j	d d d |d f  }t| jd D ]+}| j	|d |d f |j_	|| d || j ||d    |j_|| a||| q]d ad S r	  )
batch_sizerr  r   rt  rs  r   r   r=  r   r   r   rx  r]  r   rv  r   is_draft_inputr  r   r^  rY  )	r4   r   ry  rz  num_seqsr   r   indptr_cpu_wholer   r(   r(   r)   common_template  s>   

z/FlashInferMultiStepDraftBackend.common_templatec                   sD   t j j|j j  j ft jdd} fdd} ||| d S )Nr   rU   c                   s4   |j j |j _|j j |j _ j|  | d S r   )r   r   r   r^  rw  r   r   r   r3   r(   r)   rz    s
   

zFFlashInferMultiStepDraftBackend.init_forward_metadata.<locals>.call_fn)r[   r   rs  r{  rr  rw   r]   r  )r4   r   r^  rz  r(   r3   r)   r     s   		z5FlashInferMultiStepDraftBackend.init_forward_metadatarb   r   c                 C  sT   t j| j|| j ft jdd| _t| jd D ]}| j| j||| j| d qd S )Nr   rU   rM   )r   )	r[   r\   rs  rw   r]   r   r   rw  r   )r4   rb   r   r   r(   r(   r)   r   %  s   
z5FlashInferMultiStepDraftBackend.init_cuda_graph_statec                   s     fdd}  | j| d S )Nc              	     s4    j |  j|j|j j |j|jd tj|jd d S )N)r   r   r   )	rw  r  r{  rr  r   r   r   r   r   r  r3   r(   r)   rz  2  s   


zYFlashInferMultiStepDraftBackend.init_forward_metadata_capture_cuda_graph.<locals>.call_fnr  r   )r4   r   rz  r(   r3   r)   r  1  s   zHFlashInferMultiStepDraftBackend.init_forward_metadata_capture_cuda_graphr   c                   s"    fdd} |j| d S )Nc              
     s.   j |  j |j|jdd tj|j|jd d S )Nr  )r   r   r   r   r   )rw  r  r   r   r   r   r   r   r  r   r4   r(   r)   rz  B  s   

zXFlashInferMultiStepDraftBackend.init_forward_metadata_replay_cuda_graph.<locals>.call_fnr  )r4   r   r   rz  r(   r  r)   r  ?  s   zGFlashInferMultiStepDraftBackend.init_forward_metadata_replay_cuda_graphN)rC   r   rr  r.   rs  r.   )r   r   ry  r   rz  r   r7  )rb   r.   r   r.   )r   r   r   r.   )
r#   r$   r%   r6   rj   r  r   r   r  r  r(   r(   r(   r)   rq    s    

)
-

rq  rI   torch.dtyperJ   r.   rK   r1   r2   c              	   C  s   t jd}|dur| dkS zddlm} |||sW dS W dS  ttfy,   Y nw || }| tj	tj
fv r;dS | tjtjtjfv rI|dkS dS )	a&  
    Determine whether to use tensor cores for attention computation.

    Args:
        kv_cache_dtype: Data type of the KV cache
        num_attention_heads: Number of attention heads
        num_kv_heads: Number of key/value heads

    Returns:
        bool: Whether to use tensor cores
    !SGLANG_FLASHINFER_USE_TENSOR_CORENtruer   ))_grouped_size_compiled_for_decode_kernelsTF   )osenvironr   lowerflashinfer.decoder  ImportErrorAttributeErrorr[   float8_e4m3fnfloat8_e5m2float16halfbfloat16)rI   rJ   rK   env_overrider  gqa_group_sizer(   r(   r)   rr   Q  s(   rr   )rI   r  rJ   r.   rK   r.   r1   r2   )J
__future__r   loggingr  dataclassesr   enumr   r   	functoolsr   typingr   r   r	   r
   r   r[   sglang.srt.dllm.configr   sglang.srt.environr   -sglang.srt.layers.attention.base_attn_backendr   !sglang.srt.layers.attention.utilsr   sglang.srt.layers.dp_attentionr   !sglang.srt.layers.radix_attentionr   $sglang.srt.mem_cache.swa_memory_poolr   ,sglang.srt.model_executor.forward_batch_infor   r    sglang.srt.speculative.spec_infor   sglang.srt.utilsr   r   r   r   r   &sglang.srt.model_executor.model_runnerr   	getLoggerr#   r   SGLANG_ENABLE_TORCH_COMPILEr   _loggingset_logsERROR_dynamoconfigsuppress_errors
flashinferr   r   r   r    flashinfer.cascader!   r"   r*   r8   r;   r   rY  rB   r   r   rq  rr   r(   r(   r(   r)   <module>   sl    


      &    " 