o
    پi                     @  s  d dl mZ d dlmZ d dlmZmZ d dlZd dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ erfd dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z& d dl$m'Z( e&Z%e(Z'd dl)m%Z* d dl)m'Z+ eG dd dZ,	 dPdQddZ-dRd"d#Z.e	j/0 d$d% Z1G d&d' d'eZ2e
j3dSd-d.Z4dTd5d6Z5G d7d8 d8Z6		dUdVdFdGZ7e	j8dHe dIdWdNdOZ9dS )X    )annotations)	dataclass)TYPE_CHECKINGOptionalN)AttentionArch)AttentionBackend)AttentionType)	SWAKVPool)ForwardBatchForwardMode)get_global_server_args)	SpecInput)get_compiler_backend)RadixAttention)ModelRunnermerge_state_v2)flash_attn_varlen_func)flash_attn_with_kvcachec                   @  s   e Zd ZU dZdZded< dZded< dZded	< dZded
< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< eG dd dZdZded< dZded< dS )FlashAttentionMetadatazMetadata to be init once in the model forward pass,
    each layer's forward pass can reuse the metadata.

    For each init metadata function, we will try set up them in below order
    Ntorch.Tensorcache_seqlens_int32   intmax_seq_len_qr   max_seq_len_kcu_seqlens_qcu_seqlens_kr   tuplewindow_size
page_tableswa_page_tableencoder_cu_seqlens_kencoder_max_seq_len_kencoder_lens_int32encoder_page_tablec                   @  sJ   e Zd ZU dZded< dZded< dZded< dZded< dZded	< dS )
z-FlashAttentionMetadata.LocalAttentionMetadataNr   local_query_start_loclocal_seqused_klocal_block_tabler   r   local_max_query_lenlocal_max_seq_len)	__name__
__module____qualname__r(   __annotations__r)   r*   r+   r,    r1   r1   f/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/flashattention_backend.pyLocalAttentionMetadataJ   s   
 r3   z Optional[LocalAttentionMetadata]local_attn_metadata Optional[FlashAttentionMetadata]swa_spec_metadata)r-   r.   r/   __doc__r   r0   r   r   r   r   r!   r"   r#   r$   r%   r&   r'   r   r3   r4   r6   r1   r1   r1   r2   r   '   s$   
 r   attn_chunk_sizer   query_start_loc_np
np.ndarrayseq_lens_npblock_tabler   	page_sizereturn7tuple[np.ndarray, np.ndarray, np.ndarray, torch.Tensor]c                 C  s>  |  }t| |}|| | }||k r|}|} |dd |dd  }|jd }t| || |   |tj}	| ||    }
dt||	 |  }t|}|d }t	|| |}tj
|tjd| }t	||| d }t	||	 |}|	||dk< t|| |d   | |dk ||dk< tt|dtj}tj|d | tjd}|
||d < t	||||  t	|
|  }|| }| | dksJ d|  d| | | }ttj
|tjd||ftj|dd	 }| j|jd d d
}t	tj
|tjd|| }t|}t|}|||f |d}||||fS )a$  
    Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into
    local attention blocks, where each block is passed to the attention kernel
    as an independent local ("virtual") batch item.

    Args:
        attn_chunk_size: Size of local attention chunks
        query_start_loc_np: Cumulative sum of query lengths (numpy array)
        seq_lens_np: Sequence lengths (numpy array)
        block_table: Block table for KV cache
        page_size: Size of each page in the KV cache

    Returns:
        seqlens_q_local: Query sequence lengths for local attention
        cu_seqlens_q_local: Cumulative sum of query sequence lengths for local attention
        seqlens_k_local: Key sequence lengths for local attention
        block_table_local: Block table for local attention
    r   Nr   r   dtyper   r   zattn_chunk_size z is not divisible by page_size )axis)max)rD   minshapenpminimumastypeint32cdivcumsumrepeatarangepadfullbroadcast_toexpand_dimsflattencliptorch
from_numpyview)r8   r9   r;   r<   r=   max_seq_leneffective_chunk_size	q_seqlensactual_batch_sizeq_tokens_in_first_blocktokens_in_last_blocklocal_blockscu_num_blocksvirtual_batchesblock_offsetsrN   rarangeseqlens_q_localcu_seqlens_q_localseqlens_k_localk_seqstarts_absoluteblock_startspages_per_local_batchblock_indicesbatch_indicesbatch_indices_torchblock_indices_torchblock_table_localr1   r1   r2   $make_local_attention_virtual_batches   sr   




	
rn   abc                 C  s   | |   S )zCeiling division.r1   )ro   rp   r1   r1   r2   rK   &  s   rK   c                 C  s   t | |||S Nr   )os_ao_exps_br1   r1   r2   merge_state_v2_wrapper,  s   rv   c                      s   e Zd ZdZ					dEdF fd	d
ZdGddZ				dHdIddZ				dHdJddZdKd#d$ZdLd.d/Z		dMdNd3d4Z
d5d6 ZdOd:d;ZdPd<d=ZdPd>d?Z	dMdQdCdDZ  ZS )RFlashAttentionBackenda8  FlashAttention backend implementation.

    Note about the init:
    - If no spec decoding
        - FlashAttentionBackend will be init once when the server starts.
    - If spec decoding
        - FlashAttentionBackend will be init once for the target worker
        - FlashAttentionMultiStepBackend will be once for the draft worker
            - It will spawn num_steps FlashAttentionBackend for the draft worker

    Note about CUDA Graph:
    - We only support CUDA Graph for Decode (Normal Decode and Draft Decode) and Target Verify.
    - We don't support CUDA Graph for Extend and Draft Extend.
    - When server init, init_cuda_graph_state will be called first and then init_cuda_graph_capture will be called.
    - For each forward batch, init_replay_cuda_graph will be called first and then replay the graph.
    Fr      model_runnerr   skip_prefillboolc                   sZ  t    |jd ur|jjrJ d|jj| _d | _d | _|jj| _|j	| _	i | _
i | _|jj| _|j| _|jj| _|j| _|jjtjk| _|| _t|jtoT|jjdk| _| jr]|j| _|jjpbd| _|| _|jj| _|| _ || _!|jj"| _#| j#r|j$d usJ d|j$| _$|j| _| jd uo| jdk| _%|jj&s| j!dkr|jj'sd| _(d S d| _(d S )Nz=Sliding window and cross attention are not supported togetherr   z4Attention chunk size is required for local attentionr      r   ))super__init__sliding_window_sizemodel_configis_encoder_decoderforward_metadata#forward_metadata_spec_decode_expandcontext_lenmax_context_lendevicedecode_cuda_graph_metadatatarget_verify_metadatareq_to_token_poolreq_to_tokenkv_cache_dtypeserver_argskv_cache_dtype_strr=   attention_archr   MLAuse_mlarz   
isinstancetoken_to_kv_poolr	   swa_layer_numsuse_sliding_window_kv_poolspeculative_eagle_topktopkspeculative_num_stepsspeculative_num_draft_tokensspeculative_step_idfa_impl_veris_local_attention_modelhas_local_attentionattention_chunk_sizehas_swaenable_deterministic_inferencedisable_cuda_graph
num_splits)selfry   rz   r   r   r   r   	__class__r1   r2   r~   C  s`   
	








zFlashAttentionBackend.__init__forward_batchr
   c              	   C  s,  t  }|j}|j}|j}|j rC|jdur| jdkrb|| jd  	t
j|_|j  | jd  |_t
jd|d t
j|d|_t
jjt
j|jdt
jdd|_|jj|jd|jf |_n|	t
j|_| j|_|j  |_t
jd|| j d | jt
j|d|_t
jjt
j|jdt
jdd|_|jj|jd|jf |_t  }| jd }t
j| | j f||t
jd|_d|_t
jd|j d t
j|d|_t
jd|j | d |t
j|d|_|j d	| j!}|ddd|f " 	t
j|_|| _#n9|	t
j|_|j  |_t
jd|d t
j|d|_t
jjt
j|dt
jdd|_|jj|jd|jf |_| $||| n|j% r| jdkr|j| j& 	t
j|_| j&|_|j  | j& |_t
jd|| j& d | j&t
j|d|_t
jjt
j|jdt
jdd|_|jj|jd|jf |_| $||| n|j	t
j|_| j&|_|j  |_t
jd|| j& d | j&t
j|d|_t
jjt
j|jdt
jdd|_|jj|jd|jf |_t  }d|_t
jd|j | j& d t
j|d|_t
j| j&|d
'd}	|	(|j d	|j'd }
t
jjt
j|j| j& )| j&ddddd	 }|
j)| j&dd|dddf   dd	}|jj*|  d	| j&}|	(|j+d | j&}t
,|||| j& }t
j-|dd\}}|jj|jddf .d|
j)| j&dd}|.d||_|j/dd	t
j|_t
jjt
j|jdt
jdd|_|| _#| j0r| 1|| nr|jj2ddr&|	t
j|_|j  |_t
jjt
j|dt
jdd|_|jj|jd|jf |_t3|j4s|jj5ddr|j6}t|j7|_t
jjt
j|dt
jdd|_n|j|_|j|_|jt8j9kr&| $||| |j:durw|j: dks8J d|j:	t
j|_;t
jjt
j|j;dt
jdd|_<|j;  |_=|jj|jd|j=f |_>|jj|j|j=|j=|j f |_| j?r| j@A|j|_B| jCdkrt
jd|jj+d | jC| jd
| _D| j?r|jBdd| jDf | jC |_B|jdd| jDf | jC |_| jdkr|j r|jdur|j| jC }| j|8  _|)| j}| j# j|7  _t
jE|j| j | j!t
j| jd}|j d	| j!}tF| j#j||||| j| jCd || j#_|| _GdS )zNInitialize forward metadata hence all layers in the forward pass can reuse it.Nr   r   rA   r   dimrA   rB   steprA   r   r   rA   r   r   r   T)include_draft_extend_v2
include_v2z(Only encoder size 1 is supported for nowr   r"   last_page_lensdecode_length	cache_locr   r=   )Hr   seq_lens
batch_sizer   forward_modeis_decode_or_idle	spec_infor   r   torU   rJ   r   seq_lens_cpurD   itemr   rN   r   nn
functionalrO   rL   r   r   r   req_pool_indicesr"   r   rP   numelout_cache_locrW   r   
contiguousr   _maybe_init_local_attn_metadatais_target_verifyr   	unsqueezeexpandrepeat_interleavecustom_maskrF   wheresortgathersumr   '_init_sliding_window_attn_spec_metadata"is_extend_or_draft_extend_or_mixedanyextend_prefix_lens_cpuis_draft_extendextend_seq_lensextend_seq_lens_cpur   EXTENDencoder_lensr&   r$   r%   r'   r   r   translate_loc_from_full_to_swar#   r=   strided_indiceszeros draft_decode_set_expand_metadatar   )r   r   metadataseqlens_in_batchr   r   metadata_expandr   r   offsetscolscum_lenmask_extraction_indicesmaskcol_indiceskeys_
sort_ordernon_masked_page_tabler   r   expanded_last_page_lensexpand_page_tabler1   r1   r2   init_forward_metadata  sF  






	






	
z+FlashAttentionBackend.init_forward_metadataTNqr   kvlayerr   q_ropeOptional[torch.Tensor]k_ropesinksc
           4      C  s  |d ur/|d us
J |r/|j s|jn|j}
| js&|j||
|||j|j n	|j||
|| | j	}|j
d uo;|j
dk}|rC|j
dfnd}d\}}| jdkr|jdkr| jdkr|jd uro|j|jf}|j|}|j|}|| j}|d ur|| jnd }|d ur|| jnd }d}|j s|jtjkrd	}| jo| jd uo|jd uot|d
o|j}|j o| jdko| }t}t}| jdkrt n|}| jdkrt!n|}i }|	d ur|	|d< |r|j}|j"}|j#}|j$}|j%}nC|r
|j&d ur
|j&}|j'}|j(}|j)}|j*}|j+}n'|j'}|r%| j,r%|j-d ur|j-}n| j.|j'}|j(}|j)}|j*}|j+}| js-|j/|j0\} }!| 1d| j2|j|j} |!1d| j2|j3|j4}!|j rc|j5}|j6}|j7}d}|d"i d|8 1d|j9|jd| d|!d|d|d|d|s|nd d|d|j:d|rd	n|d|d|j;d|d|d|d| j<|}"|r)|"^}#}$}%|d"i d|8 1d|j9|jd| 1dd|j|jd|!1dd|j3|jd| j=j'd| j=j)d| j=j(d| j=j+d| j=j*d|j:dd	d|d|j;d|d|ddd| j<|^}&}'}(t>|#|$j?8 |&|'j?8 \}#})n|"}#n|j@d ur|j s|jjAdds|j@rtB jCrLJ |jDd usTJ |jEd us\J |jFd usdJ |jD}*|*dksnJ |jGstJ |d"|1d|j9|j|1d|j|j|jH|1d|j|j4|jH|j(|jE|* |j*|jF|* |j:d	dd
|}+nG|jIs|j(n|j+}|jIs|j*n|jJ},|d"|1d|j9|j|1d|j|j|jH|1d|j|j4|jH|j(||j*|,|j:d|jGd
|}+|jGr
|+^}+}-}%tKL|-dd8 }-|+|-fS |+S | jdv sJ d |jM|j0|jH}.|.d d d d |j4d f }|.d d d d d |j4f }/|1d| j2|j|j|j4 }0|/1d| j2|j3|j4}1|d urq|1d|j9|j4}2|1d|j9|j|j4 }n'|8 1d|j9|j}3|3d d d d d |j4f }2|3d d d d |j4d f }|d"i d|d|0d|1d!|2d|d|d|d|s|nd d|d|j:d|rd	n|d|j;d|d|d|d| j<}"|r:|"^}#}$}%|d"i d|d|0d|1d!|2d| j=j'd| j=j)d| j=j(d| j=j+d| j=j*d|j:dd	d|d|j;d|d|ddd| j<^}&}'}(t>|#|$j?8 |&|'j?8 \}#})n|"}#|#1d|j9|j4 S )#Nr   r   r   NNauto   r|   TF	use_iroper   r   r   k_cachev_cacher"   cache_seqlensr   cu_seqlens_k_newmax_seqlen_qsoftmax_scalecausalr!   softcap	k_descale	v_descalereturn_softmax_lser   r   )
r   r   r   r   r   r   max_seqlen_kr   r   r   )rx   zOnly FA3 support hereqvr1   )Nis_cross_attentionr   encoder_out_cache_locr   r   set_kv_bufferk_scalev_scaleset_mla_kv_bufferr   r   r   head_dimr   r   tp_k_head_numr   r   r   	attn_typer   ENCODER_ONLYr   r   r4   hasattrr   r   r   r   flash_attn_varlen_func_fa3flash_attn_with_kvcache_fa3flash_attn_varlen_func_fa4flash_attn_with_kvcache_fa4r*   r(   r)   r+   r6   r"   r   r   r   r   r   r#   r   get_kv_bufferlayer_idrW   r=   tp_v_head_num
v_head_dimr'   r&   r$   r   tp_q_head_numscaling	logit_capr   r   rv   Tattn_attend_prefix_cacher   r   disable_chunked_prefix_cacheprefix_chunk_idxprefix_chunk_cu_seq_lensprefix_chunk_max_seq_lensmha_return_lserA   mha_one_shotr   rU   	transposeget_key_buffer)4r   r   r   r   r   r   save_kv_cacher   r   r   r   r   is_swa_layerr!   r   r   descale_shaper   use_local_attnuse_cascade_attnflash_attn_varlen_func_baseflash_attn_with_kvcache_baser   r   kwargslocal_metadatar"   r   r   r   r6   r   	key_cachevalue_cacheresultrr   softmax_lseresto_expandsoftmax_lse_expandrest_expandr   	chunk_idxoutputr  lsekv_cachec_kvk_rope_cache
c_kv_cacheq_nopeq_allr1   r1   r2   forward_extend  s  






	

	





	

	

z$FlashAttentionBackend.forward_extendr>   c
           +      C  sH  |d ur/|d us
J |r/|j s|jn|j}
| js&|j||
|||j|j n	|j||
|| | j	}t
|dd }| joK| jd uoK|d uoKt|doK|j}|jd uoU| jdk}|jd uo_|jdk}|rg|jdfnd}d}|j st|jtjkrvd}i }|	d ur|	|d	< d
\}}| jdkr|jdkr|jd ur|j|jf}|j|}|j|}|| j}|d ur|| jnd }|d ur|| jnd }| js|j|j\}}|d| j|j|j}|d| j|j |j!}|j rt"d |# d|j$|j|||j%|j&|j'|j(d|j)dd|j*||| j+d|}n|r?t"d |# d|j$|j|||j,|j-|j.d |j/|j)dd|j*||| j+d|}n|j0}|rZ| j1rZ|j2d urS|j2}n| j3|j0}|j4}|j5}|j6}|# d|j$|j}t"d ||||||j'||j)|rdn|||j*|||| j+d|}|r|^}}} t"d i d|d|d|d| j7j0d| j7j4d| j7j'd| j7j5d| j7j6d|j)ddd|d|j*d|d|ddd| j+|^}!}"}#t8||j9# |!|"j9# \}}$n,|}n(|j:|j|j;}%|%d d d d |j!d f }|%d d d d d |j!f }&|d| j|j|j|j! }'|&d| j|j |j!}(|d urM|d|j$|j!})|d|j$|j|j! }n'|# d|j$|j}*|*d d d d d |j!f })|*d d d d |j!d f }|j6}t"d i d|d|'d|(d|)d|j0d|j4d|j'd|j5d|d|j)d|rdn|d|j*d|d|d|d| j+}|r|^}}} t"d i d|d|'d|(d|)d| j7j0d| j7j4d| j7j'd| j7j5d| j7j6d|j)ddd|d|j*d|d|ddd| j+^}!}"}#t8||j9# |!|"j9# \}}$n|}|d|j$|j! S )!Nr4   r   r   r   r   r   TFr   r   r   r   )r   r   r   r"   r   r   r   r   r   r   r!   r   r   r   r   )r   r   r   r"   r   r   r   r   r   r!   r   r   r   r   r   r   r   r   r"   r   r   r   r   r   r   r!   r   r   r   r   r   r  r1   )<r  r   r  r   r   r  r  r  r  r   getattrr   r   r  r   r   r   r   r  r   r  r   r	  r   r
  r   r   r   r  r  rW   r=   r  r  r   r   r  r'   r&   r   r$   r  r  r   r*   r)   r(   r+   r"   r   r#   r   r   r   r   r   r   r  r"  rA   )+r   r   r   r   r   r   r#  r   r   r   r   r   r4   r&  r'  r$  r!   r   r*  r   r   r%  r,  r-  rr   r"   r   r   r   
q_reshapedr.  r/  r0  r1  r2  r3  r   r7  r8  r9  r:  r;  r<  r1   r1   r2   forward_decode3  s$  





	


	

	


z$FlashAttentionBackend.forward_decodemax_bsr   max_num_tokensc           
   
   C  s  | j | j d | j }tj|tj| jdtjd|d tj| jdtj|d tj| jdtj||tj| jdtjd| j | j| jdd| _| jr| j }| jpNd}| j	}||| d |  }|| d | }tj|d tj| jdtj|tj| jdtj||tj| jdd| _
| jrtj||tj| jd| jd< | jdkrtj|tj| jdtjd|| j d | jtj| jdtj|d tj| jdtj|| j tj| jdd	| _| jd }	tj|| j f|	| jtjd
tjd|| j d tj| jdtjd|| j |	 d |	tj| jdtj|| j |	d tj| jdd	| _| jdur| jdkrtj||tj| jd| jd< tj|tj| jdtjd|| j d | jtj| jdtj|d tj| jdtj||tj| jdtjd| j | j| jdd| _tj|tj| jdtj|d tj| jdtj|d tj| jdtj||tj| jdtjd| j | j| jdd| _| jdkr\tj|tj| jdtjd|| j d | jtj| jdtj|d tj| jdtj|| j tj| jdd	| _tj|| j tj| jdtj|| j d tj| jdtjd|| j d tj| jdtj|| j | jtj| jdd| _| jr\tj|| j tj| jdtj|| j d tj| jdtjd|| j d tj| jdtj|| j | j tj| jdd| _| jrtj|| j tj| jdtj|tj| jdtj|d tj| jdd| _dS i | _dS )a  Initialize CUDA graph state for the attention backend.

        Args:
            max_bs (int): Maximum batch size to support in CUDA graphs

        This creates fixed-size tensors that will be reused during CUDA graph replay
        to avoid memory allocations.
        r   r   r   r   )r   r   r   r"   r   )r(   r)   r*   r#   r   )r   r   r   r"   r   Npage_table_draft_decode)r   r   r   r"   )r'   r&   r$   )r   r=   rU   r   rJ   r   rN   r   r   r   %decode_cuda_graph_local_attn_metadatar   r   !draft_decode_metadata_topk_normalr   rP   !draft_decode_metadata_topk_expandr   r   draft_extend_metadata"target_verify_metadata_topk_normal"target_verify_metadata_topk_expandr   target_verify_metadata_topk_swar   encoder_metadata)
r   rA  rB  max_num_pagesrX   r=   r8   max_virtual_batchesmax_pages_per_blockr   r1   r1   r2   init_cuda_graph_stateF  s  	









z+FlashAttentionBackend.init_cuda_graph_statebs
num_tokensr   r   r   r   r   r   Optional[SpecInput]c                 C  s  t  }t  }	|j}
| r@|dur| jdkr_| jd d| |_|  | jd  |_	| jd d|d  |_
tjjtj|jdtjdd|_| jd d|ddf |_|| j|< nX| jd d| |_| j|_|  |_	| jd d|d  |_
| jd	 d|d  |_| jd
 d|ddf |_| jd d|| j  |	_d|	_| jd d|| j d  |	_
| jd	 d|| j d  |	_| jd
 d|| j  |	_|| j|< |	| j|< n|tj|_t|}|j}
tjjtj|dtjdd|_|  |_	| jd
 d|ddf |_| jr&| jd d|ddf |_tjd|d tj|
d|_
|| j|< | || nw| rc| jdkr| jd d| |_|j|| j  | j|_|  | j |_	tjd|| j d | jtj|
d|_
| jd	 d|d  |_| jd
 d|ddf |_|| j|< n| j d d| |_| j|_| j d d|d  |_
| j d	 d|d  |_| j d
 d|ddf |_| j!d d|| j  |	_d|	_| j!d d|| j d  |	_
| j!d	 d|| j d  |	_| j!d
 d|| j  |	_|| j |< |	| j!|< | j"rbt  }| j#d d|| j  |_d|_| j#d d|| j d  |_
| j#d	 d|| j d  |_| j#d
 d|| j  |_|| j#|< ||_$nT|j%ddr| j&d d| |_|j| || }||_|  |_	tjd|| d |tj|
d|_
| j&d	 d|d  |_| j&d
 d|ddf |_|| j&|< |dur|' }| j(d d| |_)| j(d d|d  |_*| j(d d|ddf |_+|| _,|	| _-dS )z5Initialize forward metadata for capturing CUDA graph.Nr   r   r   r   r   rB   rC  r   r"   r#   r   Tr   r&   r$   r'   ).r   r   r   r   r   r   rD   r   r   r   r   rU   r   r   rO   rL   rJ   r   r"   rE  r   rF  r   lenr   r#   rN   -_maybe_update_local_attn_metadata_for_capturer   r   copy_r   rH  rI  r   rJ  r6   r   rG  r   rK  r&   r$   r'   r   r   )r   rP  rQ  r   r   r   r   r   r   r   r   r   metadata_swanum_tokens_per_bs
encoder_bsr1   r1   r2   (init_forward_metadata_capture_cuda_graphW  s  





















z>FlashAttentionBackend.init_forward_metadata_capture_cuda_graphseq_lens_sumr   r   c
           &      C  s  |d| }|d| }|d| }|j }
d}d}| r<|dur| jdkr_| j| }|  }|| j d |_|j| j d | j }t	|j
|j|j| j|| jd ||| jd | j
 n| j| }| jdkrr|| j }|| }|j
| |  |_|j| j d | j }| jd }|d| }| j|dddf |dddf f | j }|jddd|f | | j| }| jd }|	d| j}| jdkrt|j
|j|||| j| jd n*|jd }|jd|d|f |ddd|f  n| j| }|  }|| j d | j }||_t	|j
|j|j| j|| jd ||d| j|j| jr1| jnd | || n| r| jdkr| j| }|j
|| j  |  | j |_|jdd tj|j
dtjd |j| j d | j }| j|dddf | jd d| f }|| j }|jddd|f | ne| j| }|j
| |  |_|jdd tj|j
dtjd |j| j d | j }| j|dddf | jd d| f }|| j }|jddd|f | | j | }tj!| j|
d"d}|#|$ d|"d }tj%j&'tj|| j (| jdd	d
dd }|j(| jdd	|dddf  dd}|dd|j)$ | j df *d |j+| d| j}|#|jd | j}t,|||| j }tj-|dd	\}}| j|ddf .d|j(| jdd	}|j|.d| |j
|j/dd	 |jdd tj|j
dtjd | j0r| j1| } | 2|||  nA|3 rK| j4| }|j
| |  |_|jdd tj|j
dtjd |j5d| }!|j6rt|j6d |_7nd|_7|j8dd tj|!dtjd |j| j d | j }| j|dddf | j4d d| f }|jddd|f || j  n|9 r
| j4| }|j
| |  |_|jdd tj|j
dtjd t:|dd}"t:|dd}#|"dur|";tj}$n(|#durtj<|#tj|
d}$nt:|d| jd }%tj=|f|%tj|
d}$|%g| }#|#rt>t|#|_7n
t:|d| jd |_7|j8dd tj|$dtjd |j| j d | j }| j|dddf | j4d d| f }|jddd|f || j  |durd|d |_?|j@|dd  |jAdd tj|j@dtjd |jBddd|j?f | j|d|j?f  | j||j?|j?|j f }|jddd|jf | || _C|| _DdS )z5Initialize forward metadata for replaying CUDA graph.Nr   r   r   r   r   r   r   r   rB   extend_seq_lens_tensorr   r   num_tokens_per_req)Er   r   r   r   rD   r   r   r   r=   normal_decode_set_metadatar   r   r"   r   rE  rU  rF  rW   r   r   rF   r#   r   r   ,_maybe_update_local_attn_metadata_for_replayr   r   r   rU   rL   rJ   rH  rI  rN   r   r   r   r   r   rO   r   	positionsfill_r   r   r   r   r   r   rJ  r   r   rG  accept_lengthaccept_length_cpur   r   is_draft_extend_v2r>  r   	as_tensorrP   r   r%   r&   r$   r'   r   r   )&r   rP  r   r   rZ  r   r   r   r   r   r   r   r   max_lenmax_seq_pagesr   r   r"   r   r   num_seqspage_indicesr   r   r   r   r   r   r   r   r   r   rV  ra  r[  r   r   default_extendr1   r1   r2   'init_forward_metadata_replay_cuda_graphE  s  
















 








$




"



z=FlashAttentionBackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS )z5Get the fill value for sequence length in CUDA graph.r   r1   )r   r1   r1   r2   !get_cuda_graph_seq_len_fill_value  s   z7FlashAttentionBackend.get_cuda_graph_seq_len_fill_valueforwardbatchr   r   c                 C  s   | j sd|_dS |j}|j}| jr| j|j}n|j}|du s(|du s(|du r-d|_dS | 	 }| 	 }t
| j|||| j\}	}
}}tjt|
|t||||t|	 t| d}||_dS )zVCentralized utility to initialize local_attn_metadata if chunked attention is enabled.Nr(   r)   r*   r+   r,   )r   r4   r   r   r   r   r   r"   cpunumpyrn   r   r=   r   r3   rU   rV   r   r   rD   )r   rl  r   r   r   r   r"   cu_seqlens_q_npr;   seqlens_q_local_npcu_seqlens_q_local_npseqlens_k_local_nprm   r+  r1   r1   r2   r     sF   	


z5FlashAttentionBackend._maybe_init_local_attn_metadatac                 C  s   | j sdS |j}t|  }|j}|j  }|  }t	| j
|||| j\}}	}
}t|	}t|
}|jd dkrB|jd n|}|jd dkrP|jd nd}| jd d| }| jd d| }| jd d|d|f }tj|||d|d|_dS )a;  Update local attention metadata during CUDA graph capture phase.

        This method calculates the exact buffer sizes needed for local attention metadata
        during the CUDA graph capture phase, optimizing memory usage by creating views of
        pre-allocated buffers with exactly the sizes needed.
        Nr   r   r(   r)   r*   rm  )r   r   r   rD   r   r"   r   rn  ro  rn   r   r=   rS  rF   rD  r   r3   r4   )r   r   rP  seq_lens_capturerX   page_table_capturerp  
seqlens_nprq  rr  rs  block_table_local_npq_lenk_lenb0b1r(   r)   r*   r1   r1   r2   rT    sV   	
zCFlashAttentionBackend._maybe_update_local_attn_metadata_for_capturec                 C  s  | j sdS | jd }| jd }| jd }| jd }tj|d |j|jd}|jd| }t|	 
 }| jrG| j|jd|d|f }	n|jd|d|f }	|  }
|  }t| j|
||	| j\}}}}|j}t||}t||}||}|jd }|jd }|j\}}|d| | ||d d |d| | ||d d |d|d|f | ||dddf d |d||df d |jdur|j}t|	 |_t|	 |_dS dS )	zOUpdate preallocated local attention metadata in-place before CUDA graph replay.Nr(   r)   r*   r   r   r   r   )r   rD  r   rU   rN   r   rA   r   r   rD   r   r   r   r   r"   rn  ro  rn   r   r=   rV   r   rF   rU  r`  r4   r+   r,   )r   r   rP  local_q_buflocal_k_buflocal_block_bufr   seqlensrX   sliced_page_tablerp  rv  rq  rr  rs  rm   r   rd   re   rx  ry  rz  r{  lamr1   r1   r2   r^    sj   

	





zBFlashAttentionBackend._maybe_update_local_attn_metadata_for_replayr   rV  r5   c                 C  s   | j dks	J d|j| j|j }tjjtj|dtj	dd}|j
d }|d u r:|j||j|jj
d  fn|j}t||j|j|j|j| j |d u rct }d|_|j|_||_||_||_n|j| |j| ||_d S )Nr   zpFlashAttention backend doesn't support topk > 1 speculative decoding with page size > 1 sliding window attentionr   r   rB   )r=   r   r   r   rU   r   r   rO   rL   rJ   rF   r"   	new_zerosr   "prepare_swa_spec_page_table_tritonr   r   r   r   rU  r6   )r   r   r   rV  r   r   rP  r"   r1   r1   r2   r   I	  sJ   
	
z=FlashAttentionBackend._init_sliding_window_attn_spec_metadata)Fr   r   r   rx   )ry   r   rz   r{   r   r
   )TNNN)r   r   r   r   r   r   r   r   r   r
   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r
   r   r   r   r   r   r   r>   r   rA  r   rB  r   )rP  r   rQ  r   r   r   r   r   r   r   r   r   r   rR  rq   )rP  r   r   r   r   r   rZ  r   r   r   r   r   r   rR  r   r   r   r   )rl  r
   r   r   )r   r   rP  r   )r   r   r   r   rV  r5   )r-   r.   r/   r7   r~   r   r=  r@  rO  rY  rj  rk  r   rT  r^  r   __classcell__r1   r1   r   r2   rw   1  sP    
M  X  ]  
  
 y  K

,
;Trw   LEN_Atl.constexprLEN_BREPEAT_STEPBLOCK_Nc           $      C  s  t d}t d}|| }|}t || }t || }|| t d| }|| }|| |kr4d S ||k }| ||  ||  }|d | |k rl|||  ||  }|||k @ }t j||dd}t j|||d d S || |kr|| }|||	  ||
  }|||k @ }t j||dd}t j|||d d S |}||k ||k @ }|||  ||  }t j||dd} || }!|!dk|!|k @ |!|k @ }|||	  |!|
  }t j||dd}"t ||k | |"}#t j||#|d d S )Nr   r   )r   other)r   )tl
program_idloadrN   storer   )$dst_ptr	src_a_ptr	src_b_ptrseq_len_a_ptrseq_len_b_ptrdst_stride_mdst_stride_n
a_stride_m
a_stride_n
b_stride_m
b_stride_nr  r  r  r  pid_mpid_nidx_aidx_b	seq_len_a	seq_len_boffs_n	total_lenr   dsta_ptra_maskvaloffs_bb_ptrb_maska_offsa_valb_offsb_valr.  r1   r1   r2   #_prepare_swa_spec_page_table_kernel}	  sB   

r  page_table_dstpage_table_apage_table_br  r  r   c                 C  s   |  }|  }||| ksJ |jd }|jd }	||	 }
|}d}|t|
|f}t| | ||||| d| d|d|d|d|d||	||dd d S )Nr   r   r   r|   )r  r  r  r  	num_warps)r   rF   tritonrK   r  stride)r  r  r  r  r  r   rP  	bs_expandr  r  LEN_OUTr  r  gridr1   r1   r2   r  	  s6   	


r  c                   @  s>   e Zd ZdddZdd
dZdddZdddZdddZdS )FlashAttentionMultiStepBackendry   r   r   r   r   c              	   C  sL   || _ || _|| _g | _t| jd D ]}| jt||| j| jd qd S )Nr   )r   r   r   )ry   r   r   attn_backendsrangeappendrw   )r   ry   r   r   ir1   r1   r2   r~   	  s   z'FlashAttentionMultiStepBackend.__init__r   r
   c                 C  s(   t | jd D ]
}| j| | qd S Nr   )r  r   r  r   r   r   r  r1   r1   r2   r   	  s   z4FlashAttentionMultiStepBackend.init_forward_metadatarA  rB  c                 C  s*   t | jd D ]}| j| || qd S r  )r  r   r  rO  )r   rA  rB  r  r1   r1   r2   rO  	  s   z4FlashAttentionMultiStepBackend.init_cuda_graph_statec              
   C  sf   |j d usJ |j  sJ t| jd D ]}| j| j|j|j| j |j|j	|j
tj|j d qd S )Nr   )r   r   r   )r   is_draft_inputr  r   r  rY  r   r   r   r   r   r   DECODEr  r1   r1   r2   rY  	  s   

zGFlashAttentionMultiStepBackend.init_forward_metadata_capture_cuda_graphrP  c                 C  sf   |j d usJ |j  sJ t| jd D ]}| j| j||j|j|j|j	t
j|j |j|jd	 qd S )Nr   )r   r   r   r   r   )r   r  r  r   r  rj  r   r   rZ  r   r   r  r   r   )r   r   rP  r  r1   r1   r2   rj  
  s   
zFFlashAttentionMultiStepBackend.init_forward_metadata_replay_cuda_graphN)ry   r   r   r   r   r   r  r  )r   r
   rP  r   )r-   r.   r/   r~   r   rO  rY  rj  r1   r1   r1   r2   r  	  s    



r  r   r   r"   r   r   r   rf  r   seq_len_deltar#   r   r   Optional[SWAKVPool]c                 C  s   |  ||  |dd   tj| dtjd ||d d d f |d | d d d f f }|d d d |f  ||	  |
d ura|d urct|tsKJ ||}|
d d d |f  ||	  d S d S d S )Nr   r   r   )rU  rU   rL   rJ   r   r	   r   )r   r   r"   r   r   r   rf  r   r  r=   r#   r   rh  swa_page_indicesr1   r1   r2   r]  )
  s    
"r]  T)dynamicbackendr   r   r   r   c                 C  s   | |}| ||  || tj}| dkr|d}tj|tjd}|d d dd f |d d d df k|d d dd f< |j	ddd }	|j
d }
|d |
d d f d|	| d S )Nr   r   r@   r   r   )r   rU  r   rU   rJ   r   r   	ones_liker{   rL   rF   scatter_)r   r"   r   r   r   r   r=   r   r   r_  rg  r1   r1   r2   r   E
  s   


8
"r   )r   )r8   r   r9   r:   r;   r:   r<   r   r=   r   r>   r?   )ro   r   rp   r   r>   r   )r  r  r  r  r  r  r  r  )r  r   r  r   r  r   r  r   r  r   r   r   r   )r   r   r   r   r"   r   r   r   r   r   r   r   rf  r   r   r   r  r   r=   r   r#   r   r   r  )r   r   r"   r   r   r   r   r   r   r   r   r   r=   r   ):
__future__r   dataclassesr   typingr   r   ro  rG   rU   r  triton.languagelanguager  sglang.srt.configs.model_configr   -sglang.srt.layers.attention.base_attn_backendr   !sglang.srt.layers.radix_attentionr   $sglang.srt.mem_cache.swa_memory_poolr	   ,sglang.srt.model_executor.forward_batch_infor
   r   sglang.srt.server_argsr    sglang.srt.speculative.spec_infor   sglang.srt.utilsr   r   &sglang.srt.model_executor.model_runnerr   
sgl_kernelr   sgl_kernel.flash_attnr   r  r   r  $sglang.jit_kernel.flash_attention_v4r  r  r   rn   rK   _dynamodisablerv   rw   jitr  r  r  r]  compiler   r1   r1   r1   r2   <module>   sv    k 

                \
=(Q