o
    پi                     @  s  d dl mZ 	 d dlZd dlmZ d dlmZmZ d dlZd dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ eeZe rYd dlZermd dlmZ d dl m!Z! d dl"m#Z# dZ$da%eG dd dZ&G dd deZ'G dd deZ(dS )    )annotationsN)	dataclass)TYPE_CHECKINGOptional)envs)FlashInferAttnBackendFlashInferMultiStepDraftBackend)fused_fp8_set_kv_buffer)canonicalize_stride)	SWAKVPoolSWATokenToKVPoolAllocator)ForwardBatchForwardMode)is_flashinfer_available)RadixAttention)ModelRunner)	SpecInputi   c                   @  sb   e Zd ZU dZded< dZded< dZded< dZded	< dZded
< dZ	ded< dZ
ded< dS )TRTLLMMHAMetadataNtorch.Tensorcache_seqlens_int32   intmax_seq_len_qr   max_seq_len_kcu_seqlens_qcu_seqlens_k
page_tableswa_page_table)__name__
__module____qualname__r   __annotations__r   r   r   r   r   r    r"   r"   b/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/trtllm_mha_backend.pyr   .   s   
 r   c                      s   e Zd ZdZ				dOdP fddZdQddZdRddZdSddZdTd$d%ZdUd*d+Z		dVdWd.d/Z
dXd8d9ZdYd<d=ZdZd>d?Zd[dBdCZd\dFdGZd]dHdIZ	Jd^d_dKdLZ	Jd^d\dMdNZ  ZS )`TRTLLMHAAttnBackendz,TRTLLM MHA attention kernel from flashinfer.FNr   model_runnerr   skip_prefillboolkv_indptr_bufOptional[torch.Tensor]kv_last_page_len_bufspeculative_step_idr   c           
        s   t j}| r| ntd d }t |||| |j}|jj| _	|j
| _
|j| _|j| _|j| _|jj| _|j| _|| _td u rMtj| jtj|jdat| _i | _|jjpXd| _|| _i | _|jj| _|j }	t!|	t"| _#| j#ru|	$ nd | _%d | _&d S )Ni   dtypedevicer   )'r    SGLANG_FLASHINFER_WORKSPACE_SIZEis_setgetDEFAULT_WORKSPACE_SIZE_MBsuper__init__model_configcontext_lenmax_context_lenhidden_sizekv_cache_dtype	data_typer-   q_data_type	page_sizereq_to_token_poolreq_to_tokenr.   workspace_size!global_zero_init_workspace_buffertorchzerosuint8workspace_bufferdecode_cuda_graph_metadataserver_argsspeculative_eagle_topktopkr+   target_verify_metadataspeculative_num_draft_tokenstoken_to_kv_pool_allocator
isinstancer   use_sliding_window_kv_poolget_kvcache_swa_kv_poolforward_metadata)
selfr%   r&   r(   r*   r+   env_varworkspace_size_bytesconfig	allocator	__class__r"   r#   r4   C   sJ   





zTRTLLMHAAttnBackend.__init__token_indicesr   returnc                 C  s(   | j sdS |j}| j|d|S )zFTranslate full-pool token indices to SWA-pool indices, or return None.N)rM   shaperO   translate_loc_from_full_to_swareshape)rQ   rX   r[   r"   r"   r#   _maybe_translate_swa   s   z(TRTLLMHAAttnBackend._maybe_translate_swamax_bsmax_num_pagesc                 C  s    | j sdS tj||tj| jdS )zDAllocate a SWA page_table buffer, or return None for non-SWA models.Nr,   )rM   rA   rB   int32r.   )rQ   r_   r`   r"   r"   r#   _alloc_swa_page_table   s   z)TRTLLMHAAttnBackend._alloc_swa_page_tablemetadatar   page_indices	num_pagesc                 C  s>   |j du rdS | |}|j ddd|f || j  dS )zETranslate and copy SWA page indices into metadata. No-op for non-SWA.N)r   r^   copy_r<   )rQ   rc   rd   re   swa_indicesr"   r"   r#   _copy_swa_page_table   s   

&z(TRTLLMHAAttnBackend._copy_swa_page_tablesourcedictkeystrbsc                 C  s0   | |}|dur|d|ddf |_dS dS )zEBind a pre-allocated SWA page_table slice to metadata for CUDA graph.N)r1   r   )rQ   rc   ri   rk   rm   bufr"   r"   r#   _bind_swa_page_table   s   
z(TRTLLMHAAttnBackend._bind_swa_page_tablelayerr   forward_batchr   c                 C  s2   | j j}|dur| jj|j \}}|r|S | j jS )z@Return the correct page_table for the given layer (SWA or full).N)rP   r   rO   layers_mappinglayer_idr   )rQ   rp   rq   swa_pt_is_swar"   r"   r#   _get_layer_page_table   s   z)TRTLLMHAAttnBackend._get_layer_page_tablemax_num_tokenskv_indices_bufc                 C  s  | j | j d | j }tj|tj| jdtj||tj| jd| ||tjd| j | j| jdd| _| j	dur| j	dkrtjd|d tj| jd| jd< tj|d tj| jd| jd< tj||tj| jd| jd	< | ||| jd
< tj|tj| jdtjd|| j	 d | j	tj| jdtj|d tj| jdtj||tj| jd| ||tjd| j | j| jdd| _
tj|tj| jdtj|d tj| jdtj|d tj| jdtj||tj| jd| ||tjd| j | j| jdd| _dS dS dS )z+Initialize CUDA graph state for TRTLLM MHA.r   r,   r   r.   )cache_seqlensr   r   strided_indicesNr   r   page_table_draft_decodeswa_page_table_draft_decode)stepr-   r.   )r{   r   r   r   r   r|   )r7   r<   rA   rB   ra   r.   rb   arangerE   rJ   rI   draft_extend_metadata)rQ   r_   rx   ry   r`   r"   r"   r#   init_cuda_graph_state   s   







z)TRTLLMHAAttnBackend.init_cuda_graph_state
num_tokensreq_pool_indicesseq_lensencoder_lensforward_moder   	spec_infoOptional[SpecInput]c                 C  s  t  }|j}	| r|dur_| jd d| |_|  | jd  |_| jd d|d  |_	t
jjt
j|jdt
jdd|_| jd d|ddf |_| || jd	| || j|< n|d| t
j|_t|}
t
jjt
j|dt
jdd|_|  |_t
jd|
d t
j|	d
|_	| jd d|ddf |_| || jd| || j|< n| r| jd d| |_|j|| j  t
jd|| j d | jt
j|	d
|_	| jd d|d  |_| j|_|  | j |_| jd d|ddf |_| || jd| || j|< n_| ro| jd d| |_|j| || }t
jd|| d |t
j|	d
|_	| jd d|d  |_|| }||_|  |_| jd d|ddf |_| || jd| || j|< || _dS )z+Initialize metadata for CUDA graph capture.Nr{   r   r   r   dimr-   r   r   r}   r~   r,   r   r   r   )r   r.   is_decode_or_idlerE   r   maxitemr+   r   r   rA   nn
functionalpadcumsumra   r   r   ro   tolenr   is_target_verifyrI   rf   rJ   r   is_draft_extendr   rP   )rQ   rm   r   r   r   r   r   r   rc   r.   
batch_sizenum_tokens_per_bsr"   r"   r#   (init_forward_metadata_capture_cuda_graph  s   









z<TRTLLMHAAttnBackend.init_forward_metadata_capture_cuda_graphseq_lens_sumseq_lens_cpuc	                 C  sZ  |d| }|d| }|d| }d}	|  r|durF| j| }	|  }
|
| j d |	_|	j| j d | j }|	j|| j d  n| j| }	|  }
|
| j d | j }|
|	_|	j| |	j	dd t
j|	jdt
jd | j|dddf | jd d| dddf f }|	jddd|f || j  | |	|| n| r| j| }	|	j|| j  |  | j |	_|  }
|	j	dd t
j|	jdt
jd |	j| j d | j }| j|dddf | jd d| f }|	jddd|f || j  | |	|| | j|	_n| r| j| }	|	j| |  |	_|  }
|	j	dd t
j|	jdt
jd |jd| }|jr]t|jd |	_nd|	_|	jdd t
j|dt
jd |	j| j d | j }| j|dddf | jd d| f }|	jddd|f || j  | |	|| |	| _dS )z"Replay CUDA graph with new inputs.Nr   r   r   r|   )r   rE   r   r   r+   r   r<   r   rf   r   rA   r   ra   r>   r   rh   r   rI   rJ   r   r   r   accept_lengthaccept_length_cpur   rP   )rQ   rm   r   r   r   r   r   r   r   rc   max_lenmax_seq_pagesrd   r   r"   r"   r#   'init_forward_metadata_replay_cuda_graph  s   


"

"


"
z;TRTLLMHAAttnBackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS )z6Get the fill value for sequence lengths in CUDA graph.r   r"   )rQ   r"   r"   r#   !get_cuda_graph_seq_len_fill_value  s   z5TRTLLMHAAttnBackend.get_cuda_graph_seq_len_fill_valuesave_kv_cachekc                 C  s   |o|duo| j tjkS )z9Check if we should use the fused FP8 KV cache write path.N)r:   rA   float8_e4m3fn)rQ   r   r   r"   r"   r#   _should_use_fused_fp8_path  s   z.TRTLLMHAAttnBackend._should_use_fused_fp8_pathqvc           
   
   K  s:   |j }|j|j\}}	t||||	||j|j| jd dS )z*Fused FP8 quantization and KV cache write.)r   r   k_cachev_cache	cache_lock_scalev_scaler<   N)out_cache_loctoken_to_kv_poolget_kv_bufferrs   r	   r   r   r<   )
rQ   r   r   r   rp   rq   kwargsr   r   r   r"   r"   r#   _fused_fp8_set_kv_buffer  s   

z,TRTLLMHAAttnBackend._fused_fp8_set_kv_bufferc                 C  s,  t  }|j}|j}|j}|j r|jdur[|| jd  t	j
|_|j  | jd  |_t	jd|d t	j
|d|_t	jjt	j|jdt	j
dd|_|jj|jd|jf |_n|t	j
|_|j  |_t	jd|d t	j
|d|_t	jjt	j|dt	j
dd|_|jj|jd|jf |_n|j r|j| j t	j
|_| j|_|j  | j |_t	jd|| j d | jt	j
|d|_t	jjt	j|jdt	j
dd|_|jj|jd|jf |_nm|t	j
|_|j  |_t	jjt	j|dt	j
dd|_|jj|jd|jf |_t|js|jj ddrJ|j!}t|j"}t#|t	j$r4t%| nt%||_t	jjt	j|dt	j
dd|_n|j|_|j|_| &|j|_'| j(dkrt	jd|jj)d | j(| jd	| _*|jdd| j*f | j( |_|j'dur|j'dd| j*f | j( |_'|| _+dS )
z+Initialize the metadata for a forward pass.Nr   r   r,   r   r   T)
include_v2rz   ),r   r   r   r.   r   r   r   r+   r   rA   ra   r   r   r   r   r   r   r   r   r   r   r   r   r=   r>   r   r   r   rJ   r   anyextend_prefix_lens_cpur   extend_seq_lensextend_seq_lens_cpurL   Tensorr   r^   r   r<   r[   r|   rP   )rQ   rq   rc   seqlens_in_batchr   r.   r   max_qr"   r"   r#   init_forward_metadata/  s   





 
z)TRTLLMHAAttnBackend.init_forward_metadataTc                 K  s  |j }| ||}	|	r| j|||||d d}d}n|r-|dur-|j|||||j|j | jtj	kr9|
tj	}| d|j|j}|j|j\}
}|
d| j|j|jdddd}
|d| j|j|jdddd}|jdkrxt|
}
|jdkrt|}|
|f}d}t|d	ddur|jnd}|| |j }d}|d
d}| ||}tjj||| j|| jj | j!|||j"|| j#d}|d|j|j S )z/Run forward for decode using TRTLLM MHA kernel.r   r   r   rp   rq   NrZ   r      r            ?k_scale_floatsinks)querykv_cacherD   block_tablesr   max_seq_len
bmm1_scale
bmm2_scalewindow_leftr   	out_dtype)$r   r   r   r   set_kv_bufferr   r   r:   rA   r   r   
contiguousviewtp_q_head_numhead_dimr   rs   r<   tp_k_head_numpermutetp_v_head_numr
   getattrr   scalingr1   rw   
flashinferdecode!trtllm_batch_decode_with_kv_cacherD   rP   r   r7   sliding_window_sizer;   )rQ   r   r   r   rp   rq   r   r   r   use_fused_fp8_pathr   r   r   q_scaler   r   r   attention_sinkr   or"   r"   r#   forward_decode  st   

z"TRTLLMHAAttnBackend.forward_decodec                 K  s  |j }| ||}	|	r| j|||||d d }d }n|r-|d ur-|j|||||j|j | jtj	kr9|
tj	}| d|j|j}|j|j\}
}|
d| j|j|jdddd}
|d| j|j|jdddd}|jdkrxt|
}
|jdkrt|}|
|f}|dd }d}t|d	d d ur|jnd}|| |j }d}| ||}|j rtjj||| j || j!j"| j#|||j$|| j%| j!j&d
}n"tj'j(||| j || j!j"| j!j&| j#|||j)| j!j*| j!j+|j$|| j%d}|d|j|j S )Nr   rZ   r   r   r   r   r   r   r   )r   r   rD   r   r   r   r   r   r   r   r   q_len_per_req)r   r   rD   r   r   	max_q_len
max_kv_lenr   r   r   cum_seq_lens_qcum_seq_lens_kvr   r   r   ),r   r   r   r   r   r   r   r:   rA   r   r   r   r   r   r   r   rs   r<   r   r   r   r
   r1   r   r   r   rw   r   r   r   r   r   rD   rP   r   r7   r   r;   r   prefill"trtllm_batch_context_with_kv_cacher   r   r   )rQ   r   r   r   rp   rq   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r"   r#   forward_extend  s   



z"TRTLLMHAAttnBackend.forward_extend)FNNr   )
r%   r   r&   r'   r(   r)   r*   r)   r+   r   )rX   r   rY   r)   )r_   r   r`   r   rY   r)   )rc   r   rd   r   re   r   )rc   r   ri   rj   rk   rl   rm   r   )rp   r   rq   r   rY   r   )N)r_   r   rx   r   ry   r)   )rm   r   r   r   r   r   r   r   r   r)   r   r   r   r   )rm   r   r   r   r   r   r   r   r   r)   r   r   r   r   r   r)   )rY   r   )r   r'   r   r   rY   r'   )
r   r   r   r   r   r   rp   r   rq   r   rq   r   )T)r   r   r   r   r   r   rp   r   rq   r   r   r'   rY   r   )r   r   r    __doc__r4   r^   rb   rh   ro   rw   r   r   r   r   r   r   r   r   r   __classcell__r"   r"   rV   r#   r$   @   s2    
J




\ 


i


x\r$   c                      sJ   e Zd ZdZd fddZdddZdddZdddZdddZ  Z	S )!TRTLLMHAAttnMultiStepDraftBackendz5Multi-step TRTLLM MHA attention kernel used by EAGLE.r%   r   rH   r   speculative_num_stepsc                   sH   t  ||| t| jd D ]}t|d| j| | j|d| j|< qd S )Nr   T)r&   r(   r*   r+   )r3   r4   ranger   r$   	kv_indptrkv_last_page_lenattn_backends)rQ   r%   rH   r   irV   r"   r#   r4   ^  s   z*TRTLLMHAAttnMultiStepDraftBackend.__init__rq   r   c                 C  s(   t | jd D ]
}| j| | qd S Nr   )r   r   r   r   rQ   rq   r   r"   r"   r#   r   k  s   z7TRTLLMHAAttnMultiStepDraftBackend.init_forward_metadatar_   rx   c                 C  s*   t | jd D ]}| j| || qd S r   )r   r   r   r   )rQ   r_   rx   r   r"   r"   r#   r   o  s   z7TRTLLMHAAttnMultiStepDraftBackend.init_cuda_graph_statec              
   C  sf   |j d usJ |j  sJ t| jd D ]}| j| j|j|j| j |j|j	|j
tj|j d qd S )Nr   )r   r   r   )r   is_draft_inputr   r   r   r   r   rH   r   r   r   r   DECODEr   r"   r"   r#   r   s  s   

zJTRTLLMHAAttnMultiStepDraftBackend.init_forward_metadata_capture_cuda_graphrm   c                 C  sb   |j d usJ |j  sJ t| jd D ]}| j| j||j|j|j|j	t
j|j |jd qd S )Nr   )r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   )rQ   rq   rm   r   r"   r"   r#   r     s   
zITRTLLMHAAttnMultiStepDraftBackend.init_forward_metadata_replay_cuda_graph)r%   r   rH   r   r   r   r   )r_   r   rx   r   )rq   r   rm   r   )
r   r   r    r   r4   r   r   r   r   r   r"   r"   rV   r#   r   [  s    


r   ))
__future__r   loggingdataclassesr   typingr   r   rA   sglang.srt.environr   .sglang.srt.layers.attention.flashinfer_backendr   r   ;sglang.srt.layers.attention.triton_ops.trtllm_fp8_kv_kernelr	   !sglang.srt.layers.attention.utilsr
   $sglang.srt.mem_cache.swa_memory_poolr   r   ,sglang.srt.model_executor.forward_batch_infor   r   sglang.srt.utilsr   	getLoggerr   loggerr   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr    sglang.srt.speculative.spec_infor   r2   r@   r   r$   r   r"   r"   r"   r#   <module>   s@    
      !