o
    پiI                     @  s  d dl mZ 	 d dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
Z
d dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZ d d	lmZ d d
lmZmZ d dl m!Z! d dl"m#Z#m$Z$ e# rpd dl%Z%erd dl&m'Z' d dl(m)Z) d dl*m+Z+ e,e-Z.dZ/dZ0ej1d"ddZ2ej1d"ddZ3dd Z4da5eG dd dZ6eG dd dZ7G dd deZ8G d d! d!eZ9dS )#    )annotationsN)	dataclass)TYPE_CHECKINGOptionalUnion)is_in_piecewise_cuda_graph)FlashInferMLAAttnBackend"FlashInferMLAMultiStepDraftBackend)concat_mla_absorb_q_general!create_flashmla_kv_indices_tritonget_num_page_per_block_flashmlamla_quantize_and_rope_for_fp8)get_attention_tp_size)scaled_fp8_quant)ForwardBatchForwardMode)get_global_server_args)is_flashinfer_availableis_float4_e2m1fn_x2)RadixAttention)ModelRunner)	SpecInput      
BLOCK_SIZEtl.constexprc	                 C  s  t d}	t d}
t d}|	| }|	| }||krdS t || }||kr*dS t || }|| }|
| }t || |}t d||| k }|| }t || |}t d||| k }|| | |t d| dddf |  |t d| dddf  }t j| | |dddf |dddf @ dd}|| | | || |  |t d| dddf |  |t d| dddf  }t j|| ||dddf |dddf @ d dS )z`Triton kernel for padding draft extended query tensor with parallelized head and dim processing.r         N        maskotherr    tl
program_idloadminimumarangestore)q_ptrpadded_q_ptrseq_lens_q_ptr
cumsum_ptr
batch_sizemax_seq_len	num_headshead_dimr   batch_seq_pidhead_piddim_pidbatch_idseq_posseq_leninput_start	input_pos
head_starthead_end	head_mask	dim_startdim_enddim_maskinput_offsetdataoutput_offset rC   b/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/trtllm_mla_backend.pypad_draft_extend_query_kernel7   sT   





rE   c	                 C  s  t d}	t d}
t d}|	| }|	| }||krdS t || }||kr*dS t || }|| }|
| }t || |}t d||| k }|| }t || |}t d||| k }|| | | || |  |t d| dddf |  |t d| dddf  }t j| | |dddf |dddf @ dd}|| | |t d| dddf |  |t d| dddf  }t j|| ||dddf |dddf @ d dS )zcTriton kernel for unpadding draft extended output tensor with parallelized head and dim processing.r   r   r   Nr   r   r"   r#   )raw_out_ptr
output_ptraccept_length_ptrr-   r.   token_per_batchtp_q_head_num
v_head_dimr   r2   r3   r4   r5   r6   
accept_lenoutput_start
output_posr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rC   rD    unpad_draft_extend_output_kernel   sT   





rO   c           	      C  s   |  tj} t|dd }|d u rd}|dkr7t|dsJ dt|d|jd  |j	\}}||j}n| tj}t|dd }|d u rId}|dkrnt|dsVJ dt|d|jd  |j
\}}||j}n| tj}| ||||fS )	Nk_scale_float      ?k_scalezk_scale is not setv_scale_floatv_scalezv_scale is not set)totorchfloat8_e4m3fngetattrhasattrr   reshapeshape
contiguousrR   rU   )	qkvlayerrR   k_2d_rU   v_2drC   rC   rD   _quantize_fp8_qkv   s,   re   c                   @  s6   e Zd ZU dZded< ded< ded< dZded	< d
S )TRTLLMMLAPrefillMetadataz+Metadata for TRTLLM MLA prefill operations.intr/   torch.Tensorcum_seq_lensseq_lensFboolfallback_to_flashinfer_implN)__name__
__module____qualname____doc____annotations__rl   rC   rC   rC   rD   rf      s   
 rf   c                   @  sf   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	ded	< dZ
ded
< dZded< dS )TRTLLMMLADecodeMetadataz*Metadata for TRTLLM MLA decode operations.NOptional[torch.Tensor]block_kv_indiceszOptional[int]max_seq_len_kmax_seq_len_qsum_seq_lens_qcu_seqlens_q
seq_lens_q
seq_lens_k)rm   rn   ro   rp   rt   rq   ru   rv   rw   rx   ry   rz   rC   rC   rC   rD   rr      s   
 rr   c                      s   e Zd ZdZ			dOdP fddZdQddZdRddZ	dSdT fddZdU fd'd(ZdV fd+d,Z	dWd-d.Z
dX fd1d2ZdX fd3d4ZdYd9d:ZdZd=d>Z	?					d[d\dKdLZ	?					d[d\ fdMdNZ  ZS )]TRTLLMMLABackendz,TRTLLM MLA attention kernel from flashinfer.FNmodel_runnerr   skip_prefillrk   kv_indptr_bufrs   q_indptr_decode_bufc                   s  t  |||| |j}|jt  | _|t | _|jt  | _|j	| _	|j
| _
|j| _|j| _| j	| j | _|j| _|j| _|j| _|j| _|jj| _td d | _td u rctj| jtj|jdat| _i | _d | _d | _ d | _!d | _"d | _#t$ j%| _%|j&j'| _(d S )Ni   dtypedevice))super__init__model_confignum_attention_headsr   num_q_headsget_num_kv_headsnum_kv_headsnum_local_headskv_lora_rankqk_nope_head_dimqk_rope_head_dimrK   kv_cache_dimscalingkv_cache_dtype	data_typer   q_data_type	page_sizereq_to_token_poolreq_to_tokenDEFAULT_WORKSPACE_SIZE_MBworkspace_size!global_zero_init_workspace_bufferrW   zerosuint8r   workspace_bufferdecode_cuda_graph_metadatadecode_cuda_graph_kv_indicespadded_q_bufferunpad_output_bufferforward_prefill_metadataforward_decode_metadatar   disable_chunked_prefix_cacheserver_argsspeculative_num_draft_tokensnum_draft_tokens)selfr|   r}   r~   r   config	__class__rC   rD   r     sJ   
zTRTLLMMLABackend.__init__r/   rg   returnc                 C  sN   t || j}t| j }t| j}t||}|| dkr%t ||| }|S )z
        Calculate padded block count that satisfies both TRT-LLM and Triton constraints.

        Args:
            max_seq_len: Maximum sequence length in tokens

        Returns:
            Number of blocks padded to satisfy all constraints
        r   )tritoncdivr   TRTLLM_BLOCK_CONSTRAINTr   mathlcm)r   r/   blockstrtllm_constrainttriton_constraintconstraint_lcmrC   rC   rD   _calc_padded_blocks>  s   


z$TRTLLMMLABackend._calc_padded_blocksr.   
max_blocksreq_pool_indicesrh   rj   r   torch.devicec              
   C  sF   t j||fdt j|d}t|f | j||d|| jd|| jd |S )ag  
        Create block KV indices tensor using Triton kernel.

        Args:
            batch_size: Batch size
            max_blocks: Maximum number of blocks per sequence
            req_pool_indices: Request pool indices
            seq_lens: Sequence lengths
            device: Target device

        Returns:
            Block KV indices tensor
        rS   r   Nr   
PAGED_SIZE)rW   fullint32r   r   strider   )r   r.   r   r   rj   r   rt   rC   rC   rD   _create_block_kv_indicesU  s   
z)TRTLLMMLABackend._create_block_kv_indicesmax_bsmax_num_tokenskv_indices_bufc                   s   |  | j}tj||fdtj| jd| _|| }t| jrFtj	| _
tj||d | j| jf| j
| jd| _tj|d | jdf| j
| jd| _n tj||| j| jf| j| jd| _tj|| jdf| j| jd| _t ||| dS )z+Initialize CUDA graph state for TRTLLM MLA.rS   r   r   i   N)r   max_context_lenrW   r   r   r   r   r   r   r   store_dtyper   r   r   r   r   r   init_cuda_graph_state)r   r   r   r   max_blocks_per_seqnum_tokens_per_bsr   rC   rD   r   {  s8   


z&TRTLLMMLABackend.init_cuda_graph_statebs
num_tokensencoder_lensforward_moder   	spec_infoOptional[SpecInput]c              
     s  |  s| s|jddst |||||||S t }| r>|| j }tj|ftj	|j
d|_|j|jtj	d nO|jddr|| }	|	|_|	| |_tjd||	 d |	tj	|j
d|_tj|f|	tj	|j
d|_||j |j }tj|ftj	|j
d|_|j|jtj	d | | j}
| jd|d|
f }t|f | j||d|| jd|
| jd ||_| j|_|| j|< || _dS )	z+Initialize metadata for CUDA graph capture.T
include_v2r   r   r   r   Nr   )is_decode_or_idleis_target_verifyis_draft_extendr   (init_forward_metadata_capture_cuda_graphrr   r   rW   r   r   r   rz   copy_rV   rv   rw   r(   rx   r   ry   r   r   r   r   r   r   r   rt   ru   r   r   )r   r   r   r   rj   r   r   r   metadatar   r   rt   r   rC   rD   r     sr   







z9TRTLLMMLABackend.init_forward_metadata_capture_cuda_graphseq_lens_sumseq_lens_cpuc	              
     st  |  s| s|jddst ||||||||S | j| }	| r:|d| | j }|	j|j	t
jd ~n`|jddr|jd| }
|jrct|jd| d |	_t|jd| | |	_nd|	_||	_|
d }|	jdd t
j|dt
jd |	j| |d| |	j |	j }|	j|	t
j t|f | j|d| |d|	j| jd|	jjd | jd dS )	z"Replay CUDA graph with new inputs.Tr   Nr   r   r   dimr   r   )r   r   r   r   'init_forward_metadata_replay_cuda_graphr   r   rz   r   rV   rW   r   accept_lengthaccept_length_cpumaxrv   sumrw   rx   cumsumry   r   r   rt   r   r\   r   )r   r   r   rj   r   r   r   r   r   r   r   extend_seq_lensr   rC   rD   r     sZ   





z8TRTLLMMLABackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS )z6Get the fill value for sequence lengths in CUDA graph.r   rC   )r   rC   rC   rD   !get_cuda_graph_seq_len_fill_value>  s   z2TRTLLMMLABackend.get_cuda_graph_seq_len_fill_valueforward_batchr   c                   s  |j  rR|j  sR|j jddsRt|j}| jo|pt }|r&t 	| |j
|j }ttjdtj|j
jdtj|ddf }t|j}t||||| _dS |j  sd|j  sd|j jddr|j}t | _t|dddur{|j  }n|j
  }|j
}|j  r|| j }|| j }|tj| j_ nA|j jddrt!|j}	t|j}
tj"j#$tj|j%dtjd	d
}||j% |
 }|
| j_&|	| j_'|| j_(|j%| j_)|tj| j_ | *|}| +|||j,||j}|| j_-t|| j_.|| j_| j|_/dS t 	|S )z+Initialize the metadata for a forward pass.Tr   r   r   r   r   r   Nr   r   r   )0r   	is_extendr   r   anyextend_prefix_lens_cpur   r   r   init_forward_metadatarj   extend_prefix_lensrW   catr   r   r   r   rg   r   extend_seq_lens_cpurf   r   r   r.   rr   r   rY   r   itemr   rV   rz   r   nn
functionalpadr   rv   rw   rx   ry   r   r   r   rt   ru   decode_trtllm_mla_metadata)r   r   
has_prefixrl   rj   cum_seq_lens_qr/   r   max_seqrw   rv   rx   max_seqlen_padrt   r   rC   rD   r   B  s   










z&TRTLLMMLABackend.init_forward_metadatac                   s   t  j|dd d S )NT)disable_flashinfer_ragged)r   init_mha_chunk_metadata)r   r   r   rC   rD   r     s   z(TRTLLMMLABackend.init_mha_chunk_metadatar^   padded_qry   rx   c                 C  sx   |j d d }|j d }|j d }|j d }d}	t||	}
t||	}|| |
|f}t| |||||||||	d	 |S )z-Pad draft extended query using Triton kernel.r   r   r      @   )	r*   r+   r,   r-   r.   r/   r0   r1   r   )r\   r   r   rE   )r   r^   r   ry   rx   r.   rv   r0   r1   r   num_head_blocksnum_dim_blocksgridrC   rC   rD   pad_draft_extend_query  s(   


z'TRTLLMMLABackend.pad_draft_extend_queryraw_outrw   c                 C  s   |j d }|j d }|j d }|j d }|}	| jdur/| jd|	ddddf j|jd}
ntj|	||f|j|jd}
d}t||}t||}|| ||f}t	| ||
|||||||d		 |
d|	ddddf S )
z0Unpad draft extended output using Triton kernel.r   r   r   r   Nr   r   r   )	rF   rG   rH   r-   r.   rI   rJ   rK   r   )
r\   r   rV   r   rW   emptyr   r   r   rO   )r   r   rx   ry   rw   r.   rI   rJ   rK   total_tokensoutputr   r   r   r   rC   rC   rD   unpad_draft_extend_output  s<   
	



z*TRTLLMMLABackend.unpad_draft_extend_outputTr_   r`   ra   r   save_kv_cacheq_ropek_ropecos_sin_cacheis_neoxOptional[bool]llama_4_scalingc                 C  s2  |du}| j tjkr4tdd |||	fD sJ dt|||d|d|j|	|
| j| j	\}}}d}|rL|dur>|dusBJ d|j	
||j|| |ri|d|j|j}|d|j|j|j }t||}n	|d|j|j}|dur|| j| }|| j }| d	kr|d}|j	|j}|d| j| jd}t|d
dp| j}t|dd}|dur||jk r| | |j}d}| j tjkrt|dddur|jnd}nt|dddurt d|j| j  d}|| |j! }t"j#j$||| j%| j&| j| j|j'|j(tj)|j*|d
}|d|j|j }|S )z/Run forward for decode using TRTLLM MLA kernel.Nc                 s      | ]}|d uV  qd S NrC   .0xrC   rC   rD   	<genexpr>       
z2TRTLLMMLABackend.forward_decode.<locals>.<genexpr>xFor FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None.r   FNFor populating trtllm_mla kv cache, both k_nope and k_rope should be not None.rS   r   r   r.   rQ   rP   lCheckpoint has k_scale but KV cache dtype is not FP8. Ignoring k_scale for BMM1 (k_scale=%.4f, kv_dtype=%s).
querykv_cacher   r   r   r   block_tablesrj   r/   
bmm1_scale)+r   rW   rX   allr   squeeze	positionsr   r   token_to_kv_poolset_mla_kv_bufferout_cache_locviewrJ   rK   r1   r
   rV   r   r   	unsqueezeget_key_bufferlayer_idr   r   rY   r   r.   r   r   rP   loggerwarning_oncer   
flashinferdecode%trtllm_batch_decode_with_kv_cache_mlar   r   rt   rj   r   ru   )r   r^   r_   r`   ra   r   r  r  r  r  r  r  merge_queryq_nopeq_rope_reshapedr  k_cacher  r   r.   q_scalerR   r  r   r   rC   rC   rD   forward_decode  s   





zTRTLLMMLABackend.forward_decodec           )        s  | j d ur| j jrt ||||||||S |d u}| jtjkrO|j rOt	dd |||	fD s5J dt
|||d|d|j|	|
| j| j	\}}}d}|rg|d urY|d us]J d|j||j|| |r|d|j|j}|d|j|j|j }t||}|d|j|j}|d ur|| j| }|| j}|j s|jjdd	rt|d
d p| j}t|dd }|d ur||jk r| | |j}|j}|j|j }|d| j!| j"#d}d}| jtjkrt|dd d ur|j$nd}nt|dd d ur	t%&d|j$| j d}|| j}|| |j' }|j r2|j(|j)j* }||d|j|j}d}n|j+d }|dkr@|| nd}|dkoL|| dk}|rb|j(| }||||j|j}d}n]|j,}t-|j.}|j(| }tj/j01tj2|dtj3dd}| j4d ur| j4d |d |d d d d f j|j5d}|6  ntj7|||j|jf|j5|j8d}| 9||||}d}|} |}!|}"|j5| jksJ t:j;j<||| j=| j>| j| j|j?|j@||d
}#|r| A|#|!| |"}$|$d|j|j }$|$S |#d|j|j }$|$S |d urtjB||gdd}|d|jC|j}|d|jC|j}d } }}%| jtjkr:tD||||\}}}}}%|||| j=|jdd| j jE|| |j' |%| j jFd}&|jGr|jHd us_J |jId usgJ |d u snJ |d u suJ |jH}'tj7|j+d |j|j| j|j8d}(t:jJjKdi |&|jL|' |jM|' d|jI|' dd|(dS tj7|j+d |j+d |j+d |j8| jd}(t:jJjKdi |&| j jN| j jEd| j jFd|jO|(dS )Nc                 s  r	  r
  rC   r  rC   rC   rD   r    r  z2TRTLLMMLABackend.forward_extend.<locals>.<genexpr>r  r   Fr  rS   Tr   r   r.   rQ   rP   r  r   r   r   r   r   r  r   )r  keyvaluer   r.   window_left
enable_pdl	max_q_lenr  
bmm2_scaler   g      )rj   
max_kv_len
o_sf_scalecum_seq_lens_kv	is_causal
return_lseoutr   )r   r   rC   )Pr   rl   r   forward_extendr   rW   rX   r   r   r  r   r  r  r   r   r  r  r  r  rJ   rK   r1   r
   rV   r   r   rY   r   r.   r   r   r   r!  r   r   r  rP   r"  r#  r   ru   r   draft_token_numr\   r   r   r   r   r   r   r   r   r   r   zero_r   r   r   r$  r%  r&  r   r   rt   rz   r  r   tp_k_head_numre   r/   ri   attn_attend_prefix_cacheprefix_chunk_idxprefix_chunk_cu_seq_lensprefill trtllm_ragged_attention_deepseekprefix_chunk_seq_lensprefix_chunk_max_seq_lensrj   mha_return_lse))r   r^   r_   r`   ra   r   r  r  r  r  r  r  r'  r(  r)  r   r.   r   r*  r  r+  rR   r  r/   needs_unpadr   tokens_per_seqcan_direct_viewactual_seq_lens_qactual_max_seq_len_qactual_cu_seqlens_qr   unpad_seq_lens_qunpad_cu_seqlens_qunpad_sum_seq_lens_qr   r   rU   common_trtllm_args	chunk_idxr8  r   rC   rD   r9  m  s  















zTRTLLMMLABackend.forward_extend)FNN)r|   r   r}   rk   r~   rs   r   rs   )r/   rg   r   rg   )r.   rg   r   rg   r   rh   rj   rh   r   r   r   rh   r
  )r   rg   r   rg   r   rs   )r   rg   r   rg   r   rh   rj   rh   r   rs   r   r   r   r   )r   rg   r   rh   rj   rh   r   rg   r   rs   r   r   r   r   r   rs   )r   rg   )r   r   )
r^   rh   r   rh   ry   rh   rx   rh   r   rh   )
r   rh   rx   rh   ry   rh   rw   rg   r   rh   )TNNNFN)r^   rh   r_   rh   r`   rh   ra   r   r   r   r  rk   r  rs   r  rs   r  rs   r  r  r  rs   r   rh   )rm   rn   ro   rp   r   r   r   r   r   r   r   r   r   r   r  r,  r9  __classcell__rC   rC   r   rD   r{      s@    
<
*/S
AX

 7 r{   c                      s"   e Zd ZdZd	 fddZ  ZS )
TRTLLMMLAMultiStepDraftBackendz7Multi-step draft backend for TRT-LLM MLA used by EAGLE.r|   'ModelRunner'topkrg   speculative_num_stepsc                   sF   t  ||| t| jd D ]}t|d| j| | jd| j|< qd S )Nr   T)r}   r~   r   )r   r   rangerT  r{   	kv_indptrq_indptr_decodeattn_backends)r   r|   rS  rT  ir   rC   rD   r   z  s   z'TRTLLMMLAMultiStepDraftBackend.__init__)r|   rR  rS  rg   rT  rg   )rm   rn   ro   rp   r   rP  rC   rC   r   rD   rQ  w  s    rQ  )r   r   ):
__future__r   loggingr   dataclassesr   typingr   r   r   rW   r   triton.languagelanguager$   0sglang.srt.compilation.piecewise_context_managerr   2sglang.srt.layers.attention.flashinfer_mla_backendr   r	   !sglang.srt.layers.attention.utilsr
   r   r   r   sglang.srt.layers.dp_attentionr   )sglang.srt.layers.quantization.fp8_kernelr   ,sglang.srt.model_executor.forward_batch_infor   r   sglang.srt.server_argsr   sglang.srt.utilsr   r   r$  !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr    sglang.srt.speculative.spec_infor   	getLoggerrm   r"  r   r   jitrE   rO   re   r   rf   rr   r{   rQ  rC   rC   rC   rD   <module>   sX    
HF	      ~