o
    پi|                     @  s8  d dl mZ d dlmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZmZmZ erjd dlmZ d dl m!Z! d dl"m#Z# dd Z$eG dd dZ%G dd deZ&G dd dZ'e	j(dddZ)	d ddZ*	d ddZ+dS )!    )annotations)	dataclass)TYPE_CHECKINGListOptionalN)AttentionBackend)#create_flashinfer_kv_indices_triton)get_attention_tp_size)AttentionType)ForwardBatchForwardMode) generate_draft_decode_kv_indices)get_bool_env_varget_device_core_countget_int_env_varnext_power_of_2)RadixAttention)ModelRunner)	SpecInputc                 C  s   | dkr|S t  )Ntanh)
ValueError)logit_capping_method	logit_cap r   ^/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/triton_backend.pylogit_capping_mod   s   r   c                   @  sv   e Zd ZU ded< ded< ded< ded< ded< ded< ded	< ded
< ded< ded< ded< ded< ded< dS )ForwardMetadatatorch.Tensorattn_logitsattn_lseintmax_extend_lennum_kv_splits	kv_indptr
kv_indices	qo_indptrcustom_maskmask_indptrwindow_kv_indptrwindow_kv_indiceswindow_num_kv_splitswindow_kv_offsetsN)__name__
__module____qualname____annotations__r   r   r   r   r   %   s   
 r   c                      s   e Zd Z		dCdD fd	d
ZdEddZdFddZ		dGdHddZdId#d$ZdJd'd(Zd)d* Z	d+d, Z
dKd0d1Z	2	dLdMd8d9ZdNd?d@Z	2	dLdMdAdBZ  ZS )OTritonAttnBackendFNmodel_runnerr   skip_prefillboolkv_indptr_bufOptional[torch.Tensor]c           	        sv  ddl m} ddlm}m}m} t   tj	
|| _tj	
|| _tj	
|| _tj	
|| _|| _|jj}|j| _|jj| _|j| _|jj| _|jj| _|jjt  | _|jt | _|jd usi|jd urp|j | _n
|j dj!d | _|jj"| _#|j$| _$t%|j&| _'t(dd| _)|jj*| _+|jj,o|jj-dk| _.|jj/| _0| j0rt1dd| _2d	| _)n|jj3| _2| j2d ur| j#| j2 d
 | j2 | _+|jd ur|jj4rJ d|d u rtj5|d
 ftj6|j$d| _7n|| _7d | _8| jd ur| jdkr|d u rtj5|d
 ftj6|j$d| _8nt9|| _8| js3tj5|d
 ftj6|j$d| _:tj5|d
 ftj;|j$d| _<d | _=d | _>d S )Nr   )decode_attention_fwd)build_unified_kv_indicesextend_attention_fwdextend_attention_fwd_unified*SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITSfalse$SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE   F   z=Sliding window and cross attention are not supported togetherdtypedevice)?7sglang.srt.layers.attention.triton_ops.decode_attentionr6   7sglang.srt.layers.attention.triton_ops.extend_attentionr7   r8   r9   super__init__torchcompilerdisabler2   req_to_token_poolsizesliding_window_sizereq_to_tokentoken_to_kv_pool_allocatorserver_argsspeculative_num_draft_tokensnum_draft_tokensspeculative_num_stepsmodel_confignum_attention_headsr	   num_headget_num_kv_headsnum_kv_headhybrid_gdn_configkimi_linear_configtoken_to_kv_poolget_v_head_dim
v_head_dimget_value_buffershapecontext_lenmax_context_lenrB   r   gpu_iddevice_core_countr   static_kv_splitstriton_attention_num_kv_splitsmax_kv_splitsdisable_cuda_graphchunked_prefill_size'allow_bidirectional_attention_in_extendenable_deterministic_inferenceenable_deterministicr   split_tile_size triton_attention_split_tile_sizeis_encoder_decoderzerosint32r#   r(   
zeros_liker%   int64r'   forward_metadatacuda_graph_custom_mask)	selfr1   r2   r4   r6   r7   r8   r9   max_bs	__class__r   r   rF   8   s   














zTritonAttnBackend.__init__r"   r   seq_lensc                 C  s   |j d |j d }}|| }|| |ks J d| d| d| js(| jdkr3| js3|| j d S | jd urW| jrW|dkrE||}n|}|| j d | j |d d < d S |dk r^d}nt	|}t
d ||||| j| j| j| j|d	 d S )	Nr   znum_seq(z), num_token(z), something goes wrong!r?   r>   )r?   )MAX_NUM_SEQ)r^   rc   rb   rj   fill_re   rk   repeat_interleavetritonr   get_num_kv_splits_tritonrU   rW   )rt   r"   rx   	num_tokennum_seq	num_groupexpanded_seq_lensSCHEDULE_SEQr   r   r   get_num_kv_splits   sD   


z#TritonAttnBackend.get_num_kv_splitsforward_batchr   c                 C  sf  |j }| j}| j}d}d}d}|j}|j r|du rtj|jdd|d|d < |d|d  }tj	|j
tj| jd}	t|f | j|j|j|d|	| jd | jdur| jdkrt| j| j| j|j|j|| j| j\}}}
}tj	|ftj| jd}| ||
 n|j|j}}	|jd d }tj	|| j| j| jftj| jd}tj	|| j| jftj| jd}tj	|ftj| jd}| ||j d}d}d}d}nT|j rst|j}tjdd| | j | jtj| jd}tj|jdd|d|d < |d|d  }tj	|d tj| jd}	t|f | j|j|j|d|	| jd | jdurA| jdkrAt| j| j| j|j|j|| j| j\}}}
}|j }| j|j| j  }| j!}tj|d| dd|d|d < |d|d  }| j}d}d}d}n|j" r|#|j|jd| j\}	}}}|	$tj}	d}t%|j&' }d}d}d}ntj|j(dd|d|d < |d|d  }tj	t)|j*tj| jd}	t|f | j|j|j(|d|	| jd | jdur| jdkrt| j| j| j|j(|j|| j| j\}}}
}| j+}tj|j,dd|d|d < |d|d  }d}d}d}d}t%|j-}d}t.||||||	|||||||| _/dS )z6Init auxiliary variables for triton attention backend.Nr   dimr?   r@   steprA   rB   r:   )0
batch_sizer#   r(   	spec_infoforward_modeis_decode_or_idlerG   cumsumrx   emptyseq_lens_sumrq   rB   r   rM   req_pool_indicesstriderL   update_sliding_window_bufferrN   ro   r   r$   r^   rU   re   r\   float32is_target_verifylenarangerQ   r&   r'   is_draft_extendgenerate_attn_arg_prefilltomaxaccept_lengthitemextend_prefix_lenssumextend_prefix_lens_cpur%   extend_seq_lensextend_seq_lens_cpur   rr   )rt   r   bsr#   r(   r)   r*   r+   r   r$   window_kv_lens_r   r   r"   r%   r&   r'   r!   seq_mask_lenr   r   r   init_forward_metadata   sj  








"




z'TritonAttnBackend.init_forward_metadataru   r    max_num_tokenskv_indices_bufcuda_graph_num_kv_splits_bufc                 C  sD  t j|| j| j| jft j| jd| _t j|| j| jft j| jd| _|d u r5t j	|f| jt j
| jd| _n|| _|d u rKt j|| j t j| jd| _n|| _| js_t j|| j t j| jd| _| jd ur| jdkr|d u r|t j|| j t j| jd| _nt || _t j	|f| jt j
| jd| _t j|ft j
| jd| _d S d S d S )Nr@   r   )rG   rn   rU   re   r\   r   rB   cuda_graph_attn_logitscuda_graph_attn_lsefullro   cuda_graph_num_kv_splitsr`   rq   cuda_graph_kv_indicesr2   uint8rs   rL   cuda_graph_window_kv_indicesrp   cuda_graph_window_num_kv_splitscuda_graph_window_kv_offsets)rt   ru   r   r   r   r   r   r   init_cuda_graph_state  sf   


z'TritonAttnBackend.init_cuda_graph_stater   
num_tokensr   encoder_lensr   r   r   Optional[SpecInput]c                 C  sz  |d u sJ d| j }d }	d }
d }| r|d u rn| j}tj|dd|d|d < |d |d  }| j}t|f | j|||d || jd | j	d urm| j	dkrm| j
}	| j}
t| j |	| j| j	|d | ||| j\}}	}}n|j|j}}| j}| j}d }| j}d }d }d }n"| r?| jd |d  }tjdd| | j | jtj| jd|d |d < | jd |d  }tj|dd|d|d < | j}t|f | j|||d || jd | j	d ur	| j	dkr	| j
}	| j}
| j}t| j |	| j| j	|d | ||| j\}}	}|d |< | j}|j|d |jjd < | j|| j  }| jd |d  }tj|dd|d|d < | j}d }d }d }nk|jddr| jd }| jd |d  }tjd|| d |tj| jd|d |d < | jd |d  }tj|dd|d|d < | j}t|f | j|||d || jd d }d }|}d }d }d }ntd|d	t |||||||||||	|
|| _!d S )
NzNot supportedr   r   r?   r   T
include_v2#Invalid forward mode: forward_mode=z for CUDA Graph capture.)"r(   r   r#   rG   r   r   r   rM   r   rL   r   r   'update_sliding_window_buffer_cuda_graphrN   r$   r   r   r   r   r%   r   rQ   ro   rB   r   rs   r&   r^   r'   r   rR   r   r   rr   )rt   r   r   r   rx   r   r   r   r(   r)   r*   r+   r#   r$   r   r   r   r!   r"   r%   r&   r'   r   num_tokens_per_bsr   r   r   (init_forward_metadata_capture_cuda_graph  s  













	

z:TritonAttnBackend.init_forward_metadata_capture_cuda_graphr   seq_lens_cpuc	              
   C  sL  |  r| j}	| j}
| j}|d u rtj|d | dd|	d|d < |	d |d  }	t|f | j|d | |d | |	d |
| jd |}| j	d ur| j	dkr| j
}| j}t| j|| j| j	|d | |d | || j\}}}}| |d | |d |  nJ d| |d | |d |  d S | rGt|}| jd |d  }tjdd| | j | jtj| jd|d |d < | jd |d  }	tj|dd|	d|d < | j}
t|f | j|||	d |
| jd | j	d ur| j	dkr| j
}| j}| j}t| j|| j| j	|d | ||| j\}}}|d |< | j}|j|d |jjd < | j|| j  }| jd |d  }tj|dd|d|d < d S |jddr|d | }|jd | }| jd |d  }tj|dd|d|d < | jd |d  }	tj|dd|	d|d < | j}
t|f | j|||	d |
| jd d S td	|d
)Nr   r   r?   Fz,Multi-step cuda graph init is not done here.r   Tr   r   z for CUDA Graph replay.)r   r#   r   r   rG   r   r   rM   r   rL   r   r   r   r(   rN   r   r   r   r%   r   rQ   ro   rB   r   rs   r&   r^   r'   r   r   r   )rt   r   r   rx   r   r   r   r   r   r#   r$   r"   r~   r*   r)   r   r   r%   r+   r&   r   r'   accept_lensr   r   r   'init_forward_metadata_replay_cuda_graph  s   "


	




 

	



z9TritonAttnBackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS Nr?   r   rt   r   r   r   !get_cuda_graph_seq_len_fill_value  s   z3TritonAttnBackend.get_cuda_graph_seq_len_fill_valuec                 C  s
   | j dgS )z
        Return buffers for verify attention kernels that needs to be filled after draft.

        Typically, these are tree mask and position buffers.
        N)rs   r   r   r   r   &get_verify_buffers_to_fill_after_draft  s   
z8TritonAttnBackend.get_verify_buffers_to_fill_after_draftr   cuda_graph_bsOptional[int]c                 C  s   d S Nr   )rt   r   r   r   r   r   )update_verify_buffers_to_fill_after_draft  s   z;TritonAttnBackend.update_verify_buffers_to_fill_after_draftTqkvlayerr   c                 C  sn  |j |jkr||jd |j|j f}nt|}|r&|j||j	|| t
|j|j}	d}
|jsA|jtjksA|jtjkrC| jrCd}
| jrQ| |||||
|	|S |jd urk|jdkrk|j}| jj}| jj}| jj}nd}| jj}| jj}d }| j|d|j|j | | |d|j|j|j|j |j!|j | jj"||| jj#|
| jj$| jj%|j&|	||||j'd |S )Nr   TFr:   )r   rL   sinksr+   xai_temperature_len)(qk_head_dimr\   	new_emptyr^   tp_q_head_numrG   
empty_likerZ   set_kv_bufferout_cache_locr   r   r   is_cross_attention	attn_typer
   ENCODER_ONLYDECODER_BIDIRECTIONALrh   rj   _forward_extend_unifiedrL   rr   r(   r)   r+   r#   r$   r8   view
contiguousget_key_bufferlayer_idr]   r%   r&   r'   r!   scalingr   )rt   r   r   r   r   r   save_kv_cacher   ologits_soft_capcausalrL   r#   r$   r+   r   r   r   forward_extend  sh   


z TritonAttnBackend.forward_extendr   r   r   floatr   c                 C  s  |j }|jdurS|jdkrS|j}	| jj}
| jj}|
d|d  |
d|  }|jdur5|jd| | }n*|jdurPt|jdrP|jd| |jj	 }|| }nd}nd}	| jj
}
| jj}d}|j}|jdu r|jdurt|jdr|jj	}tj|f|tj| jd}ntd|j}|jdu rttjdtj| jdtj|dd ddg}n|j}| |
|||||\}}}|tj}| j|d|j|j|d|j|j|j|j |j!|j | jj"|||| jj#| jj$| jj%|j&|||	|||j'd	 |S )
z
        Unified 1-stage extend attention for deterministic inference.
        Both prefix and extend KV are accessed through unified kv_indices.
        Nr:   r?   draft_token_numr@   zfextend_seq_lens is None but cannot infer from spec_info. This should not happen in TARGET_VERIFY mode.r   r   )	r&   r'   sm_scaler   	is_causalrL   r   window_start_posr   )(r   rL   rr   r(   r)   r   r   hasattrrx   r   r#   r$   r   r   rG   r   ro   rB   RuntimeErrorextend_start_loccatrn   r   r7   r   r9   r   r   r   r\   rZ   r   r   r]   r%   r!   r&   r'   r   r   )rt   r   r   r   r   r   r   r   r   rL   prefix_kv_indptrprefix_kv_indicesr   r   r   extend_kv_indicesr   r   r   unified_kv_indptrunified_kv_indicesprefix_lensr   r   r   r   l  s   



z)TritonAttnBackend._forward_extend_unifiedc                 C  s  | d|j|j }|j|jkr||jd |j|j f}nt|}t|j	|j
}	|r7|j||j|| |jd urJ|jdkrJ| jj}
| jj}n| jj}
| jj}| j|d|j|j|j|j|j|j|d|j|j|
|| jj| jj| jj| j|j|	||jd |S )Nr:   r   )r   r   r   )reshaper   r   r\   r   r^   rG   r   r   r   r   rZ   r   r   rL   rr   r(   r)   r#   r$   r6   r   r   r   r]   r   r   r"   re   r   r   )rt   r   r   r   r   r   r   r   r   r   r#   r$   r   r   r   forward_decode  s>   


z TritonAttnBackend.forward_decode)FN)r1   r   r2   r3   r4   r5   )r"   r   rx   r   r   r   )NN)ru   r    r   r    r   r5   r   r5   )r   r    r   r    r   r   rx   r   r   r5   r   r   r   r   )r   r    r   r   rx   r   r   r    r   r5   r   r   r   r   r   r5   )r   r   r   r   )TN)
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r3   r   r   r   r5   )r,   r-   r.   rF   r   r   r   r   r   r   r   r   r   r   r   __classcell__r   r   rv   r   r0   7   s0    
~
4 R
C 
x

N r0   c                   @  sL   e Zd ZdZdddZdddZdddZdddZdddZd ddZ	dS )!TritonMultiStepDraftBackendzk
    Wrap multiple triton attention backends as one for multiple consecutive
    draft decoding steps.
    r1   r   topkr    rR   c                 C  s   || _ || _|jj| j  }tj| j|d ftj|jd| _g | _	t
| jd D ]}| j	t|d| j| d q'| j	d j| _|jjt  | _|j| _|jjjd | _|jj| _d S )Nr?   r@   T)r2   r4   r   )r   rR   rJ   rK   rG   rn   ro   rB   r#   attn_backendsrangeappendr0   r`   rS   rT   r	   rU   rM   r^   pool_lenrO   	page_size)rt   r1   r   rR   ru   ir   r   r   rF      s2   z$TritonMultiStepDraftBackend.__init__r   r   kv_indices_bufferr5   call_fnc                 C  s   |d u r| j }|j}| j| }|j}t| j|| jf |j|jj|j	|| j
|j| j|jd | j
jd t|t| jt|| j |d u rEd S t| jd D ]'}| j
|d |d f |j_
|| d || j ||d    |j_||| qLd S r   )r   r   r   r   r   rR   r   rJ   rM   rx   r#   	positionsr   r^   r   r   r   r   r$   )rt   r   r   r   num_seqsr   r   r   r   r   r   common_templateC  s>   

z+TritonMultiStepDraftBackend.common_templatec                   sF   t j j|j j  j ft j jd} fdd} ||| d S )Nr@   c                   s4   |j j |j _|j j |j _ j|  | d S r   )r   r#   cloner$   r   r   r   r   r   r   r   r   v  s
   

zBTritonMultiStepDraftBackend.init_forward_metadata.<locals>.call_fn)	rG   r   rR   r   r   r`   rq   rB   r  )rt   r   r$   r   r   r   r   r   l  s   		z1TritonMultiStepDraftBackend.init_forward_metadataru   r   c                 C  s|   t j| j|| j ft j| jd| _t j|f| jd j	t j
| jd| _t| jd D ]}| j| j||| j| | jd q)d S )Nr@   r   r?   )r   r   )rG   rn   rR   r`   rq   rB   r   r   r   re   ro   r   r   r   )rt   ru   r   r   r   r   r   r     s&   

z1TritonMultiStepDraftBackend.init_cuda_graph_statec                   s    fdd}  |d | d S )Nc              	     s4    j |  j|j|j j |j|jd tj|jd d S )N)r   r   r   )	r   r   r   r   r   rx   r   DECODEr   r  r   r   r   r     s   


zUTritonMultiStepDraftBackend.init_forward_metadata_capture_cuda_graph.<locals>.call_fn)r  )rt   r   r   r   r   r   r     s   zDTritonMultiStepDraftBackend.init_forward_metadata_capture_cuda_graphr   c                 C  sJ   |  |d d  |j| j }| jd | jd jd | |jd |  d S )Nr:   )r  r   r   r   r   r   rx   )rt   r   r   r~   r   r   r   r     s   
zCTritonMultiStepDraftBackend.init_forward_metadata_replay_cuda_graphN)r1   r   r   r    rR   r    )r   r   r   r5   r   r    r   )ru   r    r   r    )r   r   r   r    )
r,   r-   r.   __doc__rF   r  r   r   r   r   r   r   r   r   r     s    

#
)

r   ry   tl.constexprc	                 C  sv  t d|}	|	|k }
t j||	 |
dd}t |}t j||	 |
|d}t |}|d |d k r2|}t t |||}t ||}t |t jd }t |t 	t 
|d t j}d|| }}|dkrm|| | }nt ||}|| t || }t t |||}t ||}t 	t ||t ||}|	| }||| k }td|D ]}t j| | | ||d	 qd S )
Nr   )maskother   
   g      P@g      ?   r?   )r  )tlr   loadr   minminimumcdivcastr   maximumlog2ro   r   store)num_kv_splits_ptrseq_lens_ptrr   r   rU   rW   re   rb   ry   offs_seqmask_seqrx   max_seq_lenmin_seq_lenmax_kv_splits_1kv_chunk_size_1ext_seq_lenext_device_core_countblock_hnum_kv_group
token_gridmax_kv_splits_2kv_chunk_size_2r"   
offs_token
mask_tokenr   r   r   r   r}     s>   

r}   c              
   C  s   t |t |}t j|dd| d|d < | d |d  } t j| d t j|d}	|| }
t|f |||| |
|	|d t|drS| d }|	|	d | |	d |< | |	||
fS )Nr   r   r?   r:   r@   translate_loc_from_full_to_swa)
rG   r  tensorr   r   rq   r   r   r   r'  )r(   rM   rL   rx   r   r   rB   rN   r   r)   window_kv_start_idxkv_last_indexr   r   r   r     s4   




r   c              
   C  s   t |t |}t j|dd| d|d < | d |d  } || }	t|f |||| |	||d t|drH| d }
||d |
 |d |
< | |||	fS )Nr   r   r?   r'  r:   )rG   r  r(  r   r   r   r   r'  )r(   r)   rM   rL   rx   r   r   rN   r   r)  r*  r   r   r   r     s.   




r   )ry   r  r   ),
__future__r   dataclassesr   typingr   r   r   rG   r|   triton.languagelanguager  -sglang.srt.layers.attention.base_attn_backendr   !sglang.srt.layers.attention.utilsr   sglang.srt.layers.dp_attentionr	   !sglang.srt.layers.radix_attentionr
   ,sglang.srt.model_executor.forward_batch_infor   r   !sglang.srt.speculative.spec_utilsr   sglang.srt.utilsr   r   r   r   r   &sglang.srt.model_executor.model_runnerr    sglang.srt.speculative.spec_infor   r   r   r0   r   jitr}   r   r   r   r   r   r   <module>   sH           j ;
0