o
    پiD                     @  s^  d dl mZ d dlZd dlZd dlmZmZ d dlZd dlZd dl	m
Z
 d dlmZ eeZd dlmZ d dlm  mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1 e/ rd dl2m3Z3m4Z4m5Z5m6Z6 n	e0 rd dl2m6Z6 eG dd de'Z7dS )    )annotationsN)OptionalTuple)BaseGrammarObject)get_global_server_args)	dataclass)envs)#create_flashinfer_kv_indices_triton)LogitsProcessorOutput)apply_custom_logit_processor)ScheduleBatch)alloc_paged_token_slots_extendalloc_token_slotsget_last_loc)SamplingBatchInfo)	SpecInputSpecInputType)TREE_SPEC_KERNEL_AVAILABLEassign_req_to_token_poolget_src_tgt_cache_locget_target_cache_loc)is_cudais_hipnext_power_of_2)top_k_renorm_probtop_p_renorm_prob%tree_speculative_sampling_target_onlyverify_tree_greedy)r   c                      s   e Zd Z	d8d9 fddZd:ddZd;ddZd<ddZd=dd Zd>d"d#Zd?d%d&Z	d@d)d*Z
	d8dAd-d.ZdBdCd3d4ZdDd6d7Z  ZS )ENgramVerifyInputNdraft_tokentorch.Tensor	tree_mask	positionsretrive_indexretrive_next_tokenretrive_next_siblingdraft_token_numintgrammarr   c	           	        sL   t  tj || _|| _|| _|| _|| _|| _	|| _
| jj| _|| _d S N)super__init__r   NGRAM_VERIFYr   custom_maskr"   r#   r$   r%   r&   devicer(   )	selfr   r!   r"   r#   r$   r%   r&   r(   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/speculative/ngram_info.pyr+   4   s   

zNgramVerifyInput.__init__returnTuple[int, int]c                 C  s   | j | j fS r)   )r&   )r/   r2   r2   r3   !get_spec_adjust_token_coefficientJ   s   z2NgramVerifyInput.get_spec_adjust_token_coefficientbatchr   	page_sizec           	   
   C  s   |j  rd S | j|_|dkr t|jt|j|_|j| j	 }n+|j}|j
}|| j	 }|| j	 }t|jj|j|}t|j|||||t|j|_|| _| }t|f |j|jj|j||j|jjjd t| d S )N   )forward_modeis_idler   	input_idsr   
tree_cachelenout_cache_locseq_lensr&   seq_lens_cpur   req_to_token_poolreq_to_tokenreq_pool_indicesr   last_loc
batch_sizer   shapetritonr   )	r/   r7   r8   
end_offsetprefix_lensprefix_lens_cpuend_offset_cpurE   bsr2   r2   r3   prepare_for_verifyM   sL   


	z#NgramVerifyInput.prepare_for_verifyrD   paged_kernel_lenspaged_kernel_lens_sumrC   c              
   C  s   t |}tj|d ftj| jd}|| j }tj|dd|dd < tjd|d tj| jd| j | _tj	|d tj| jd}t
|f ||||d ||d ||| j| jfS )Nr9   dtyper.   r   dim)r>   torchzerosint32r.   r&   cumsumarange	qo_indptremptyr	   sizer-   )r/   rD   rO   rP   rC   rM   cum_kv_seq_len
kv_indicesr2   r2   r3   generate_attn_arg_prefillz   s*   
	z*NgramVerifyInput.generate_attn_arg_prefilllogits_outputc                 C  s  | j  }| j }d}tt|j|D ]\}\}}t|D ][\}	}
|
dkr( nR||
 }|j| |  |	 rId}d| j ||	d d f<  n1|j
d uryz|j
| W q tyx } ztd|d|d| j d| jd		 |d }~ww q| jd7  _td
d |D d }| j|7  _|| q|r| j dkjddd | _| j | j dk | _ |j| j  |_|jr|j| j  |_| j| j  | _d S )NFrU   Tr9   zi=z, req=z
self.accepted_indices=z
self.predict=
c                 s  s    | ]	}|d krdV  qdS )rU   r9   Nr2   ).0idxr2   r2   r3   	<genexpr>   s    z2NgramVerifyInput._fill_requests.<locals>.<genexpr>rS   )accepted_indicestolistpredict	enumeratezipreqs
output_idsappendcheck_finishedfinishedr(   accept_token
ValueErrorloggerinfospec_verify_ctsumspec_accepted_tokens update_spec_acceptance_histogramaccept_lengthnext_token_logitshidden_statesverified_id)r/   r7   ra   accept_index_cpupredict_cpuhas_finishedireqaccept_index_rowjrd   ideaccepted_draft_tokensr2   r2   r3   _fill_requests   sX   



zNgramVerifyInput._fill_requestsaccept_length_cpuc              
   C  s^  |  }|dkr(tj| jdtjd}d|| j< |j|j|  |j| j |_nHt	|j
|j| j| j| j|\}}}tj|  ftj|jd}	t|f ||	| j||j| jt| jt| |j|	 |j || ||_| }
t|jD ]\}}| j|
| d 7  _|j|_qyt|f |j|jj|j
|j
| j d |j|jjjd t | d S )Nr9   T)rR   FrQ   )!rF   rV   	full_liker   boolrf   token_to_kv_pool_allocatorfreer?   r   r@   rx   r&   r\   ru   itemint64r.   r   r   get_kvcachemove_kv_cacherg   ri   rk   kv_committed_lenkv_allocated_lenr   rD   rB   rC   rG   rH   )r/   r7   r8   r   rM   
evict_masksrc_cache_loctgt_cache_locto_free_num_slotsto_free_slotsaccept_length_listr   r   r2   r2   r3   _free_cache   s^   



zNgramVerifyInput._free_cacher
   c              
   C  s   |  }tj|jdd}||| j}| j|| j}t|jjd d }|d  d7  < tj	|tj
| jd| _tj|| jfdtj
| jd| _tj	|ftj
| jd| _t| j| j| j|| j| j| j|d d S )NrU   rS   r9   rQ   )predictsaccept_indexaccept_token_num
candidatesr#   r$   r%   target_predict)rF   rV   argmaxry   reshaper&   r   listrG   r\   rX   r.   rh   fullrf   rx   r   r#   r$   r%   )r/   r7   ra   rM   r   r   predict_shaper2   r2   r3   _greedy_verify  s*   
zNgramVerifyInput._greedy_verifysampling_infor   c                 C  s  |  }| j|| j}t|jjd d }|d  d7  < tj|tj	| j
d| _tj|| jfdtj	| j
d| _tj|ftj	| j
d| _tj|j| jdd}tj|j| dd}t|tj|j| jdd}|jrut|tj|j| jdd}||| jd}tj|jtj| j
d}	tj|tj| j
d}
tj|ftj| j
d}t| j| j| j|tj| jtj| j tj| j!tj|
|||	t" j#t" j$dd d S )NrU   r9   rQ   r   rS   T)r   r   r   r   r#   r$   r%   uniform_samples"uniform_samples_for_final_samplingtarget_probsdraft_probsthreshold_singlethreshold_accdeterministic)%rF   r   r   r&   r   ry   rG   rV   r\   rX   r.   rh   r   rf   rx   repeat_interleavetemperaturesFsoftmaxr   top_ksneed_top_p_samplingr   top_psrW   float32	rand_likerandr   tor   r#   r$   r%   r   #speculative_accept_threshold_single speculative_accept_threshold_acc)r/   r7   ra   r   rM   r   r   expanded_temperaturer   r   coinscoins_for_final_samplingr2   r2   r3   _sampling_verify5  sd   




z!NgramVerifyInput._sampling_verify
vocab_maskOptional[torch.Tensor]c                 C  sl  | j jd }|j}|t|krt|}|| j  | j  |jr*t	|j
|| jd |jjrPtj||j
jd ftj| jd}|| |j
tj|| jdd |d urd| jd us[J | jj|j
|d |jpktj }|sutsutd |syts| || n| ||| |  || | j!" }	|	# $ }
| %|||	 |j&| j!d  |j'|	d  || j(|
fS )Nr   )num_tokens_in_batchr9   rQ   rS   )logitsr   ziTree speculative sampling kernel unavailable (likely AMD/HIP build). Falling back to greedy verification.))r#   rG   r   r>   copydeepcopyfilter_batchrg   has_custom_logit_processorr   ry   r&   penalizer_orchestratoris_requiredrV   rW   r   r.   apply_logits_biasadd_r   r(   apply_vocab_maskis_all_greedyr    SGLANG_NGRAM_FORCE_GREEDY_VERIFYgetr   rr   warningr   r   r   rx   cpuru   r   r   r@   rA   r{   )r/   r7   ra   r8   r   rM   r   linear_penaltyr   r   num_accepted_tokensr2   r2   r3   verifyy  sT   


zNgramVerifyInput.verifyTnew_indiceshas_been_filteredr   c                 C     d S r)   r2   )r/   r   r   r2   r2   r3   r        zNgramVerifyInput.filter_batch	spec_infoc                 C  r   r)   r2   )r/   r   r2   r2   r3   merge_batch  r   zNgramVerifyInput.merge_batchr)   )r   r    r!   r    r"   r    r#   r    r$   r    r%   r    r&   r'   r(   r   )r4   r5   )r7   r   r8   r'   )rD   r    rO   r    rP   r'   rC   r    )r7   r   ra   r    )r7   r   r8   r'   r   r    )r7   r   ra   r
   )r7   r   ra   r
   r   r   )
r7   r   ra   r
   r8   r'   r   r   r4   r    )T)r   r    r   r   )r   r   )__name__
__module____qualname__r+   r6   rN   r`   r   r   r   r   r   r   r   __classcell__r2   r2   r0   r3   r   2   s    


-
"
4
H
IGr   )8
__future__r   r   loggingtypingr   r   rV   rH   +sglang.srt.constrained.base_grammar_backendr   sglang.srt.server_argsr   	getLoggerr   rr   dataclassesr   torch.nn.functionalnn
functionalr   sglang.srt.environr   !sglang.srt.layers.attention.utilsr	   "sglang.srt.layers.logits_processorr
   sglang.srt.layers.samplerr   "sglang.srt.managers.schedule_batchr   sglang.srt.mem_cache.commonr   r   r   'sglang.srt.sampling.sampling_batch_infor    sglang.srt.speculative.spec_infor   r   !sglang.srt.speculative.spec_utilsr   r   r   r   sglang.srt.utilsr   r   r   
sgl_kernelr   r   r   r   r   r2   r2   r2   r3   <module>   s6    
