o
    پi!                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZ d dlZd dl	m
  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9m:Z: e9 rd dl;m<Z<m=Z=m>Z> e ?e@ZAeG dd de+e'ZBeG dd de+e&ZCeG dd dZDdS )    N)copy)	dataclass)ListOptionalTuple)BaseGrammarObject)envs)#create_flashinfer_kv_indices_triton)LogitsProcessorOutput)apply_custom_logit_processor)FutureIndices)ScheduleBatch)BaseTokenToKVPoolAllocator)alloc_paged_token_slots_extendalloc_token_slotsget_last_loc)CaptureHiddenMode)get_global_server_args)EagleDraftInputV2MixinEagleVerifyInputV2Mixin)verify_tree_greedy_func)	SpecInputSpecInputType)
SIMULATE_ACC_LENTREE_SPEC_KERNEL_AVAILABLEalign_evict_mask_to_page_sizeassign_req_to_token_pool_funccreate_accept_length_filter$create_extend_after_decode_spec_info filter_finished_cache_loc_kernelgenerate_simulated_accept_indexget_src_tgt_cache_locget_target_cache_loc)is_cudanext_power_of_2)top_k_renorm_probtop_p_renorm_prob%tree_speculative_sampling_target_onlyc                       s>  e Zd ZU ejed< ejed< ejed< ejed< ejed< ejed< ejed< eed< eed	< eed
< eed< eed< ejed< dZe	ed< dZ
eed<  fddZdeeef fddZed	ededefddZdedefddZdejdejd ed!ejfd"d#Z	d)ded$ed%eded&eej dejfd'd(Z  ZS )*EagleVerifyInputdraft_tokencustom_mask	positionsretrive_indexretrive_next_tokenretrive_next_siblingretrive_cum_len
spec_stepstopkdraft_token_numcapture_hidden_modeseq_lens_sumseq_lens_cpuNgrammarnum_tokens_per_reqc                       t  tj d S N)super__init__r   EAGLE_VERIFYself	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/speculative/eagle_info.py__post_init__J      zEagleVerifyInput.__post_init__returnc                 C   s   | j | j fS r:   )r2   r>   rB   rB   rC   !get_spec_adjust_token_coefficientM      z2EagleVerifyInput.get_spec_adjust_token_coefficientnum_verify_tokensc                 C   s   | t jdt jddt jddt jddt jdt jddt jd|fdt jddt jd|fdt jddt jd|fdt jddd |||tjdt jdt jddS )	Nr   cudadtypedeviceTr   r7   rM   )r)   r*   r+   r,   r-   r.   r/   r1   r2   r0   r3   r4   r5   )	torchemptylongfullboolint64r   FULLint32)clsr1   r0   rI   rB   rB   rC   create_idle_inputP   s*   z"EagleVerifyInput.create_idle_inputbatch	page_sizec           	   	   C   s   |j  rd S | j|_|dkr t|jt|j|_|j| j	 }n+|j}|j
}|| j	 }|| j	 }t|jj|j|}t|j|||||t|j|_|| _| }t|j|jj|j||j| t  rutjdd |jD tj|jd|_d S d S )N   c                 S   s   g | ]}|j |j qS rB   )mamba_ping_pong_track_buffermamba_next_track_idx).0reqrB   rB   rC   
<listcomp>   s    
z7EagleVerifyInput.prepare_for_verify.<locals>.<listcomp>rL   )forward_modeis_idler)   	input_idsr   
tree_cachelenout_cache_locseq_lensr2   r5   r   req_to_token_poolreq_to_tokenreq_pool_indicesr   last_loc
batch_sizer   r   enable_mamba_extra_bufferrP   tensorreqsrU   rN   mamba_track_indices)	r?   rZ   r[   
end_offsetprefix_lensprefix_lens_cpuend_offset_cpurl   bsrB   rB   rC   prepare_for_verifyh   s\   


	
	z#EagleVerifyInput.prepare_for_verifyrk   paged_kernel_lenspaged_kernel_lens_sumrj   c              
   C   s
  |j }t|}tjdd| | j | jtj|d}tj|d ftj|d}|| j }tj|dd|dd < tj|| j|  tj|d}	t	|f ||||d |	|
d || j | jd |  }
| j |
k r~tj| jtj|
| j  fdtj|dgdd| _|	||| jfS )Nr   r\   )steprM   rN   rL   dim   T)rN   rf   rP   aranger2   rW   zeroscumsumrQ   r	   sizer*   numelcatrS   rT   )r?   rk   rx   ry   rj   rN   rm   	qo_indptrcum_kv_seq_len
kv_indices
mask_numelrB   rB   rC   generate_attn_arg_prefill   sZ   

	z*EagleVerifyInput.generate_attn_arg_prefilllogits_outputtoken_to_kv_pool_allocator
vocab_maskc           .         sP  |j  r3ttj|j|jj|jj| j	t
jd|tjdtj|jdg tjd| jd fdtj|jddS | jjd }| j|| j}|j}t|jjdd }	|	d  d7  < tj|	tj|jd}
tj|| jd fdtj|jd}tj|ftj|jd}|t|krt|}|| j | j |jrt |j|| jd |j!j"s|j#durtj$||jjd ftj%|jd}|&| |j'tj(|| jdd	 |dur| j)dusJ | j)j*|j|d
 |j+}|st,st-.d |st,stj/|jdd	}||| j}t0|
|||| j| j1| j2|| j	d	\}
}}nxtj(|j3| jdd	}t4j5|j| dd	}t6|tj(|j7| jdd	}t8|j9dksGt:|tj(|j9| jdd	}||| jd}tj$|jtj%|jd}tj;|tj%|jd}tj<|ftj%|jd}t=|
|||| j| j1| j2||||t> j?t> j@dd tAdkrtB||
||| jd}g }g }| }|
 }d}tCtD|jE|D ]\}\}}d}tC|D ]b\}}|dkr nX|d7 }|| } |jFG|  |H  |I rd}d|||d df<  n3|j)durz	|j)J|  W q tKy }! zt-Ld|d|d|d|
d	 |!d}!~!ww q| jM|7  _M|jM|_N|I sI|G| |dkrB|G||d|f  n|G||  | jOd7  _OtPdd |D d }"| jQ|"7  _Q|R|" q|rv|dkjPdd	d }||dk }|
| }#tjS| jdtjTd}$d|$|< |U }%|%  |dkr|V|jW|$  nc| j	dkrtXt|jYf |jY|$|| jtZ| j |V|jW|$  nAt[|jY|jW||| j|\}&}'}(tj|(P \ ftj]|(jd})t^|f |'|)||(|jW| jtZ| jtZ| |V|) |j_` a|'|& |s^|dks| j	dkr/|jW| |_Wtb|jc|jdje|jY|jY| d |jW| n|'|_W|jY'|d  |jf'|%d  t|jgjh| |#| |jY|jf|jcd}*t|*||#|*ji|dS |dksi| j	dkrtb|jc|jdje|jY|jY| d |jW| | |jY'|d  |jf'|%d  t|dkrtj|}tjk|tj]|
jd}+ fdd|D },|dks| j	dkr|jW| |_Wn2tjt|tP|, tj]|
jd|_Wtl||+|jY}-|jf'|%d  tm|f |jW|'||-tZ|tZ| j t|jgjh| |
| |,||+ |jY|+ |jf| |jc|+ d}*ntj|j|jj|jj| j	t
jd}*t|*||# |dS )a~  
        Verify and find accepted tokens based on logits output and batch
        (which contains spec decoding information).

        WARNING: This API in-place modifies the states of logits_output

        This API updates values inside logits_output based on the accepted
        tokens. I.e., logits_output.next_token_logits only contains
        accepted token logits.
        )rN   hidden_sizerM   r1   r3   r   rL   r\   r7   )draft_inputr   verified_idaccept_length_per_req_cpuaccepted_indicesN)num_tokens_in_batchr{   )logitsr   ziTree speculative sampling kernel unavailable (likely AMD/HIP build). Falling back to greedy verification.)	predictsaccept_indexaccept_token_num
candidatesr,   r-   r.   target_predictr1   g      ?T)r   r   r   r   r,   r-   r.   uniform_samples"uniform_samples_for_final_samplingtarget_probsdraft_probsthreshold_singlethreshold_accdeterministicg        )r   predictaccept_lengthrv   r0   Fzi=z, req=z
accept_index=z	
predict=
c                 s   s    | ]	}|d krdV  qdS )r7   r\   NrB   )r_   idxrB   rB   rC   	<genexpr>  s    z*EagleVerifyInput.verify.<locals>.<genexpr>rO   )hidden_statesr   r   accept_length_cpuseq_lens_for_draft_extendseq_lens_for_draft_extend_cpu!req_pool_indices_for_draft_extendc                    s   g | ]} | qS rB   rB   )r_   iaccept_length_listrB   rC   ra   .  s    z+EagleVerifyInput.verify.<locals>.<listcomp>)r   r   r   r   r   r   r   )nrb   rc   EagleVerifyOutputEagleDraftInputrY   rN   model_configr   rM   r1   r   LASTrP   rQ   rR   rS   r0   rW   r,   shaper)   reshaper2   sampling_infolistnext_token_logitsrf   r   deepcopyfilter_batchtolisthas_custom_logit_processorr   penalizer_orchestratoris_required
logit_biasr   float32apply_logits_biasadd_repeat_interleaver6   apply_vocab_maskis_all_greedyr   loggerwarningargmaxr   r-   r.   temperaturesFsoftmaxr%   top_ksalltop_psr&   	rand_likerandr'   r   #speculative_accept_threshold_single speculative_accept_threshold_accr   r    	enumerateziprp   
output_idsappendcheck_finishedfinishedaccept_token
ValueErrorinfokv_committed_lenkv_allocated_lenspec_verify_ctsumspec_accepted_tokens update_spec_acceptance_histogram	full_likerT   cpufreerg   r   rh   r$   r!   itemrU   r"   r   get_kvcachemove_kv_cacher   rk   ri   rj   r5   	spec_infor   r   r   ro   r   r   ).r?   rZ   r   r   r[   r   rv   r   r   predict_shaper   r   r   linear_penaltyr   r   expanded_temperaturer   r   coinscoins_for_final_samplingunfinished_indexunfinished_accept_indexaccept_index_cpupredict_cpuhas_finishedr   r`   accept_index_rownum_acceptedjr   ideaccepted_draft_tokensr   
evict_maskr   src_cache_loctgt_cache_locto_free_num_slotsto_free_slotsr   unfinished_index_devicedraft_input_accept_length_cpuaccept_length_filterrB   r   rC   verify   sl  












	






	




	zEagleVerifyInput.verifyr:   )__name__
__module____qualname__rP   Tensor__annotations__intr   r6   r   r8   rD   r   rG   classmethodrY   r   rw   r   r
   r   r   r   __classcell__rB   rB   r@   rC   r(   6   sX   
 







6
@r(   c                       s  e Zd ZU dZejed< dZejed< dZejed< e	j
Ze	ed< dZejed< dZejed< dZee ed< dZejed	< dZejed
< dZeed< dZeed< dZejed< dZejed< dZejed< dZee ed< dZeej ed< dZeejj ed<  fddZde eef fddZ!de"fddZ#e$dej%dedej&dede	f
d d!Z'de"d"efd#d$Z(d%ejd&ejd'ed(ejfd)d*Z)d3d,ejd-e*fd.d/Z+d4d1d2Z,  Z-S )5r   Ntopk_p
topk_indexr   r3   r   r   r   	kv_indptrr   r7   r8   num_tokens_for_logprob_per_reqr   r   r   future_indicesnew_seq_lensverify_donec                    r9   r:   )r;   r<   r   EAGLE_DRAFTr>   r@   rB   rC   rD     rE   zEagleDraftInput.__post_init__rF   c                 C   s   | j | jfS r:   )r8   r	  r>   rB   rB   rC   rG     rH   z1EagleDraftInput.get_spec_adjust_token_coefficientrZ   c                 C   s   |j  rd S t| jt|jksJ d}t|jD ])\}}|j|||  }t	|dd  | j| 
df|j||| < ||7 }qd S )Nr   r\   )rb   rc   rf   r   rh   r   extend_lensrd   rP   r   r   )r?   rZ   ptr   
extend_lenrd   rB   rB   rC   prepare_for_extend  s   

z"EagleDraftInput.prepare_for_extendrN   r   rM   r1   c                 C   sv   | t jd|t jdt jd|f||dt jd|f|t jdt jd|f|t jd|t jd|t jdt jd|t jdg dS )NrJ   )rN   rM   r   )r   r   r  r  r3   r  r   r   )rP   rQ   rW   r   rU   )rX   rN   r   rM   r1   r3   rB   rB   rC   rY     s   	z!EagleDraftInput.create_idle_inputspeculative_num_stepsc                 C   s   |j  rd S | j|_dd |jjD |_t|j|_|jj	|_
|jj|_|jj|_d|_d|_tj| _| jd tj|jtjd| _tj| jtjd| _tt|j
f |j|j
| j| j| jtt|d t|j
 d S )Nc                 S   s   g | ]}|d  qS )r\   rB   )r_   xrB   rB   rC   ra     s    z?EagleDraftInput.prepare_extend_after_decode.<locals>.<listcomp>Fr\   rO   )rb   rc   r   rd   r   r   r  r   extend_num_tokensr   rh   r   r5   r   rk   return_logprobreturn_hidden_statesr   r   r3   r   r   rP   
empty_likerR   r+   rW   r   rf   r$   max)r?   rZ   r  rB   rB   rC   prepare_extend_after_decode  s,   



z+EagleDraftInput.prepare_extend_after_decoderk   rx   ry   rj   c           
   
   C   s   |j }| j }tj|d ftj|d}tj| jdd|dd < tj|d ftj|d}tj|dd|dd < |d u r?|d }tj|tj|d}	t|f ||||d |	|	d |	||d fS )Nr\   rL   r   r{   r7   )
rN   r   r   rP   r   rW   r   rQ   r	   r   )
r?   rk   rx   ry   rj   rN   rv   r   r   r   rB   rB   rC   r     s*   
	z)EagleDraftInput.generate_attn_arg_prefillTnew_indiceshas_been_filteredc                 C   s   | j d ur| j j| | j _d S tj }|rbdt| dt| j d}t|t| jkr8|r3t|t	| | jd t| | _| j
d t| | _
| jd t| | _| jd t| | _d S | j| | _| j
| | _
| j| | _| j| | _d S )Nzlength of new_indices: z != length of topk_p: z, this should not happen)r
  indicesr   &SGLANG_SPEC_ENABLE_STRICT_FILTER_CHECKgetrf   r  r   r   r   r  r   r   )r?   r  r  strict_check	error_msgrB   rB   rC   r     s$   


zEagleDraftInput.filter_batchr   c                 C   s   | j d ur|j d usJ tt| j j|j jgd| _ d S | jd u r4|j| _|j| _|j| _|j| _d S |jd u r;d S tj| j|jgdd| _tj| j|jgdd| _t| j|jg| _t| j|jg| _d S )N)r  r   )axis)	r
  r   rP   r   r  r   r   r  r  )r?   r   rB   rB   rC   merge_batch  s,   


zEagleDraftInput.merge_batch)T)r   r   ).r   r   r   r  rP   r  r  r  r   r   rV   r3   r   r   r   r   r  r  r   r8   r	  r   r   r   r
  r   r   r  r  rK   EventrD   r   rG   r   r  r  rN   rM   rY   r  r   rT   r   r"  r  rB   rB   r@   rC   r   g  sb   
 
 
 r   c                   @   s>   e Zd ZU eed< eed< ejed< ee	 ed< ejed< dS )r   r   r   r   r   r   N)
r   r   r   r   r  r
   rP   r  r   r  rB   rB   rB   rC   r   *  s   
 
r   )Eloggingr   dataclassesr   typingr   r   r   rP   torch.nn.functionalnn
functionalr   +sglang.srt.constrained.base_grammar_backendr   sglang.srt.environr   !sglang.srt.layers.attention.utilsr	   "sglang.srt.layers.logits_processorr
   sglang.srt.layers.samplerr   !sglang.srt.managers.overlap_utilsr   "sglang.srt.managers.schedule_batchr   sglang.srt.mem_cache.allocatorr   sglang.srt.mem_cache.commonr   r   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.server_argsr   $sglang.srt.speculative.eagle_info_v2r   r   "sglang.srt.speculative.eagle_utilsr    sglang.srt.speculative.spec_infor   r   !sglang.srt.speculative.spec_utilsr   r   r   r   r   r   r   r    r!   r"   sglang.srt.utilsr#   r$   
sgl_kernelr%   r&   r'   	getLoggerr   r   r(   r   r   rB   rB   rB   rC   <module>   sH    0
    4 C