o
    پiD                     @  s  d dl mZ d dlmZ d dlmZmZ d dlZd dlm	  m
Z d dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z. e+ Z/e, Z0e- Z1erd dl2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8 e+ rd dl9m:Z:m;Z;m<Z< ej=d2ddZ>eG dd dZ?eG dd dZ@ej=d3dd ZAej=d4d"d#ZBej=d5d%d&ZCd6d0d1ZDdS )7    )annotations)	dataclass)TYPE_CHECKINGAnyN)LogitsProcessorOutput)ModelWorkerBatchScheduleBatch)get_alloc_len_per_decode)alloc_paged_token_slots_extendalloc_token_slotsget_last_loc)ReqToTokenPool)CaptureHiddenModeForwardBatchForwardMode)ModelRunner)get_global_server_args)verify_tree_greedy_func)SIMULATE_ACC_LENgenerate_simulated_accept_index)is_cudais_hipis_npunext_power_of_2)TpModelWorker)EAGLEDraftCudaGraphRunner)EagleDraftInputEagleVerifyInput)top_k_renorm_probtop_p_renorm_prob%tree_speculative_sampling_target_onlypool_lentl.constexprtopkspeculative_num_stepsc                 C  s   d}t jdd}|| }	||| |  }
t || }|t | | |  }t |	|}t|D ]%}t d|||  }||	k }t j|| | |d}t j|
| ||d q0d S )N   r   axismask)tl
program_idloadcdivrangearangestore)req_pool_indicesreq_to_tokenseq_lensout_cache_locr!   r#   r$   
BLOCK_SIZEpidcopy_lenout_cache_ptrkv_start
token_poolnum_loopicopy_offsetr)   data r?   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/speculative/eagle_info_v2.py#assign_draft_cache_locs_page_size_15   s   
rA   c                   @  s*   e Zd ZdddZdddZdddZdS )EagleDraftInputV2Mixinselfr   batchr   c                 C  sh  |   ddlm} | }|  |jj}g }g }d}t }|jD ],}	|	j	d|  |	j
 }
||	j
 ||	j
|
  ||
7 }|	 j
|
7  _
|	 jd7  _q"tj|tjdd}tj|tjdd}|dkrlt|j|}n"|j|jd}|j|jd}t|jj|j|}t|j||||||}||j|jj|j|jd|j|jd|| |j |_|j  |_d S )Nr   )assign_req_to_token_pool_func      cpudtypedevice)rK   )maybe_evict_swa!sglang.srt.speculative.spec_utilsrE   
batch_sizemaybe_wait_verify_donetoken_to_kv_pool_allocator	page_sizer	   reqskv_committed_lenkv_allocated_lenappenddecode_batch_idxtorchtensorint32r   
tree_cachetorK   r   req_to_token_poolr2   r1   r
   r3   rH   seq_lens_cpusumitemseq_lens_sum)rC   rD   rE   bsrQ   cur_kv_lens_cpunxt_kv_lens_cpunum_needed_tokensalloc_len_per_decoderxr4   cur_kv_lensnxt_kv_lenslast_locr?   r?   r@   prepare_for_decodeR   s\   


z)EagleDraftInputV2Mixin.prepare_for_decoder\   r   r   cuda_graph_runnerr   draft_model_runnerr   r#   int	num_stepsc           
      C  s   |j  s0t|j}tj|| | ftj|jjd|_	t
|f |j|j|j|j	|jjd || || _|| _tj|_|jj|dd| _t||}|oO||}	||	fS )NrI   rG   r   dim)forward_modeis_idlelenr3   rW   emptyint64	input_idsrK   r4   rA   r1   r2   shapenum_tokens_per_reqnum_tokens_for_logprob_per_reqr   LASTcapture_hidden_moderepeat_interleave	positionsr   init_newcan_run)
rC   r\   rD   rl   rm   r#   ro   ra   forward_batchcan_cuda_graphr?   r?   r@   prepare_for_v2_draft   s.   
	

z+EagleDraftInputV2Mixin.prepare_for_v2_draftpredicttorch.Tensornum_draft_tokensr   c           
        s   |j }t|j  }| |_||_|j  |_|j   |_ | j|7  _ fddtt|jD |_| |_	||_
tj|_|j rFtjntj|_t||}|oV||}	|j sd|	sd|j| |S )Nc                   s   g | ]} qS r?   r?   ).0_r   r?   r@   
<listcomp>   s    zSEagleDraftInputV2Mixin.prepare_for_extend_to_fill_draft_kvcache.<locals>.<listcomp>)r]   rt   r3   	spec_inforw   r`   r.   extend_seq_lenstolistextend_prefix_lensextend_num_tokensr   FULLr|   rr   rs   r   IDLEDRAFT_EXTEND_V2r   r   r   attn_backendinit_forward_metadata)
rC   rD   r   r   rm   rl   seq_lens_cpu_r   r   r   r?   r   r@   (prepare_for_extend_to_fill_draft_kvcache   s(   
z?EagleDraftInputV2Mixin.prepare_for_extend_to_fill_draft_kvcacheN)rC   r   rD   r   )rC   r   r\   r   rD   r   rl   r   rm   r   r#   rn   ro   rn   )
rD   r   r   r   r   rn   rm   r   rl   r   )__name__
__module____qualname__rk   r   r   r?   r?   r?   r@   rB   P   s    

<&rB   c                   @  s$   e Zd Zdd	d
Z	ddddZdS )EagleVerifyInputV2MixinrC   r   r\   r   rD   r   target_workerr   c              	   C  s   |j  s&t|j}| j|_|jj}t|j|j|j	|j	| j
 || j
|d|_|j  r.tjntj|_ tj|_t||j}t|jjoH|jj|}|rW|jj| ||fS |j  sc|jj| ||fS )N)r1   r2   start_offset
end_offsetrN   draft_token_numrK   )rr   rs   rt   r1   draft_tokenrw   rK   assign_extend_cache_locs_funcr2   r3   r   r4   r   r   TARGET_VERIFYr   r   r|   r   r   model_runnerboolgraph_runnerr   replay_preparer   r   )rC   r\   rD   r   ra   rK   verify_forward_batchcan_run_cuda_graphr?   r?   r@   prepare_for_v2_verify   s>   



z-EagleVerifyInputV2Mixin.prepare_for_v2_verifyNlogits_outputr   
vocab_maskr   c                 C  sT  |j  r+tjdtj|jjd}tjdtj|jjd}tjdtj|jjd}|||fS t|j}|j	}|j
}	|jj}
|durM| jdusEJ | jj|	|d | j|| j}t|	jdd }tj|tj|
d }tj|| jd fdtj|
d}tj|ftj|
d}|jstrtj|	dd}||| j}t||||| j| j| j|| jd	\}}}nftj|j| jdd}t j!|	| dd}t"|tj|j#| jdd}t$|tj|j%| jdd}||| jd}t&|}tj'|tj(|
d}tj)|ftj(|
d}t*||||| j| j| j||||t+ j,t+ j-d	d
 t.dkr t/|||t.|| jd}|0d |||fS )z
        Verify and find accepted tokens based on logits output and batch
        (which contains spec decoding information).
        r   rI   N)logitsr   rG   rp   )	predictsaccept_indexaccept_token_num
candidatesretrive_indexretrive_next_tokenretrive_next_siblingtarget_predictr#   T)r   r   r   r   r   r   r   uniform_samples"uniform_samples_for_final_samplingtarget_probsdraft_probsthreshold_singlethreshold_accdeterministic)r   r   accept_lengthsimulate_acc_lenra   
spec_steps)1rr   rs   rW   ru   rY   rw   rK   rt   r3   sampling_infonext_token_logitsgrammarapply_vocab_maskr   reshaper   listrx   zerosflattenfullr   is_all_greedy_is_npuargmaxr   r   r   r   r#   r}   temperaturesFsoftmaxr   top_ksr   top_ps
zeros_like	rand_likefloat32randr    r   #speculative_accept_threshold_single speculative_accept_threshold_accr   r   add_)rC   rD   r   r   r   r   r   ra   r   r   rK   r   predict_shaper   expanded_temperaturer   r   coinscoins_for_final_samplingr?   r?   r@   sample  s   













zEagleVerifyInputV2Mixin.sample)rC   r   r\   r   rD   r   r   r   )N)rC   r   rD   r   r   r   r   r   )r   r   r   r   r   r?   r?   r?   r@   r      s    
1r   r   c                 C  sL   t jdd}t || }|| | d }t | | }t || | d S )Nr   r&   rG   )r*   r+   r,   r0   )verified_idaccept_lensnew_verified_idr   r6   r   verified_id_idxverified_id_datar?   r?   r@   fill_new_verified_idz  s
   	r   
size_upperc           
      C  s   t jdd}t d|}t j| | ||k dddkt j}t |}t | | }|dkr?t || }	t || |	 d S d S )Nr   r&   r   )other)r*   r+   r/   r,   r[   rv   r^   r0   )
r   r4   accepted_out_cache_locr   r6   offsetmasksdstsrcvaluer?   r?   r@   fill_accepted_out_cache_loc  s   $
r   bs_upperc                 C  s  d}t jdd}t || }	t || }
|t | | |  }t d|}t j|| ||k dd}t j|| ||k dd}t j|| dd}|| }t d||	 }t d|}t |
|	 |}t|D ]!}||
k }t j|| |d}t j|| ||d ||7 }||7 }qfd S )N    r   r&   )r)   r   r(   )r*   r+   r,   r/   r^   r-   r.   r0   )r1   r2   r   r   r4   r!   r   r5   r6   r9   kv_endr:   length_offsetstartend
out_offsetr8   load_offsetsave_offsetr;   r   r)   r>   r?   r?   r@   assign_extend_cache_locs  s(   

r   r1   r   r2   r   r   rN   rn   r   returnc              	   C  s   t str$tj|| ftj|d}t|f | |||||jd t| |S tr?tj|| ftj	|d}tj
j| |||| |S d S )NrI   rG   )_is_cuda_is_hiprW   ru   rv   r   rx   r   r   rY   opsnpucache_loc_update)r1   r2   r   r   rN   r   rK   r4   r?   r?   r@   r     s>   	
r   )r!   r"   r#   r"   r$   r"   )r   r"   )r   r"   )r!   r"   r   r"   )r1   r   r2   r   r   r   r   r   rN   rn   r   rn   r   r   )E
__future__r   dataclassesr   typingr   r   rW   torch.nn.functionalnn
functionalr   tritontriton.languagelanguager*   "sglang.srt.layers.logits_processorr   "sglang.srt.managers.schedule_batchr   r   sglang.srt.managers.utilsr	   sglang.srt.mem_cache.commonr
   r   r    sglang.srt.mem_cache.memory_poolr   ,sglang.srt.model_executor.forward_batch_infor   r   r   &sglang.srt.model_executor.model_runnerr   sglang.srt.server_argsr   "sglang.srt.speculative.eagle_utilsr   rM   r   r   sglang.srt.utils.commonr   r   r   r   r   r   r   sglang.srt.managers.tp_workerr   4sglang.srt.speculative.eagle_draft_cuda_graph_runnerr   !sglang.srt.speculative.eagle_infor   r   
sgl_kernelr   r   r    jitrA   rB   r   r   r   r   r   r?   r?   r?   r@   <module>   sT      &"