o
    پiӂ                     @   s  d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z> d dl?m@Z@mAZAmBZBmCZCmDZDmEZEmFZF d dlGmHZH eE ZIeD ZJeKeLZMdeNdeeOe jPf fddZQG dd  d e%ZRG d!d" d"e&ZSdS )#    N)ListOptionalTuple)envs)EAGLEDraftExtendNpuGraphRunner)EAGLEDraftNpuGraphRunner)TritonMultiStepDraftBackend)TRTLLMMLAMultiStepDraftBackend)get_attention_tp_group)#speculative_moe_a2a_backend_contextspeculative_moe_backend_context)UpdateWeightsFromTensorReqInput)ModelWorkerBatch)GenerationBatchResult)TpModelWorker)CaptureHiddenModeForwardBatch)
ServerArgs)BaseDraftWorkerBaseSpecWorker)DraftBackendFactory)EAGLEDraftCudaGraphRunner)EAGLEDraftExtendCudaGraphRunner)EagleDraftInputEagleVerifyInput)assign_extend_cache_locsfill_accepted_out_cache_locfill_new_verified_id)TreeMaskModebuild_tree_kernel_efficient)SpeculativeAlgorithm)
detect_nandraft_tp_contextgenerate_token_bitmaskload_token_mapselect_top_k_tokens)MultiprocessingSerializerempty_context	fast_topkget_available_gpu_memoryis_cudais_npunext_power_of_2)monkey_patch_torch_reductionsdevicereturnc                 C   s<   t j rt|  }t| |}||fS d t fS N)	r   !SGLANG_ENABLE_OVERLAP_PLAN_STREAMgettorchget_device_moduleStreamstream
contextlibnullcontext)r.   plan_streamplan_stream_ctx r;   Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/speculative/eagle_worker_v2.py_get_plan_streamF   s
   
r=   c                   @   s   e Zd Zdedededededededed	efd
dZdd Zdd Zdd Z	dd Z
defddZdefddZdd Zdedejdejfdd Zded!efd"d#Zd$S )%EagleDraftWorkerserver_argsgpu_idtp_rankdp_rankmoe_ep_rankattn_cp_rankmoe_dp_rank	nccl_porttarget_workerc
                 C   sF  || _ || _|| _|| _|| _|| _|	| _|| _|| _|j	| _	|j
| _|j| _|j| _t|j| _|j}
d|_|	 \| _| _|jrN| j rNtt }nt }|A t . t  t|||d|||||d| j| jd| _W d    n1 syw   Y  W d    n1 sw   Y  W d    n1 sw   Y  | jj| _d| _ | j rt!| jj"j#di }|$dd| _ | %  | &  |
| jj _|jrtnt| _| | jj'6 t # t  | (  | )  W d    n1 sw   Y  W d    n1 sw   Y  W d    n	1 sw   Y  t*j+| _,t-| j	\| _.| _/d S )NTr   )r?   r@   rA   pp_rankrB   rC   rD   rE   rF   is_draft_workerreq_to_token_pooltoken_to_kv_pool_allocatorFeagle_configuse_aux_hidden_state)0r?   r@   rA   rB   rC   rF   rG   rD   rE   r.   speculative_eagle_topktopkspeculative_num_stepsspeculative_num_draft_tokensr    from_stringspeculative_algorithmdisable_cuda_graphget_memory_poolrJ   rK   enable_dp_attention	is_eagle3r"   r
   r'   r   r   r   draft_workermodel_runnerdraft_runnereagle_use_aux_hidden_stategetattrmodel_config	hf_configr2   init_token_mapinit_lm_headtp_groupinit_attention_backendinit_cuda_graphsr   	FULL_MASKtree_mask_moder=   r9   r:   )selfr?   r@   rA   rB   rC   rD   rE   rF   rG   backup_disable_cuda_graphctxrL   r;   r;   r<   __init__R   s   

  



  zEagleDraftWorker.__init__c                 C   sh   | j  r| jjd urtd d | _d S | jjd ur/t| jj| _dt| j d| j_	d S d | _d S )NzgSpeculative token map specified, but EAGLE3 models already have this. Ignoring the specified token map.z{"hot_vocab_size": })
rS   rW   r?   speculative_token_maploggerwarninghot_token_idr$   lenjson_model_override_argsrf   r;   r;   r<   r_      s   



zEagleDraftWorker.init_token_mapc                 C   s   | j jj \}}| j r?t| jjdr#| jjjr#| jj	|| n| jj
| | jjjd ur=| jjj|j| _d S d S | jd urW| }| j|j| _|j| j |_| jj	|| d S )Nload_lm_head_from_target)rG   rY   modelget_embed_and_headrS   rW   hasattrrZ   rr   set_embed_and_head	set_embedrn   tor.   clonedata)rf   embedheadr;   r;   r<   r`      s$   



zEagleDraftWorker.init_lm_headc                 C   sL   d| _ d | _t| j| j| j| j}| | _|	 | _| j| j_t
j| _d S )NF)has_prefill_wrapper_verifydraft_extend_attn_backendr   r?   rZ   rO   rP   create_decode_backenddraft_attn_backendcreate_draft_extend_backendr   rd   re   )rf   draft_backend_factoryr;   r;   r<   rb      s   

z'EagleDraftWorker.init_attention_backendc              	   C   sd  d| _ d| _| jjrdS ttd}| jdkrSt }t	| j
| j}td|dd || jj
 | | _ t	| j
| j}tdt | dd|| dd	|dd
 ttd}| jrtsmtret| jtsmtrt| jtrt }t	| j
| j}td|dd || jj
 | | _t	| j
| j}tdt | dd|| dd	|dd
 dS dS dS dS )zCapture cuda graphs.N)npucuda   zOCapture draft cuda graph begin. This can take up to several minutes. avail mem=z.2fz GBz,Capture draft cuda graph end. Time elapsed: z s. mem usage=z GB. avail mem=z GB.zVCapture draft extend cuda graph begin. This can take up to several minutes. avail mem=z3Capture draft extend cuda graph end. Time elapsed: )cuda_graph_runner"cuda_graph_runner_for_draft_extendr?   rT   r   r   rP   timeperf_counterr)   r.   r@   rl   inforG   r   r   r~   _is_npu_is_cuda
isinstancer   r   r	   )rf   Device2DraftCudaGraphRunnertic
before_mem	after_memDevice2ExtendCudaGraphRunnerr;   r;   r<   rc      sf   
(

(z!EagleDraftWorker.init_cuda_graphsmodel_worker_batchc                 C   s   |j }|| j|| j| j| j| j\}}|r | j|\}}}n|j	 s0| jdkr0| j
| | |\}}}|j	 rGt| j| j| jS | jjj \}}	t|j||||j|j| j| j| j| j||	\}
}}}}}t||
||||d | j| j| jd d d dS )Nr   )draft_tokencustom_mask	positionsretrive_indexretrive_next_tokenretrive_next_siblingretrive_cum_len
spec_stepsrO   draft_token_numcapture_hidden_modeseq_lens_sumseq_lens_cpu)	spec_infoprepare_for_v2_draftrJ   r   rZ   rO   rP   replayforward_modeis_idler   init_forward_metadatadraft_forwardr   create_idle_inputrQ   rG   rY   attn_backend&get_verify_buffers_to_fill_after_draftr   verified_idseq_lensr   re   )rf   r   draft_inputforward_batchcan_cuda_graphparent_listtop_scores_indexdraft_tokenstree_mask_bufposition_buf	tree_maskpositionr   r   r   r;   r;   r<   draft.  s   



	zEagleDraftWorker.draftr   c                 C   s  |j }|j}|j|j|j}}}| jd ur| j| }||j| j| j	}|
d| j	d}g }g }g }	d }
t| j	D ]v}t|||||
| j\}}}
}||d  ||d  |	|d  || j	d krj nI||_|| |_|jd | jj| |_||_| jj|ddj}| jjrt| tj|jdd}t|| jdd\}}| jd ur| j| }|j}q<tj|ddd}tj|dd}tj|| j d dd}|j!}t"|j#}tj$||dd	}t%|	dkrtj|	d d dd}n|	d j&d }tj'|d|	d j(d
}|||fS )N)   r   r   r   r   r   Tskip_attn_backend_initdim)indexr   r.   ))r   out_cache_loctopk_p
topk_indexhidden_statesrn   reshape
batch_sizerO   rP   permuteranger%   append	input_idsr   add_r   attn_backendsr   rZ   forwardlogits_outputr?   enable_nan_detectionr!   r3   softmaxnext_token_logitsr(   catflattenrQ   indicessortvaluesgatherro   shapeemptyr.   )rf   r   r   r   r   r   r   
score_list
token_listparents_listscoresir   	tree_infor   probsss_token_list
top_scoresr   r   r   r   r;   r;   r<   r   }  sz   







zEagleDraftWorker.draft_forwardc                 C      d S r0   r;   rq   r;   r;   r<   draft_extend  s   zEagleDraftWorker.draft_extendbatchtarget_hidden_statesnext_token_idsc                 C   s   |j  s5d}t|jD ](\}}|j|||  }t|dd || df|j||| < ||7 }qt|||j	ddd}||_
t|| j}	| j|	j}
tj|
jdd}t|| jdd\|_|_|
j|_|S )a  
        Run draft model extend to correctly fill the KV cache.

        Args:
            batch: The batch to run.
            target_hidden_states: Hidden states from the target model forward
            next_token_ids: Next token ids generated from the target forward.
        r   r   N)r   r   new_seq_lensnum_tokens_per_reqnum_tokens_for_logprob_per_reqr   r   )r   r   	enumerateextend_seq_lensr   r3   r   r   r   r   r   r   init_newrZ   r   r   r   r   r(   rO   r   r   r   )rf   r   r   r   ptr   
extend_lenr   next_draft_inputr   r   r   r;   r;   r<   _draft_extend_for_prefill  s0   

	z*EagleDraftWorker._draft_extend_for_prefillbatch_resultc                 C   sP  t |jj| jd | jd d}tjt|j| jd| j	 |j
 d }| j |||j| j	| j| j}W d    n1 s=w   Y  | jrQt| j | j |jjd u r\|j
|j_| jod| j|}|rn| j|}n	| jj|ddj}|j| |_|j| |_tj|jdd}t|| jdd\}	}
|j}|j}|	|
||_|_|_d S )Nr   )r   r   r   r   Tr   r   r   ) r   r   r   rP   r3   arangero   r   r.   rQ   accept_lensr:   (prepare_for_extend_to_fill_draft_kvcacher   rZ   r   r9   r4   current_streamwait_streamr   accept_lengthcan_runr   r   r   r   r(   rO   r   r   r   )rf   r   r   r   select_indexr   r   draft_logits_outputr   
ret_topk_pret_topk_indexret_hidden_statesr   r;   r;   r<   _draft_extend_for_decode  st   	

z)EagleDraftWorker._draft_extend_for_decodeN)__name__
__module____qualname__r   intr   ri   r_   r`   rb   rc   r   r   r   r   r   r3   Tensorr   r   r   r;   r;   r;   r<   r>   Q   sN    	

_9OO
0r>   c                   @   s   e Zd Zdedededee dedededed	efd
dZedd Z	edd Z
dd ZdefddZdefddZdedejdejfddZdefddZdS ) EAGLEWorkerV2r?   r@   rA   rB   rC   rD   rE   rF   rG   c
           
   
   C   s   || _ |j| _|j| _|j| _|j| _|| _|| _|j| _|	| _	|j
| _
t|j| _|	 \| _| _|	jjj|_t|||||||||		| _tjdtj| jd| _tjdtj| jd| _t| j\| _| _d S )Nr;   dtyper.   )r?   rN   rO   rP   rQ   r   rA   r@   r.   _target_worker	page_sizer    rR   rS   rU   rJ   rK   rY   r]   context_lencontext_lengthr>   _draft_workerr3   r   int64num_new_pages_per_topkextend_lensr=   r9   r:   )
rf   r?   r@   rA   rB   rC   rD   rE   rF   rG   r;   r;   r<   ri   H  s@   

zEAGLEWorkerV2.__init__c                 C      | j S r0   )r  rq   r;   r;   r<   rG   ~     zEAGLEWorkerV2.target_workerc                 C   r  r0   )r
  rq   r;   r;   r<   rX     r  zEAGLEWorkerV2.draft_workerc                 C   r   r0   r;   rq   r;   r;   r<   clear_cache_pool  s   zEAGLEWorkerV2.clear_cache_poolr   c              
   C   sx  |j  s|jr|tj|_| j|}tj|_| j	
| j	jjS t 7 t $ | j	||jj|j|_|W  d    W  d    W  d    S 1 sNw   Y  W d    n1 s]w   Y  W d    d S W d    d S 1 suw   Y  d S |jd u rtj| j| jjj| jjj| jtjd|_| j	
| j	jj4 t ! t  | j	|}W d    n1 sw   Y  W d    n1 sw   Y  W d    n1 sw   Y  | sJ ||_| |}| j	
| j	jj@ t # t  | j	|| W d    n	1 sw   Y  W d    n1 sw   Y  W d    |S W d    |S 1 s5w   Y  |S )N)r.   hidden_sizer  rO   r   ) r   	is_extendis_extend_in_batchr   FULLr   rG   forward_batch_generationLASTrX   r"   rZ   ra   r   r   r   r   r   r   r   r   r   r   r.   r]   r  r  rO   r   is_verify_inputverifyr   )rf   r   batch_outputverify_inputr;   r;   r<   r    s   
  
* z&EAGLEWorkerV2.forward_batch_generationr   c                 C   s  |j t| j  |j}| jd |_t	|j }| j
 || j|| j\}}W d    n1 s3w   Y  | jrXt| j | j | jjj||rU| jjjjnd  |jro|j }|j }|j|jj }| jjd |ddd}	|	j}
d }|jrt|j|||||j j!}|d ur|j"d usJ |#|jj}d |j _$| j%rt&|
 |'||
|\}}}|j | }t| j( }|)  |j*+ s|| }tj,|tj-d}t.|f |||| j/ n
tj0d| jtj-d}t1|||d}t2|
||||dS )	Nr   T)r   r   	is_verifyr   )r  )r   )r.   r  )r   r   verify_done)r   r   can_run_cuda_graphr   r   )3r   record_streamr3   r4   r.   r   r   rP   r   ro   r:   prepare_for_v2_verifyrJ   rG   r9   r   rY   r   )update_verify_buffers_to_fill_after_draftgraph_runnerbshas_grammarr   cpur   r   viewr   r  r   r#   reqssampling_info
vocab_sizegrammarrx   
vocab_maskr   r!   sampleEventrecordr   r   
empty_likeint32r   rQ   r   r   r   )rf   r   r  r"  verify_forward_batchr  retrieve_next_token_cpuretrieve_next_sibling_cpudraft_tokens_cpuforward_batch_outputr   r*  predictr   accept_indexr   r  all_verified_idr   r   r;   r;   r<   r    s   





	

zEAGLEWorkerV2.verifyr6  r   c              	   C   s   t |j}|| j }tj|tj| jd}tj|tj| jd}t|f |j| j	j
|j|j| || j	j
jd t| t|f ||j|t| | j || dS )z
        Move accepted tokens to the target KV cache.

        Args:
            batch: The batch to run.
            accept_index: The index of the accepted tokens.
            accept_length: The length of the accepted tokens.
        r  r   N)ro   r   rQ   r3   zerosr  r.   r   req_pool_indicesrJ   req_to_tokenr   r,   r   r   rK   get_kvcachemove_kv_cache)rf   r   r6  r   r"  sizetgt_cache_locaccepted_out_cache_locr;   r;   r<   &move_accepted_tokens_to_target_kvcache2  s8   


	
z4EAGLEWorkerV2.move_accepted_tokens_to_target_kvcacherecv_reqc                 C   s\   t   t|j| j }| jjj||jd\}}|s||fS | j	j
j||jd\}}||fS )N)named_tensorsload_format)r-   r&   deserializeserialized_named_tensorsrA   rX   rZ   update_weights_from_tensorrC  rG   rY   )rf   rA  rB  successmessager;   r;   r<   rF  ^  s   


z(EAGLEWorkerV2.update_weights_from_tensorN)r   r   r   r   r  r   r   ri   propertyrG   rX   r  r   r  r  r3   r  r@  r   rF  r;   r;   r;   r<   r  G  sF    	

6

2v
,r  )Tr7   loggingr   typingr   r   r   r3   sglang.srt.environr   Psglang.srt.hardware_backend.npu.graph_runner.eagle_draft_extend_npu_graph_runnerr   Isglang.srt.hardware_backend.npu.graph_runner.eagle_draft_npu_graph_runnerr   *sglang.srt.layers.attention.triton_backendr   .sglang.srt.layers.attention.trtllm_mla_backendr	   sglang.srt.layers.dp_attentionr
   sglang.srt.layers.moe.utilsr   r   sglang.srt.managers.io_structr   "sglang.srt.managers.schedule_batchr   sglang.srt.managers.schedulerr   sglang.srt.managers.tp_workerr   ,sglang.srt.model_executor.forward_batch_infor   r   sglang.srt.server_argsr   'sglang.srt.speculative.base_spec_workerr   r   "sglang.srt.speculative.draft_utilsr   4sglang.srt.speculative.eagle_draft_cuda_graph_runnerr   ;sglang.srt.speculative.eagle_draft_extend_cuda_graph_runnerr   !sglang.srt.speculative.eagle_infor   r   $sglang.srt.speculative.eagle_info_v2r   r   r   "sglang.srt.speculative.eagle_utilsr   r    sglang.srt.speculative.spec_infor    !sglang.srt.speculative.spec_utilsr!   r"   r#   r$   r%   sglang.srt.utils.commonr&   r'   r(   r)   r*   r+   r,   sglang.srt.utils.patch_torchr-   r   r   	getLoggerr   rl   stranyAbstractContextManagerr=   r>   r  r;   r;   r;   r<   <module>   sT    $	

   y