o
    پi                     @   s  d dl Z d dlZd dlmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZmZ d dl m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z= d dl>m?Z?m@Z@mAZAmBZBmCZCmDZD d dlEmFZF eC ZGeB rd dlHmIZI e JeKZLG dd deZMejNdeGddejOdejOd ePfd!d"ZQdS )#    N)ListOptionalTuple)get_tp_group)EAGLEDraftNpuGraphRunner)get_attention_tp_group)LogitsProcessorOutput)#speculative_moe_a2a_backend_contextspeculative_moe_backend_context)add_output_logprobs_for_spec_v1)UpdateWeightsFromTensorReqInput)ScheduleBatch)GenerationBatchResult)TpModelWorker)alloc_paged_token_slots_extendalloc_token_slotsget_last_loc)CaptureHiddenModeForwardBatchForwardMode)
ServerArgs)DraftBackendFactory)EAGLEDraftCudaGraphRunner)EAGLEDraftExtendCudaGraphRunner)EagleDraftInputEagleVerifyInputEagleVerifyOutput)build_tree_kernel_efficientorganize_draft_results)SpeculativeAlgorithm)assign_draft_cache_locs
detect_nandraft_tp_context	fast_topkgenerate_token_bitmask(get_last_loc_large_page_size_large_top_kload_token_mapselect_top_k_tokens)MultiprocessingSerializerempty_contextget_available_gpu_memoryis_cudais_npunext_power_of_2)monkey_patch_torch_reductions)segment_packbitsc                       sz  e Zd Zdedededee dedededed	ef fd
dZdd Zdd Z	e
dd ZdedefddZdefddZdedeeejeeej f fddZdefddZdefddZdefddZd efd!d"Zd#d$ Zded%efd&d'Zded(ed)ed%ed*ejf
d+d,Z	-d<ded.ejd/ejd0eej d1eej f
d2d3Zdefd4d5Z d)ed6e!fd7d8Z"d9e#fd:d;Z$  Z%S )=EAGLEWorkerserver_argsgpu_idtp_rankdp_rankmoe_ep_rankattn_cp_rankmoe_dp_rank	nccl_porttarget_workerc
                    s\  || _ |j| _|j| _|j| _|j| _|| _|j| _|	| _|j	| _	t
|j| _|	jjj|_|j}
d|_|	 \| _| _| j rO|jd urKtd d | _n|jd uret|j| _dt| j d|_nd | _|jrv| j rvtt }nt  }|B t! / t"  t# j$|||d|||||d| j| jd W d    n1 sw   Y  W d    n1 sw   Y  W d    n1 sw   Y  | jjj%& \}}| j rt'| j(j%dr| j(j%j)r| j(j%*|| n| j(j%+| | j(j%jd ur| j(j%j,|j| _n!| jd ur|- }| j,|j| _|j.| j |_.| j(j%*|| |
| j(j _|jr/tnt | _d| _/| j rNd| _/t0| j(jj1d	i }|2d
d| _/| | j(j38 t! $ t"  | 4  | 5  W d    n	1 sqw   Y  W d    n	1 sw   Y  W d    n	1 sw   Y  t6j7dt6j8| jd| _9t6j7dt6j8| jd| _:d S )NTzgSpeculative token map specified, but EAGLE3 models already have this. Ignoring the specified token map.z{"hot_vocab_size": }r   )r1   r2   r3   pp_rankr4   r5   r6   r7   r8   is_draft_workerreq_to_token_pooltoken_to_kv_pool_allocatorload_lm_head_from_targetFeagle_configuse_aux_hidden_state dtypedevice);r1   speculative_eagle_topktopkspeculative_num_stepsspeculative_num_draft_tokensenable_nan_detectionr2   rE   r9   	page_sizer   from_stringspeculative_algorithmmodel_runnermodel_configcontext_lencontext_lengthdisable_cuda_graphget_memory_poolr=   r>   	is_eagle3speculative_token_maploggerwarninghot_token_idr&   lenjson_model_override_argsenable_dp_attentionr"   r   r)   r
   r	   super__init__modelget_embed_and_headhasattrdraft_model_runnerr?   set_embed_and_head	set_embedtoclonedataeagle_use_aux_hidden_stategetattr	hf_configgettp_groupinit_attention_backendinit_cuda_graphstorchemptyint64num_new_pages_per_topkextend_lens)selfr1   r2   r3   r4   r5   r6   r7   r8   r9   backup_disable_cuda_graphctxembedheadr@   	__class__rB   W/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/speculative/eagle_worker.pyr]   P   s   



  

  
zEAGLEWorker.__init__c                 C   s8   t | j| j| j| j}| | _| | _| j| j_d S N)	r   r1   ra   rG   rH   create_decode_backenddraft_attn_backendcreate_draft_extend_backenddraft_extend_attn_backend)rs   draft_backend_factoryrB   rB   rz   rl      s   
z"EAGLEWorker.init_attention_backendc              	   C   s.  d| _ d| _| jjrdS ttd}| jdkrSt }t	| j
| j}td|dd || jj
 | | _ t	| j
| j}tdt | dd|| dd	|dd
 | jrtst }t	| j
| j}td|dd t| | _t	| j
| j}tdt | dd|| dd	|dd
 dS dS dS )zCapture cuda graphs.N)npucuda   zOCapture draft cuda graph begin. This can take up to several minutes. avail mem=z.2fz GBz,Capture draft cuda graph end. Time elapsed: z s. mem usage=z GB. avail mem=z GB.zVCapture draft extend cuda graph begin. This can take up to several minutes. avail mem=z3Capture draft extend cuda graph end. Time elapsed: )cuda_graph_runner"cuda_graph_runner_for_draft_extendr1   rR   r   r   rH   timeperf_counterr*   rE   r2   rV   infor9   r   _is_npur   )rs   Device2DraftCudaGraphRunnertic
before_mem	after_memrB   rB   rz   rm      sH   
(
(zEAGLEWorker.init_cuda_graphsc                 C   s   | j S r{   )rN   rs   rB   rB   rz   ra     s   zEAGLEWorker.draft_model_runnerbatchreturnc           	   
   C   s  |j  s|jr`| |\}}}| | jj9 t & t  | 	||j
|||j W d   n1 s5w   Y  W d   n1 sDw   Y  W d   n1 sSw   Y  t||dddS | | jj3 t   t  | |}W d   n1 sw   Y  W d   n1 sw   Y  W d   n1 sw   Y  | ||\}}}}| | jj@ t - t  | jjs|jjjd dkr| | W d   n1 sw   Y  W d   n1 sw   Y  W d   n1 sw   Y  t||jt|j|j|dS )a  Run speculative decoding forward.

        NOTE: Many states of batch is modified as you go through. It is not guaranteed that
        the final output batch have the same state as the input.

        Args:
            batch: The batch to run forward. The state of the batch is modified as it runs.
        Returns:
            A tuple of the final logit output of the target model, next tokens accepted,
            the batch id (used for overlap schedule), and number of accepted tokens.
        Nr   F)logits_outputnext_token_idsnum_accepted_tokenscan_run_cuda_graph)r   r   r   accept_length_per_req_cpur   )forward_mode	is_extendis_extend_in_batchforward_target_extendr"   ra   rk   r
   r	   forward_draft_extendhidden_statesmm_input_embedsr   draftverifyr1   r[   	spec_infoverified_idshape!forward_draft_extend_after_decodesumr   )	rs   r   r   r   seq_lens_cpur   verify_outputmodel_worker_batchr   rB   rB   rz   forward_batch_generation  s   
  
  


  z$EAGLEWorker.forward_batch_generationc                 C   s\   |j jjd dk}| jjs|S tj|gtjd}tjj	|t
 jd |d  }|dk}|S )Nr   )rD   )group)r   r   r   r1   r[   rn   tensorrp   distributed
all_reducer   	cpu_groupitem)rs   r   local_need_forwardglobal_need_forwardglobal_need_forward_cntneed_forwardrB   rB   rz   'check_forward_draft_extend_after_decodeS  s   z3EAGLEWorker.check_forward_draft_extend_after_decodec                 C   s6   |  }tj|_| j|}|j|j}}|||jfS )a  Run the target extend.

        Args:
            batch: The batch to run. States could be modified.

        Returns:
            logits_output: The output of logits. It will contain the full hidden states.
            next_token_ids: Next token ids generated.
        )	get_model_worker_batchr   FULLcapture_hidden_moder9   r   r   r   r   )rs   r   r   batch_resultr   r   rB   rB   rz   r   e  s   z!EAGLEWorker.forward_target_extendc                 C   s  |   |jD ]	}| jd7  _q| }|j}|jjjr)|jj|j	
tj | jdkrA| j| j }t|j|| dd\}}nx| jdkrdt|jj|j|j| j\}}	}
|j}|j| j }|| j }nFt|jj|j|j| j| j| j\}}	}
| _| _}|j}|| j }|| j | j d | j }|| j | j || j| j   }t||  }t|j|||	||
|dd\}}| jdkr| jdkrtj|dd}t| | jd  }tj|tj | j!d}tj|tj | j!d}nd}d\}}}t"|f |j|jj|j| j| j||||||jjj#d | j| j| jt$|t$| j| j  | jdkrC| jdkrC|dkr7| j%j&'|| |d || j | j  }||_(t|j |_)d|_*|jj+| jdd|_,| j-.| d S )	Nr   T)backup_stater   dimrC   )NNNF)/maybe_evict_swareqsdecode_batch_idx
batch_sizer   sampling_infopenalizer_orchestratoris_requiredcumulate_output_tokensr   rd   rn   rp   rK   rH   rG   r   
tree_cache$get_last_loc_large_page_size_top_k_1r=   req_to_tokenreq_pool_indicesseq_lensr   r%   rq   rr   r   r   r   cumsumzerosint32rE   r    r   r-   ra   token_to_kv_poolmove_kv_cacheout_cache_locseq_lens_sumreturn_hidden_statesrepeat_interleave	positionsr>   restore_state)rs   r   reqnum_seqsr   alloc_len_per_decoder   token_to_kv_pool_state_backupprefix_lensr   last_locprefix_lens_cpur   extend_num_tokenslast_page_lenslast_page_lens_cpurq   last_page_lens_cumsumduplicate_cache_lentarget_cache_locsource_cache_locrB   rB   rz   _draft_preprocess_decode  s   




	




z$EAGLEWorker._draft_preprocess_decodec                 C   s(   t j| j| jj| jj| jtjd|_	d S )NrE   hidden_sizerD   rG   r   )
r   create_idle_inputrE   rO   r   rD   rG   r   LASTr   )rs   r   rB   rB   rz   _draft_preprocess_idle  s   z"EAGLEWorker._draft_preprocess_idlec                 C   sV  |j  r| | n| | |j}t|tsJ tj|_	| j
|_| j
|_d|_| }|j	tjks5J t|| j}| joD| j|}|rQ| j|\}}}nd|_|j  sd| jdkrd| j| | |\}}}|j  r{t| j
| j| jS t|j||||j|j | j
| j| j	\}	}
}}}}t||	|
|||d | j| j
| j!jtj"|j |j#dS )NFr   )draft_tokencustom_maskr   retrive_indexretrive_next_tokenretrive_next_siblingretrive_cum_len
spec_stepsrG   draft_token_numr   r   r   )$r   is_idler   r   r   
isinstancer   r   r   r   rG   num_tokens_per_reqnum_tokens_for_logprob_per_reqr   r   r   init_newra   r   can_runreplaycan_run_dp_cuda_graphrH   r}   init_forward_metadatadraft_forwardr   r   rI   r   r   r   r   r1   r   r   )rs   r   r   r   forward_batchcan_cuda_graphparent_listtop_scores_indexdraft_tokens	tree_maskpositionr   r   r   rB   rB   rz   r     s   




zEAGLEWorker.draftr   c                 C   s  |j }t|ts
J |j}|j|j|j}}}| jd ur"| j| }||j	| j
| j}|d| jd}g }g }g }	d }
t| jD ]}t|||||
| j
\}}}
}||d  ||d  |	|d  || jd krq n\||_| jjdkr| jjjd dkr| }|| |_|jd | jj| |_||_| jj|dd	j}| jjrt | t!j"|j#dd
}t$|| j
dd
\}}| jd ur| j| }|j}qCt%|||	| j&\}}}|||fS )N)   r   r   r   r   r   
STANDALONEGptOssForCausalLMTskip_attn_backend_initr   )'r   r   r   r   topk_p
topk_indexr   rX   reshaper   rG   rH   permuteranger'   append	input_idsr1   rM   rO   ri   architectures
contiguousr   add_r}   attn_backendsattn_backendra   forwardr   rJ   r!   rn   softmaxnext_token_logitsr#   r   rI   )rs   r   r   r   r  r  r   
score_list
token_listparents_listscoresir  	tree_infor   probsr   r   r   rB   rB   rz   r   g  sh   









zEAGLEWorker.draft_forwardc                 C   s   d S r{   rB   r   rB   rB   rz   clear_cache_pool  s   zEAGLEWorker.clear_cache_poolr   c                 C   s  |j  }||| j | jd |_d|_|j st	j
nt	j|_||_|j|jd}|j|jks3J |jrJ|j }|j }|j|jj }| jj|dd}|j|j}	}
d }|jrt|j|||||jj}|d ur|jd usuJ | |jj!}d |j_"| j#rt$|	 |	j%|_%|&||	| j'| j|}|	j(|j) |	_(|	j%|j) |	_%| jj*j+d us| jj*j,d us| jj*j-d ur| .|||	|| |j/rt0|||	 |j st	j1nt	j|_|j2|_|	|||
fS )Nr   Fseq_lens_cpu_cacheT)	is_verify)3r   re   prepare_for_verifyrK   rH   r   r   r   r   r   TARGET_VERIFYIDLEr   r   r   r   has_grammarr   cpur   r   viewr   r9   r   r   r   r$   r   r   
vocab_sizegrammarrd   rE   
vocab_maskrJ   r!   r   r   r>   r  accepted_indicesrN   hybrid_gdn_configmamba2_confighybrid_lightning_config_mamba_verify_updatereturn_logprobr   DECODEdraft_input)rs   r   r   seq_lens_pre_verifyr   retrieve_next_token_cpuretrieve_next_sibling_cpudraft_tokens_cpur   r   r   r'  resrB   rB   rz   r     s   


	

zEAGLEWorker.verifyr4  r   r0  c                 C   s>  t j|j|jjt jdd }t j|dd}t t jd|j	|jd|d d g}t j
dt|j|jj |jj|j	|jd}	|jdkrT|jjd dkrT|j|d  |	 }
n|d }
|jd ur| jj}|| |j| k}|j| | }t j|| d dd}t ||j||  |	 d}nd }| jjjj|
|j|| jjjd	 d S )
N)rE   rD   r   r   r   rC   r   )steprD   rE   )min)accepted_stepsmamba_track_indicesmamba_steps_to_trackr^   )rn   r   r   r   rE   rp   r   catr   rD   arangerY   r   r   r   rG   r(  r   r8  r1   mamba_track_intervalclampwherer9   rN   r  #update_mamba_state_after_mtp_verifyr^   )rs   r   r4  r   r   r0  accepted_lengthcumulative_accepted_lengthsaccepted_indices_startaccepted_indices_offsetr7  r<  to_track_masktracking_pointto_track_ithr9  rB   rB   rz   r,    sl   	





z EAGLEWorker._mamba_verify_updateNr   r   r   r   c           	      C   s   t ||ddd|_d|_|j| tj|j_|j|d}t	|| j
}d|_|dur.||_| j
|j}| jr<t| t|jt sDJ |j|ju sLJ | ||j dS )a  Run draft model extend. This API modifies the states of the batch.

        Args:
            batch: The batch to run.
            hidden_states: Hidden states from the target model forward
            next_token_ids: Next token ids generated from the target forward.
        r   )r   r   r   r   Fr  N)r   r   r   prepare_for_extendr   r   r   r   r   r   ra   r-  r   r  r   rJ   r!   r   capture_for_decode)	rs   r   r   r   r   r   r   r   r   rB   rB   rz   r   ^  s0   
z EAGLEWorker.forward_draft_extendc                 C   s  t |jtsJ |j }|j }|j}|jj}|j}|j	
 }|sU|jj dkrU| }|  | j rA| jrA| jjd n| jj}tj| j|| jj| jtjd|_| jd |j_d|j_|j|| j |j	
 sptjntj |_	d|_!|" }	|	j#tjksJ t$%|	| j&}
|
jd ur|
j' ( |
_)n|j' ( |
_)| j*o| j*+|
}|r| j*,|
}|j-|j.|
j_-|
j_.|j/|
j_/nd|
_0|
j	
 s| j&j12|
 | j&j3|
ddj4}| 5||
j | j6rt7| |stj8ntj |_	||_||_||_||j_||_d S )Nr      r   r   FTr  )9r   r   r   r   re   r   r   accept_lengthr-  r   r   r   numelcopyprepare_for_idlerM   rT   rg   rO   r   r   rE   rD   rG   r   r   rH   r   r   prepare_extend_after_decoder   DRAFT_EXTENDr!  r   r   r   r   r   ra   r   r   r   r   r   r   r  r  r   r   r  r   r  r   rH  rJ   r!   r.  )rs   r   seq_lens_backupseq_lens_cpu_backupreq_pool_indices_backupaccept_length_backupreturn_logprob_backupinput_is_idler   r   r   r   r   rB   rB   rz   r     s   






z-EAGLEWorker.forward_draft_extend_after_decoder/  c                 C   s4   t j|jdd}t|| jdd\|_|_|j|_d S )Nr   r   )rn   r  r  r#   rG   r  r  r   )rs   r   r/  r  rB   rB   rz   rH    s   zEAGLEWorker.capture_for_decoderecv_reqc                 C   sZ   t   t|j| j }| jj||jd\}}|s||fS | jjj||jd\}}||fS )N)named_tensorsload_format)	r.   r(   deserializeserialized_named_tensorsr3   rN   update_weights_from_tensorrX  r9   )rs   rV  rW  successmessagerB   rB   rz   r[    s   


z&EAGLEWorker.update_weights_from_tensorr{   )&__name__
__module____qualname__r   intr   r   r]   rl   rm   propertyra   r   r   r   r   r   r   rn   Tensorr   r   r   r   r   r   r  r   r   r   r,  r   r   r   rH  r   r[  __classcell__rB   rB   rx   rz   r0   N   s    	
 *
=
 	SH\
U
(Z
r0   T)dynamicdisabler   r   rH   c                 C   s"   |}|| }t | ||}|||fS r{   )r   )r   r   r   rH   r   r   rB   rB   rz   r     s   
r   )Rloggingr   typingr   r   r   rn   sglang.srt.distributedr   Isglang.srt.hardware_backend.npu.graph_runner.eagle_draft_npu_graph_runnerr   sglang.srt.layers.dp_attentionr   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moe.utilsr	   r
   sglang.srt.layers.utils.logprobr   sglang.srt.managers.io_structr   "sglang.srt.managers.schedule_batchr   sglang.srt.managers.schedulerr   sglang.srt.managers.tp_workerr   sglang.srt.mem_cache.commonr   r   r   ,sglang.srt.model_executor.forward_batch_infor   r   r   sglang.srt.server_argsr   "sglang.srt.speculative.draft_utilsr   4sglang.srt.speculative.eagle_draft_cuda_graph_runnerr   ;sglang.srt.speculative.eagle_draft_extend_cuda_graph_runnerr   !sglang.srt.speculative.eagle_infor   r   r   "sglang.srt.speculative.eagle_utilsr   r    sglang.srt.speculative.spec_infor   !sglang.srt.speculative.spec_utilsr    r!   r"   r#   r$   r%   r&   r'   sglang.srt.utilsr(   r)   r*   r+   r,   r-   sglang.srt.utils.patch_torchr.   r   
sgl_kernelr/   	getLoggerr^  rV   r0   compilerc  ra  r   rB   rB   rB   rz   <module>   s\    ( 

       3