o
    پiG-                     @   s   d dl Z d dlmZmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ e eZdZ G dd dZ!dS )    N)ListOptional)"reconstruct_indices_from_tree_mask)add_output_logprobs_for_spec_v1)ScheduleBatch)GenerationBatchResult)TpModelWorker)ForwardMode)
ServerArgs)
NgramCache)NgramVerifyInput)SpeculativeAlgorithm)generate_token_bitmaskTc                   @   s   e Zd Zdedededee dedededed	efd
dZdd Zde	e de	e defddZ
dd Zdedeejejf fddZdefddZdefddZdedefddZdS ) NGRAMWorkerserver_argsgpu_idtp_rankdp_rankmoe_ep_rankattn_cp_rankmoe_dp_rank	nccl_porttarget_workerc
           
   	   C   s   |	| _ |	j| _|| _|j| _|j| _|j| _|j| _	|	j
| _|dkr'd| nd| _|   t|j|j|j|j|j|j|jd| _d S )Nr   zcuda:cuda)min_match_window_sizemax_match_window_sizemin_bfs_breadthmax_bfs_breadthcapacitybranch_lengthdraft_token_num)r   model_runnerr   	page_sizespeculative_num_draft_tokensr    speculative_ngram_branch_lengthr   'speculative_ngram_max_match_window_sizer   max_running_requestsmax_batch_sizedevice_init_preallocated_tensorsr   'speculative_ngram_min_match_window_size!speculative_ngram_min_bfs_breadth!speculative_ngram_max_bfs_breadthspeculative_ngram_capacityngram_cache)
selfr   r   r   r   r   r   r   r   r    r0   W/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/speculative/ngram_worker.py__init__   s(   zNGRAMWorker.__init__c                 C   s   | j   d S N)r.   reset)r/   r0   r0   r1   clear_cache_pool?   s   zNGRAMWorker.clear_cache_poolseq1seq2nc                 C   s8   t |}||kr|| d  S || }|| d  | S r3   )len)r/   r6   r7   r8   seq2_lenneed_from_seq1r0   r0   r1   _efficient_concat_last_nB   s
   z$NGRAMWorker._efficient_concat_last_nc                 C   s  | j | j }| j | j | j }tj|ftj| jd| _tj| j | jftj| jd| _tj| j | jftj| jd| _tj| j | jftj| jd| _	tj|ftj| jd| _
tj|ftj| jd| _g | _g | _g | _g | _g | _g | _td| j d D ]\}| j| jd |d d f  | j| jd |d d f  | j| j	d |d d f  | j| j
d || j   | j| jd || j   | j| jd || j | j   qzd S )N)dtyper(   r      )r'   r    torchemptyint64r(   draft_tokensretrieve_indexesretrive_next_tokenretrive_next_sibling	positionsbool	tree_maskdraft_tokens_batchtree_mask_batchretrieve_indexes_batchretrive_next_token_batchretrive_next_sibling_batchpositions_batchrangeappend)r/   max_total_draftsmax_total_mask_sizebsr0   r0   r1   r)   J   sZ   


z&NGRAMWorker._init_preallocated_tensorsbatchreturnc           	      C   s   |  }| j  g }|jD ]}| |j|j| j}|| q| j	|\}}t
|}||| j ksAJ d|d|d| j||fS )Nztotal_draft_token_num=z, bs=z, self.draft_token_num=)
batch_sizer.   synchronizereqsr<   origin_input_ids
output_idsr   rP   	batch_getr9   r    )	r/   rT   rS   batch_tokensreqcheck_token
req_draftsmasktotal_draft_token_numr0   r0   r1   _prepare_draft_tokens|   s   

z!NGRAMWorker._prepare_draft_tokensc              	   C   s  |j  rd S | }| j| }| j| }| j| }| j| }| j| }| j| }| 	|\}	}
|j
t|
dd |j
t|	dd t||j|||||| j trg }|
| | j| j}
t|jD ]5\}}t|jt|j }t| j|d f }tj|t|
|  fddtj}||  qetj|dd}tj|_ t!j"|_ t#||||||| j|_$|j$%|| j& d S )NT)non_blockingr>   )dimr   )'forward_mode	is_extendrV   rK   rL   rM   rN   rJ   rI   rb   copy_r?   
from_numpyr   seq_lensr    USE_FULL_MASKreshape	enumeraterX   r9   rY   rZ   onesr   cattorG   rP   flattenr   NGRAMspec_algorithmr	   TARGET_VERIFYr   	spec_infoprepare_for_verifyr"   )r/   rT   rS   retrive_indexrD   rE   rF   rH   rB   r_   r`   ir]   seq_lenreq_maskr0   r0   r1   !_prepare_for_speculative_decoding   sb   






	z-NGRAMWorker._prepare_for_speculative_decodingc                 C   s>   g }|j D ]}| |j|j| j}|| q| j| d S r3   )rX   r<   rY   rZ   r   rP   r.   	batch_put)r/   rT   r\   r]   put_idsr0   r0   r1   _update_ngram_cache   s   
zNGRAMWorker._update_ngram_cachec                 C   sB  |  | | }|j}d}d }|j r|jr,|j }|j }|j	
|jj }| jj|dd}	|	j|	j}
}|j}d }|jret|j|||||jj}|d ure|jd usZJ ||jj}d |j_|||
| j|\}
}}|j}|jr}t|||
 | | tj|_n| j|}	|	j|	j |	j}
}}t!|
||||dS )Nr   T)	is_verify)logits_outputnext_token_idsnum_accepted_tokenscan_run_cuda_graphaccept_lens)"rz   get_model_worker_batchrt   re   is_target_verifyhas_grammarrD   cpurE   draft_tokenviewshaper   forward_batch_generationr   r   r   rX   sampling_info
vocab_sizegrammarro   r(   
vocab_maskverifyr"   accept_lengthreturn_logprobr   r}   r	   DECODEr   r   )r/   rT   model_worker_batchrt   r   r   retrieve_next_token_cpuretrieve_next_sibling_cpudraft_tokens_cpubatch_resultr   r   verify_inputr   r   r0   r0   r1   r      sr   



	




z$NGRAMWorker.forward_batch_generationN)__name__
__module____qualname__r
   intr   r   r2   r5   r   r<   r)   r   tuplenpndarrayrb   rz   r}   r   r   r0   r0   r0   r1   r      s>    	

%2
9r   )"loggingtypingr   r   numpyr   r?   sgl_kernel.speculativer   sglang.srt.layers.utils.logprobr   "sglang.srt.managers.schedule_batchr   sglang.srt.managers.schedulerr   sglang.srt.managers.tp_workerr   ,sglang.srt.model_executor.forward_batch_infor	   sglang.srt.server_argsr
   ,sglang.srt.speculative.cpp_ngram.ngram_cacher   !sglang.srt.speculative.ngram_infor    sglang.srt.speculative.spec_infor   !sglang.srt.speculative.spec_utilsr   	getLoggerr   loggerrj   r   r0   r0   r0   r1   <module>   s$    
