o
    
۾iZ                     @   s2   d dl Z d dlmZ d dlmZ G dd dZdS )    N)
VllmConfig)
InputBatchc                   @   st   e Zd ZdZdefddZ	ddedeee  de	e
ejf ee	e
ejf  B dB d	eee  fd
dZdd ZdS )SuffixDecodingProposerz
    Speculative decoding proposer for Suffix Decoding (https://arxiv.org/pdf/2411.04975).
    This class imports and uses the official implementation from Arctic Inference
    (https://github.com/snowflakedb/ArcticInference).
    vllm_configc                 C   sb   |j }|d usJ d|j| _|j| _|j| _|j| _|jj	| _	ddl
m} ||j|jd| _d S )NzSpeculative config must be setr   )SuffixDecodingCache)max_tree_depthmax_cached_requests)speculative_confignum_speculative_tokenssuffix_decoding_max_tree_depthr   suffix_decoding_max_spec_factormax_spec_factorsuffix_decoding_min_token_probmin_token_probmodel_configmax_model_len arctic_inference.suffix_decodingr   #suffix_decoding_max_cached_requestssuffix_cache)selfr   configr    r   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/spec_decode/suffix_decoding.py__init__   s   
zSuffixDecodingProposer.__init__Ninput_batchsampled_token_idsslot_mappingsreturnc                 C   s8  g }t |D ]\}}|s|g  q|j| }|j| }|| jkr'|g  q|j| }	|| jjvrS|| jjv r>| j	| |j
|	 }
|j|	d|
f }| j|| | j|| td|| j }|j|||f }| jj||t| j| j| d | j| jd}||j q| jj|j  D ]}| j| q|S )a   
        Propose speculative tokens for each request in the input batch. Suffix Decoding
        will speculate a dynamic number of tokens for each request every decoding step,
        so each entry in the returned list may have different lengths.
        Nr      )max_spec_tokensr   r   )	enumerateappendreq_idsnum_tokens_no_specr   req_id_to_indexr   active_requestscached_requestsevict_cached_responsenum_prompt_tokenstoken_ids_cpustart_requestadd_active_responsemaxr   	speculateminr
   r   r   	token_idskeysstop_request)r   r   r   r   draft_token_idsisampled_idsreq_id
num_tokensindexr(   prompt_token_idsstartpatterndraftr   r   r   propose#   sD   







zSuffixDecodingProposer.proposec                 O   s   d S Nr   )r   argskwargsr   r   r   
load_modelc   s   z!SuffixDecodingProposer.load_modelr=   )__name__
__module____qualname____doc__r   r   r   listintdictstrtorchTensorr<   r@   r   r   r   r   r   	   s$    


@r   )rI   vllm.configr   vllm.v1.worker.gpu_input_batchr   r   r   r   r   r   <module>   s   