o
    
۾i*                     @   s   d dl Z d dlZd dlZd dlmZmZmZmZm	Z	 d dl
mZ G dd dZeddded	ejd
ejdededededejdejfddZedddejdededededejfddZdS )    N)get_num_threadsjitnjitprangeset_num_threads)
VllmConfigc                   @   s   e Zd ZdefddZdededejdejdeee  f
d	d
Z		ddeee  dejdejde
eejf ee
eejf  B dB deee  f
ddZdd ZdS )NgramProposervllm_configc                 C   s   |j d usJ |j jd usJ |j jd usJ |j j| _|j j| _|j j| _|jj| _|j	j
}tj|| jftjd| _tj|tjd| _d| _|jj}t }|ratd|d | _|  j|  _nd| _| g gd tjdtjdtjd| jftjd d S )Ndtypei          i   )speculative_configprompt_lookup_minprompt_lookup_maxmin_nmax_nnum_speculative_tokenskmodel_configmax_model_lenscheduler_configmax_num_seqsnpzerosint32valid_ngram_draftvalid_ngram_num_draftsnum_tokens_thresholdparallel_configtensor_parallel_sizeos	cpu_countminnum_numba_thread_availablepropose)selfr	   r   tp_sizer"    r(   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/spec_decode/ngram_proposer.py__init__   s,   



zNgramProposer.__init__num_requestsvalid_ngram_requestsnum_tokens_no_spectoken_ids_cpureturnc              
   C   s   g }t | }r=t }t|}|| jkr#tdt| j|}	t|	 ntd t	|||| j
| j| j| j| j| j	 t| t|D ]$}
|
|v r`| j|
 dkr`|| j|
d| j|
 f   qA|g  qA|S )a  Batch version of ngram proposer using numba for acceleration.

        Args:
            valid_ngram_requests:
                Set of indices of requests that need ngram proposals.
            num_tokens_no_spec:
                Numpy array of shape (batch_size,) representing the number
                of tokens without speculative tokens for each request.
            token_ids_cpu:
                Numpy array of shape (batch_size, max_model_len)
                representing the token IDs for each request.

        Returns:
            list[list[int]]:
                A list where each element is a list of proposed
                token IDs for the corresponding request.
        r   r   N)lenr   r   sumr   maxr#   r$   r   batch_propose_numbar   r   r   r   r   r   rangeappendtolist)r&   r+   r,   r-   r.   draft_token_idsnum_ngram_requestsoriginal_num_numba_threadstotal_tokensfinal_num_threadsir(   r(   r)   batch_propose?   s:   


zNgramProposer.batch_proposeNsampled_token_idsslot_mappingsc                 C   sZ   g }t |D ]\}}t|}|sq|| }	|	| jkrq|| q| t||||}
|
S N)	enumerater0   r   r5   r=   )r&   r>   r-   r.   r?   r,   r<   sampled_idsnum_sampled_ids
num_tokensr7   r(   r(   r)   r%      s    

zNgramProposer.proposec                 O   s   d S r@   r(   )r&   argskwargsr(   r(   r)   
load_model   s   zNgramProposer.load_modelr@   )__name__
__module____qualname__r   r*   intlistr   ndarrayr=   dictstrtorchTensorr%   rG   r(   r(   r(   r)   r      s<    2

K


!r   T)parallelr,   r-   r.   r   r   r   r   r   r   c	                 C   st   t t| D ]1}	| |	 }
||
 }||
d |f }t|||||d}|jd ||
< t|r7|||
d |jd f< qd S )N)origin_tokens	min_ngram	max_ngramr   r   r   )r   r0   ._find_longest_matched_ngram_and_propose_tokensshape)r,   r-   r.   r   r   r   r   r   r   r<   idxrD   context_token_idsdrafter_outputr(   r(   r)   r3      s    r3   )nopythonrS   rT   rU   r/   c                 C   s@  | j d }||k rtjd| jdS t||| }|dkr$tjd| jdS | ddd }tj|tjd}d}d}	d}
d}||k r}||
 || krj|
d7 }
|
|krS|
}|}	||k r[|
||< |
|kre||d  }
|d7 }n|
dkru||
d  }
n|d7 }||k s?||k rtjd| jdS |d |	 | }t||| }| |||  S )z
    Find the longest n-gram which matches the suffix of the given tokens
    whose length is within [min_ngram, max_ngram] (inclusive).

    If found, we will extract k right after the matched ngram.
    r   )r   r
   Nr   )rW   r   emptyr   r#   r   r   )rS   rT   rU   r   r   total_tokentokenslpslongest_ngrampositionprev_lpsr<   start_positionr(   r(   r)   rV      s@   

"rV   )r!   numpyr   rP   numbar   r   r   r   r   vllm.configr   r   rL   rM   rK   r3   rV   r(   r(   r(   r)   <module>   sR    	