o
    .iX+                  
   @   s   d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZmZ eeZG dd deZdedefddZdedejdejdedef
ddZejdd ZdS )    )AnyN)	Attention)
VllmConfigget_layers_from_vllm_config)SpeculativeConfig)init_logger)	get_model)tltriton)CommonAttentionMetadataextend_all_queries_by_1)PADDING_SLOT_IDSpecDecodeBaseProposerc                       s   e Zd Z	ddedejf fddZdefddZd	d
 Z	dd Z
dd Zdd Zdd ZdejdejdejdejdB dedejdB deeejef fddZdeddfddZ  ZS )DraftModelProposerNvllm_configdevicec                    s@   t  j||d|d |   |   |   |   |   d S )NF)r   r   pass_hidden_states_to_modelrunner)super__init___raise_if_multimodal_raise_if_mrope'_raise_if_padded_drafter_batch_disabled_raise_if_vocab_size_mismatch_raise_if_draft_tp_mismatch)selfr   r   r   	__class__ \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/v1/spec_decode/draft_model.pyr      s   zDraftModelProposer.__init__returnc                 C   s   |   }|jjS N)_get_attention_metadata_builderkv_cache_spec
block_size)r   builderr   r   r   _block_size)   s   zDraftModelProposer._block_sizec                 C   s   | j rtdd S )NzMSpeculative Decoding with draft models does not support multimodal models yet)supports_mm_inputsNotImplementedErrorr   r   r   r   r   -   s
   z'DraftModelProposer._raise_if_multimodalc                 C   s   | j jrtdd S )NzBSpeculative Decoding with draft models does not support M-RoPE yet)draft_model_config
uses_mroper(   r)   r   r   r   r   4   s
   z"DraftModelProposer._raise_if_mropec                 C   s   | j jjr	tdd S )NzSpeculative Decoding with draft models only supports padded drafter batch. Please don't pass --disable-padded-drafter-batch in the speculative_config.)r   speculative_configdisable_padded_drafter_batchr(   r)   r   r   r   r   :   s
   
z:DraftModelProposer._raise_if_padded_drafter_batch_disabledc                 C   s   | j j  d S r!   )r   r,   &verify_equal_vocab_size_if_draft_modelr)   r   r   r   r   B   s   z0DraftModelProposer._raise_if_vocab_size_mismatchc                 C   s:   | j j}|jj}|jj}||krtd| d| dd S )NzYCurrently, 'draft_tensor_parallel_size' and 'tensor_parallel_size' must be the same. Got z and zE. Please pass 'draft_tensor_parallel_size' in the speculative_config.)r   r,   target_parallel_configtensor_parallel_sizedraft_parallel_config
ValueError)r   spec_cfgtgt_tpdraft_tpr   r   r   r   E   s   z.DraftModelProposer._raise_if_draft_tp_mismatchtarget_token_idsnext_token_idstarget_positionslast_token_indicescadnum_rejected_tokens_gpuc              
   C   s  |  }|f}|jd d }	|jdd  d }
|d ur|
|8 }
|jd | }tj|f| jjtjd}t| |||	|
| j||jd dd t| |||
 d |	|
| j	||jd dd t
|| j	d | ||  | jd}t|| j|d}|jdd  d }|d ur||8 }|||fS )N   r   )r   dtype)target_toks_ptrnext_toks_ptrquery_start_locs_ptrquery_end_locs_ptrout_ptr_merged_toksout_ptr_is_rejected_toktarget_toks_sizerejected_tok_fill)r:   new_positionsis_rejected_token_maskr$   max_model_len)arangenew_slot_mapping)
batch_sizequery_start_locshapetorchempty	input_idsr   boolmerge_toks_kernel	positionscompute_new_slot_mappingr&   rI   r   rJ   )r   r6   r7   r8   r9   r:   r;   rL   grid
start_locsend_locs
num_tokensis_rejected_tokrK   new_cadnew_last_token_indicesr   r   r   set_inputs_first_passV   s\   	

z(DraftModelProposer.set_inputs_first_passtarget_modelc                 C   s   t t| jt }ddlm} t| jd}t	d|j
j|jj|jj |d t|dd| _W d   n1 s:w   Y  t| jt | }t|| _dS )z/Takes target_model to satisfy the type checker.r   )set_model_tag)target_model_vllm_configz/Starting to load draft model %s. TP=%d, rank=%ddraft_model)r   prefixN)setr   r   r   keysvllm.compilation.backendsr_   "create_vllm_config_for_draft_modelloggerinfomodel_configmodelparallel_configr0   rankr   listattn_layer_names)r   r^   target_attn_layer_namesr_   draft_vllm_configdraft_attn_layer_namesr   r   r   
load_model   s*   
zDraftModelProposer.load_modelr!   )__name__
__module____qualname__r   rO   r   r   intr&   r   r   r   r   r   Tensorr   tupler]   r   rr   __classcell__r   r   r   r   r      s:    
Ar   r`   r    c                 C   s0   | }|j jj|jjd}|jd|j j|d}|S )ar  The vllm_config is configured for the target model, e.g.
    its quant_config and parallel_config. But the draft model is potentially
    quantized differently, and has potentially different tensor_parallel_size.
    This function creates a new vllm_config configured for the draft model.
    The vllm_config is useful when loading the draft model with get_model().
    )rl   N)quant_configri   rk   )r,   r1   replacerk   rl   r*   )r`   oldnew_parallel_confignewr   r   r   rf      s   	rf   r:   rG   rH   r$   rI   c                 C   s   | j j\}}tj|| jjd}tj||  d t|d}tj	||d d}|| ||  }	| j 
d|	 }
|| }|
| | }||k}||t ||t |S )N)r   r=   )output_size)maxr<   )block_table_tensorrN   rO   rJ   rM   r   repeat_interleavenaive_query_lenslenclampviewmasked_fill_r   )r:   rG   rH   r$   rI   rL   n_blocks_per_reqreq_indicesclamped_positionsblock_table_indices
block_numsblock_offsetsrK   exceeds_max_model_lenr   r   r   rU      s   rU   c                 C   s(  t d}t || }	|t dd k}
|
r|t j}nt || d t j}t || }t || }t|	|d D ]Q}||krbt | | }t || | | t || | d q@||d kr}t || | | t || | d q@t || | | t || | d q@dS )ar  
    Merges the `target_toks_ptr` and the `next_toks_ptr` into a new tensor
    called `out_ptr_merged_toks`. Rejected tokens are those after the
    `query_end_locs_ptr` and before the next `query_start_locs_ptr`. Fills the
    rejected tokens positions with the value `rejected_tok_fill`. Also fills a mask
    of the rejected tokens in `out_ptr_is_rejected_tok`.
    r   r=   FTN)r	   
program_idloadnum_programstoint32rangestore)r?   r@   rA   rB   rC   rD   rE   rF   pid	start_locis_last_programnext_start_locend_locnew_valiold_valr   r   r   rS      s&   
rS   ) typingr   rO   vllm.attention.layerr   vllm.configr   r   vllm.config.speculativer   vllm.loggerr    vllm.model_executor.model_loaderr   vllm.triton_utilsr	   r
    vllm.v1.attention.backends.utilsr   r   vllm.v1.spec_decode.eagler   r   rs   rg   r   rf   rw   rv   rU   jitrS   r   r   r   r   <module>   s>    "

