o
    i                     @   s  d dl Z d dlmZmZmZmZ d dlmZmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ erad dlmZ d dlm  m  m Z! n	eZede" dZ!ee#Z$ed Z%edde%f Z&eddddde&f Z'eG dd dZ(dS )    N)TYPE_CHECKINGAnyLiteralget_args)FieldSkipValidationmodel_validator)Self)ModelConfig)ParallelConfig)config)init_logger)get_hf_text_config)	safe_hash)
LazyLoaderhas_arctic_inference)PretrainedConfigmodel_executorz'vllm.model_executor.layers.quantization)deepseek_mtpmimo_mtpglm4_moe_mtpglm4_moe_lite_mtpglm_ocr_mtp	ernie_mtpexaone_moe_mtpqwen3_next_mtplongcat_flash_mtpmtppangu_ultra_moe_mtpstep3p5_mtpeagleeagle3ngrammedusamlp_speculatordraft_modelsuffixc                	   @   s  e Zd ZU dZdZedB ed< 	 edddZe	ed< 	 dZ
edB ed< 	 dZedB ed< 	 edd	d
Ze	dB ed< 	 dZe	dB ed< 	 dZejdB ed< 	 edd	d
Ze	dB ed< 	 dZedB ed< 	 dZedB ed< 	 eddd
Ze	dB ed< 	 dZeed< 	 edd	d
Ze	dB ed< 	 edd	d
Ze	dB ed< 	 dZedB ed< 	 dZeed< 	 dZee ed< 	 dZee  ed< 	 dZ!ee ed< 	 dZ"ee  ed< 	 dZ#e	ed< 	 dZ$e	ed < 	 d!Z%e&ed"< 	 d#Z'e&ed$< 	 d%efd&d'Z(e)d(e*d%e*fd)d*Z+d+d, Z,d-d. Z-e)d/e	dB d0e	d1e	d%e	fd2d3Z.e)de d4e	dB d5e*d%e	fd6d7Z/e)de d4e	d%e fd8d9Z0e1d:d;d%e2fd<d=Z3d>d? Z4d%efd@dAZ5d%efdBdCZ6d%efdDdEZ7dS )FSpeculativeConfigz'Configuration for speculative decoding.Nenforce_eagerr   )defaultgtnum_speculative_tokensmodelmethod   )r)   gedraft_tensor_parallel_sizetensor_parallel_sizequantizationmax_model_lenrevisioncode_revision   disable_by_batch_sizeFdisable_padded_drafter_batchprompt_lookup_maxprompt_lookup_minspeculative_token_treeparallel_draftingtarget_model_configtarget_parallel_configdraft_model_configdraft_parallel_config   suffix_decoding_max_tree_depthi'  #suffix_decoding_max_cached_requestsg      ?suffix_decoding_max_spec_factorg?suffix_decoding_min_token_probreturnc                 C   s0   g }| | jdk tt| dd }|S )a  
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        r!   F)usedforsecurity)appendr-   r   strencode	hexdigest)selffactorshash_str rO   M/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/config/speculative.pycompute_hash   s   zSpeculativeConfig.compute_hash	hf_configc                 C   sn  | j d }| jdv rd| _| jdkr!t| dd }| |dgd | jdv r)d| _| jdkr=t| dd }| |d	gd | j d d
krWd| _t| dd }| d|dgd | j d dkrpd| _t| dd }| |dgd | j d dkrd| _t| dd }| d|dgd | j d dkrd| _t| dd }| d|dgd | jdkrd| _| jdkrt| dd }| |dgd | jdkrd| _| jdkrt| dd }| |dgd | jdkrd| _| jdkrt| dd }| |dgd | jd krd!| _t| dd"}| |d#gd | jd$kr(d%| _t| dd"}| |d&gd |d'kr5| d(d)gi | S )*Nr   )deepseek_v3deepseek_v32glm_moe_dsar   num_nextn_predict_layersDeepSeekMTPModel)	n_predictarchitecturespangu_ultra_moer   OpenPanguMTPModelMiMoForCausalLMr   MiMoMTPModel)num_hidden_layersrX   rY   Glm4MoeForCausalLMr   Glm4MoeMTPModelGlm4MoeLiteForCausalLMr   Glm4MoeLiteMTPModelGlmOcrForConditionalGenerationr   GlmOcrMTPModelernie4_5_moer   ErnieMTPModel
qwen3_nextr   Qwen3NextMTP
exaone_moer   ExaoneMoeMTPlongcat_flashr   r.   LongCatFlashMTPModelstep3p5r   
Step3p5MTPMistralLarge3ForCausalLMrY   EagleMistralLarge3ForCausalLM)rY   
model_typegetattrupdate)rR   initial_architecturerX   rO   rO   rP   hf_config_override   s   

















z$SpeculativeConfig.hf_config_overridec                 C   s  | j ttv r| j dkrtd| j  d| _ | jd u r\| jd ur\| j dkrF| jd u r.td| jj	j
dkr8d| _| jj| _| jsE| jj| _n| j dv rOd| _n| j dkrXd| _ntd	| j d u rn| jd urn| jdv rnd| _ | j dv rd| _ | jd u r| jd u rd
| _d
| _n%| jd u r| jd u rtd| j| _n| jd u r| jd u rtd| j| _| j| jkrtd| j d| j | j| _| j| _| S | j dkr|   | S d| _d| _| jd urtd;i d| jddd| jjd| jjd| jjd| jjd| jjd| jjd| jjd| jd| jd| jjd| jjd| jd| jjd| jjd t j!d!| jj"| _| j d"v r?nrd#| jj# v rLd$| _ ned%| jj# v rYd%| _ nX| jj$j
d&kred&| _ nL| jj$j
d'krqd'| _ n@| jj$j
ttv rd| _ | jd(krtd) n'| jj$j
d*v rd*| _ | jd(krtd+ n| j d,krn	t%d-| j  d.| j d"v r dd/l&m'} dd0l(m)} t*| jj$||frn2|| jj$| j d$d1}|| j_$t+| jj$| j_	| j, | j_-| jj./| jj0| j\}}|| j_1|| j_2| jd urt3| jj$d2r| j| jj$_4t5| jj$d3d }|d urD| jd u r+|| _n| j|krD| j| dkrDtd4| j d5|| j6d u rXt7d6d7 t8| jD | _6nt9:| j6}t7t;|d8d9 d:| _6t <| j| j=| jj$| _=t >| j| jj| jj| j_t ?| j| j=| _| S )<Nr   z0method `%s` is deprecated and replaced with mtp.z+target_model_config must be present for mtprT   T)r"   z[ngram]r"   r&   zBnum_speculative_tokens was provided but without speculative model.   z[Either prompt_lookup_max or prompt_lookup_min must be provided when using the ngram method.zprompt_lookup_min=z must be <= prompt_lookup_max=r   r,   runnerdraft	tokenizertokenizer_modetrust_remote_codeallowed_local_media_pathallowed_media_domainsdtypeseedr4   r5   tokenizer_revisionspec_target_max_model_lenr2   r(   max_logprobshf_overridesconfig_format)r    r!   zeagle-r    r!   r#   r$   r.   zEnabling num_speculative_tokens > 1 will runmultiple times of forward on same MTP layer,which may result in lower acceptance rater   z`LongCat MTP models only have one layer. Might need some code changes to support multiple layers.r%   z!Unsupported speculative method: '')SpeculatorsConfig)EAGLEConfig)r-   rq   num_lookahead_tokensrX   znum_speculative_tokens:z  must be divisible by n_predict=c                 S   s   g | ]}|d  d qS )r.   )r   rO   ).0irO   rO   rP   
<listcomp>  s    z3SpeculativeConfig.__post_init__.<locals>.<listcomp>c                 S   s   t | | fS N)len)trO   rO   rP   <lambda>  s    z1SpeculativeConfig.__post_init__.<locals>.<lambda>)keyrO   )@r-   r   MTPModelTypesloggerwarningr,   r+   r=   
ValueErrorhf_text_configrq   r(   r2   r:   r9   r?   r>   r@   _validate_suffix_decodingr
   ry   rz   r{   r|   r}   r~   r   r4   r5   r   r3   r   r'   ru   r   lowerrR   NotImplementedErrorvllm.transformers_utils.configsr   %vllm.transformers_utils.configs.eagler   
isinstancer   get_model_arch_configmodel_arch_configregistryinspect_model_clsrY   _model_info_architecturehasattrr   rr   r;   rI   rangeastliteral_evalsorted_verify_and_get_draft_tpr0   #_maybe_override_draft_max_model_lencreate_draft_parallel_config)rL   r   r   eagle_config
model_infoarchrX   tree_choicesrO   rO   rP   __post_init__  s  	













 # 
 ! 	




		zSpeculativeConfig.__post_init__c                 C   s   t  std| jd u r| j| _td| j | jdk r%td| j d| jdk r3td| j d| jdk rAtd	| j dd| j	  krLdksVn td
| j	 dd S )NzdArctic Inference is required for suffix decoding. Install via `pip install arctic-inference==0.1.1`.z;Defaulted num_speculative_tokens to %s for suffix decoding.r.   zsuffix_decoding_max_tree_depth=z must be >= 1r   z$suffix_decoding_max_cached_requests=z must be >= 0z suffix_decoding_max_spec_factor=zsuffix_decoding_min_token_prob=z must be in [0, 1])
r   ImportErrorr+   rB   r   r   r   rC   rD   rE   rL   rO   rO   rP   r     sF   



z+SpeculativeConfig._validate_suffix_decodingspeculative_max_model_lendraft_max_model_lentarget_max_model_lenc                 C   sN   | dur"| |krt d| d|| |kr t d| d|| S t||S )a  Determine the max sequence len for the draft model. This is usually
        the draft_max_model_len, but may be the target_max_model_len if it is
        less than the draft_max_model_len, or may be speculative_max_model_len
        if it is specified.

        This is necessary so that sequences do not exceed the capacity of the
        draft model or the target model.

        speculative_max_model_len is mainly used for testing that sequences can
        skip speculation.
        Nzspeculative_max_model_len=z+ cannot be larger than draft_max_model_len=z, cannot be larger than target_max_model_len=)r   min)r   r   r   rO   rO   rP   r   2  s$   z5SpeculativeConfig._maybe_override_draft_max_model_len&speculative_draft_tensor_parallel_sizedraft_hf_configc                 C   s^   |du r|j dkrd}| jdkrtd|j  |S | j}|S |d| jfvr-td|d|S )z
        Verifies and adjusts the tensor parallel size for a draft model
        specified using speculative_draft_tensor_parallel_size.
        Nr$   r.   zV%s cannot currently be run with tp>1; setting speculative_draft_tensor_parallel_size=1z'speculative_draft_tensor_parallel_size=zB cannot be other value than 1 or target model tensor_parallel_size)rq   r1   r   r   r   )r>   r   r   rO   rO   rP   r   X  s(   


z*SpeculativeConfig._verify_and_get_draft_tpc              	   C   s&   t | j|| j| j| j| j| jd}|S )zCreate a parallel config for use by the draft worker.

        This is mostly a copy of the target parallel config, except the tp_size.
        )pipeline_parallel_sizer1   distributed_executor_backendmax_parallel_loading_workersdisable_custom_all_reduceray_workers_use_nsightplacement_group)r   r   r   r   r   r   r   )r>   r   r@   rO   rO   rP   r   {  s   	
z.SpeculativeConfig.create_draft_parallel_configafter)modec                    s    j d ur	td jd u rtd jdkr td j d jr* j j  jd ur< jdk r<td jg d} jd	kr` jr`t	 fd
d|D s`td| d jj
j    S )Nz{'tensor_parallel_size' is not a valid argument in the speculative_config. Please pass 'draft_tensor_parallel_size' instead.z}num_speculative_tokens must be provided with speculative model unless the draft model config contains an n_predict parameter.r   z9Expected num_speculative_tokens to be greater than zero (z).r6   zmExpect the batch size threshold of disabling speculative decoding is > 1, but got self.disable_by_batch_size=)llamaqwenminicpmgpt_oss
hunyuan_vlhunyuan_v1_denseafmoer!   c                 3   s    | ]
}| j jjv V  qd S r   )r=   r   rq   )r   supported_modelr   rO   rP   	<genexpr>  s
    
z1SpeculativeConfig._verify_args.<locals>.<genexpr>zEagle3 is only supported for z@ models. Got self.target_model_config.hf_text_config.model_type=)r1   r   r+   r?   verify_with_parallel_configr@   r7   r-   r=   anyr   rq   &verify_equal_vocab_size_if_draft_model)rL   eagle3_target_supportedrO   r   rP   _verify_args  sN   




zSpeculativeConfig._verify_argsc                 C   s`   | j dkr(| jd ur*| jd ur,| j }| j }||kr.td| d| dd S d S d S d S )Nr%   zUTarget and draft model should have the same vocabulary size. Target model vocab_size=z. Draft model vocab_size=zd. Using models with different tokenizers can cause out-of-bounds errors during speculative decoding.)r-   r=   r?   get_vocab_sizer   )rL   target_vocab_sizedraft_vocab_sizerO   rO   rP   r     s"   




z8SpeculativeConfig.verify_equal_vocab_size_if_draft_modelc                 C   s
   | j dv S )N)r    r!   r   r-   r   rO   rO   rP   	use_eagle     
zSpeculativeConfig.use_eaglec                 C   s
   | j dkS )Nr%   r   r   rO   rO   rP   uses_draft_model  r   z"SpeculativeConfig.uses_draft_modelc                 C   s8   | j }|dv r	d n| jj}| j}d|d|d|dS )N)r"   r&   zSpeculativeConfig(method=z, model=z, num_spec_tokens=))r-   r?   r,   r+   )rL   r-   r,   num_spec_tokensrO   rO   rP   __repr__  s   zSpeculativeConfig.__repr__)8__name__
__module____qualname____doc__r(   bool__annotations__r   r+   intr,   rI   r-   SpeculativeMethodr0   r1   r2   me_quantQuantizationMethodsr3   r4   r5   r7   r8   r9   r:   r;   r<   r=   r   r
   r>   r   r?   r@   rB   rC   rD   floatrE   rQ   staticmethodr   ru   r   r   r   r   r   r   r	   r   r   r   r   r   rO   rO   rO   rP   r'   8   s   
 e t$%"8r'   ))r   typingr   r   r   r   pydanticr   r   r   typing_extensionsr	   vllm.config.modelr
   vllm.config.parallelr   vllm.config.utilsr   vllm.loggerr   vllm.transformers_utils.configr   vllm.utils.hashingr   vllm.utils.import_utilsr   r   transformersr   'vllm.model_executor.layers.quantizationr   layersr2   r   globalsr   r   r   EagleModelTypesr   r'   rO   rO   rO   rP   <module>   sD   
