o
    ie,                    @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dl	Z	d dl
mZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZBmCZCmDZD d dlEmFZF d dlGmHZH d dlImJZJmKZK eeLZMG d d! d!ZNG d"d# d#eNZOd$e	jPd%e9d&eQe	jPe	jPf fd'd(ZRdS ))    N)replace)	find_spec)cast)CUDAGraphMode
VllmConfigget_layers_from_vllm_config)get_pp_group)set_forward_context)init_logger)AttentionLayerBase)	get_model)supports_multimodal)DeepseekV32IndexerCache)SupportsMultiModal)Eagle3LlamaForCausalLM)MULTIMODAL_REGISTRY)current_platform)triton)is_pin_memory_available)AttentionMetadataBuilderCommonAttentionMetadata)AttentionBackendEnum)TreeAttentionMetadataTreeAttentionMetadataBuilder)TritonAttentionMetadata)CudagraphDispatcher)KVCacheConfig)SamplingMetadata)_SAMPLING_EPS)SpecDecodeMetadata)PADDING_SLOT_IDcompute_new_slot_mapping#copy_and_expand_eagle_inputs_kernel"eagle_prepare_inputs_padded_kernel&eagle_prepare_next_token_padded_kernelextend_all_queries_by_N)CpuGpuBuffer)coordinate_batch_across_dp)CachedRequestState
InputBatchc                   @   s  e Zd Z	d`dedejdefddZdd Zd	d
 Z	dd Z
dd ZdefddZdedejfddZ	d`dedejdB deeejf fddZdeddfddZ			dadejdejdejdejd ejdB d!ed"ed#eeej ejf dB d$ejdB d%eeejf eeeejf  B dB dejfd&d'Zdejdejdejdejd ejdB d(ed$ejdB deeejef fd)d*Zdefd+d,Zd-eee  d.eeef d/ed0eeef dejf
d1d2Zd!ed-ejd.eeef d/ed3ejdeejejf fd4d5Zd!ed6e d7ejdeeejejf fd8d9Z!	d`d:ed;ejdejd<ejd!ed%eeejf eeeejf  B dB deej fd=d>Z"d!ed-eee  d?ee deeejf fd@dAZ#dBe$j%defdCdDZ&de$j%fdEdFZ'dGe$j%ddfdHdIZ(dJe$j%ddfdKdLZ)dJe$j%ddfdMdNZ*e+ 	O	P	dbdedQedRed%eeejf dB ddf
dSdTZ,de-fdUdVZ.defdWdXZ/dYe0ddfdZd[Z1d\ed]edeeejf fd^d_Z2dS )cSpecDecodeBaseProposerNvllm_configdevicepass_hidden_states_to_modelc                 C   sX  || _ |jd us
J |j| _| jj| _| jj| _|| _|| _|| _|jj| _|jj	| _	|j
j| _| jj| _| j | _| j | _| jj| _| jsKdn| j| _| j| jrVdnd | _| jdk| _d| _d | _| jrm|   |jj}|jj| j|  | _t| j| _t | _!| j!"|j| _#d | _$d | _%g | _&g | _'| ( | _)| j j*| _*t+| j | _,t-j.| jt-j/|d| _0| jj1| _1| j jj2| _2| jj2| _3| j1rt-j.d| jd ft-j4|d| _5n&| j2dkr| j3dkrt-j.| j2| jd ft-j4|d| _6nt-j.| jt-j4|d| _7t-j.| j| jf| j|d| _8t9|d | j}t-j||t-j/d| _| jr-| :  | ;  | <  d | _=d | _>| jrOt-j.| jft-j?|d| _=t-j.| jft-j?|d| _>t-j.| j| jf| j|d| _@tA|t-j/tB |dd| _Ct-j.| jt-j4|d| _Dd | _EtFG rddlHmI} tJ|g}tKtLjMjNd	d
rddlOmP}	 |Q|	 ddlRmS}
 |Q|
 ddlTmU} |Q| tV|| _E| jjW}|d usJ tXY|| _Zt[| jZd }dg| }| jZD ]}|t[|d   d7  < q|d g| _\|d g| _]t^d|D ]}| j\Q| j\d ||   | j]Q|| ||d    qt-jdt[| jZd |t-j/d_|d| _`d S )N   r   dtyper,      r,   r0   T)r0   
pin_memoryr,   
with_numpy)RocmAttentionMetadataF)include_classname)AiterFlashAttentionMetadata)MLACommonMetadata)FlexAttentionMetadata)ar+   speculative_configdraft_model_configmethodr-   runnerr,   model_configr0   max_model_lenparallel_configdata_parallel_rankdp_ranknum_speculative_tokensget_hidden_sizehidden_sizeget_inputs_embeds_sizeinputs_embeds_sizeparallel_draftingextra_slots_per_requestnet_num_new_slots_per_requestneeds_extra_input_slotsparallel_drafting_token_id%parallel_drafting_hidden_state_tensor_init_parallel_drafting_paramsscheduler_configmax_num_seqsmax_num_batched_tokensmax_num_tokensnparangetoken_arange_npr   mm_registrysupports_multimodal_inputssupports_mm_inputsattn_metadata_builderdraft_indexer_metadata_builderattn_layer_namesindexer_layer_names,_get_eagle3_use_aux_hidden_state_from_configeagle3_use_aux_hidden_statecompilation_configr   cudagraph_dispatchertorchzerosint32	input_ids
uses_mropeuses_xdrope_dimdraft_uses_xdrope_dimint64mrope_positionsxdrope_positions	positionshidden_statesmax'_raise_if_padded_drafter_batch_disabled_raise_if_multimodal_raise_if_mropeis_rejected_token_maskis_masked_token_maskboolinputs_embedsr&   r   backup_next_token_ids_slot_mapping_bufferallowed_attn_typesr   is_rocm$vllm.v1.attention.backends.rocm_attnr5   r   r   r   ROCM_AITER_FAget_path(vllm.v1.attention.backends.rocm_aiter_far7   append2vllm.model_executor.layers.attention.mla_attentionr8   )vllm.v1.attention.backends.flex_attentionr9   tuplespeculative_token_treeastliteral_evaltree_choiceslencu_drafts_per_levelchild_drafts_per_levelrangerepeattree_draft_pos_offsets)selfr+   r,   r-   r>   max_batch_sizemax_num_slots_for_aranger5   
rocm_typesr7   r8   r9   spec_token_tree
tree_depthnum_drafts_per_levelnodelevel r   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/spec_decode/eagle.py__init__=   s
  






















zSpecDecodeBaseProposer.__init__c                 C      | j jrtdd S )NzSpeculative Decoding with draft models or parallel drafting only supports padded drafter batch. Please unset disable_padded_drafter_batch in the speculative_config.)r;   disable_padded_drafter_batchNotImplementedErrorr   r   r   r   ro     
   z>SpecDecodeBaseProposer._raise_if_padded_drafter_batch_disabledc                 C   s   | j rtdd S )NzbSpeculative Decoding with draft models or parallel drafting does not support multimodal models yet)rY   r   r   r   r   r   rp     s
   z+SpecDecodeBaseProposer._raise_if_multimodalc                 C   r   )NzWSpeculative Decoding with draft models or parallel drafting does not support M-RoPE yet)r<   rf   r   r   r   r   r   rq   $  r   z&SpecDecodeBaseProposer._raise_if_mropec                 C   s^   | j j}t|dr|j| _nt|dr|j| _ntd| jr-tj	| j
| j| jd| _d S d S )N
pard_tokenptd_token_idztFor parallel drafting, the draft model config must have `pard_token` or `ptd_token_id` specified in its config.json.r/   )r<   	hf_confighasattrr   rM   r   
ValueErrorr-   rb   emptyrF   r0   r,   rN   )r   model_hf_configr   r   r   rO   +  s   



z5SpecDecodeBaseProposer._init_parallel_drafting_params
num_tokensc                 C   sT   | j r| jd d d |f S | jdkr#| jdkr#| jd d d |f S | jd | S Nr   )rf   rj   rg   rh   rk   rl   )r   r   r   r   r   _get_positions@  s
   z%SpecDecodeBaseProposer._get_positionsrl   c                 C   sr   | j r|| jd d d |f< d S | jdkr'| jdkr'|| jd d d |f< d S | jjj r0|d }|| jd |< d S r   )rf   rj   rg   rh   rk   r+   r?   rl   )r   r   rl   r   r   r   _set_positionsG  s   
z%SpecDecodeBaseProposer._set_positionsslot_mappingreturnc                    sj   |dur!|j d }| jd| | ||kr!| j|| t | jd|   fdd| j| j D S )zzReturn slot_mapping dict for EAGLE layers.

        If slot_mapping is provided, copies it into the buffer first.
        Nr   c                    s   i | ]}| qS r   r   ).0nameviewr   r   
<dictcomp>d  s    z<SpecDecodeBaseProposer._get_slot_mapping.<locals>.<dictcomp>)shaperw   copy_fill_r    r\   r]   )r   r   r   
num_actualr   r   r   _get_slot_mappingT  s   	
z(SpecDecodeBaseProposer._get_slot_mappingcudagraph_modec                 C   s:   | j js| tjtjfv rtj}ntj}| j| dS )zInitialize cudagraph dispatcher keys for eagle.

        Eagle only supports PIECEWISE cudagraphs (via mixed_mode).
        This should be called after adjust_cudagraph_sizes_for_spec_decode.
        N)	r;   enforce_eager
mixed_moder   	PIECEWISEFULLNONEra   initialize_cudagraph_keys)r   r   eagle_cudagraph_moder   r   r   r   f  s   
z0SpecDecodeBaseProposer.initialize_cudagraph_keystarget_token_idstarget_positionstarget_hidden_statesnext_token_idstoken_indices_to_samplecommon_attn_metadatasampling_metadatamm_embed_inputsnum_rejected_tokens_gpuslot_mappingsc           -      C   s  |  }| jdkr!t| jtsJ | j|}|jd | jks!J | j|||||||	d\}}}| j	d us7J | j
d u rA|  }n| j
}|j|dd}| jrW| jj|dd}nd }i }| jD ]}|||< q^| jD ]}|d uspJ |||< qh| j||d\}}| j|\}}|j}|d ur||| j< | jr|pd\}}| jj| jd | ||d| jd |< d }| jd | }n	| jd | }d }|| ||d	}| jr| jd | |d
< t|| j|||| ||jd | jdi |}|   s|}|}n|\}}W d    n	1 sw   Y  || }| j!|} | j"dks | j#r-| j$dd}!|!%d| j"S | j&r;| j'd d |f }"n| j(| }"| jdv rL| j| }n|| }t|t)rh| j*|| |"|||
d}#t+j,|#ddS | j$dd}!| j-d urt|| j-st.dt/| d| j- |!g}#| j||d\}$}%| j|$\}}|j}&|%d ur|&|%| j< ||_0d|_1| j2d |d  |_3t+4| j5d |d  6 |_7| j"dkr|	d ur| j8|	8  _8d |_9d |_:t;| j"d D ]^}'|#d < }| j&r|"d7 }"|"d | j=k}(t+>|(?dt+@|"|"})n|"d7 }"|"| j=k}(t+>|(d|"})| j8d7  _8|j8A|(d tB|jCd | j=|_C|j9d urB| j9d7  _9|j:d urO| j:d7  _:|jDjE}*| j&r^|)d |* }+n|)|* }+|jFjGd|+%ddd},|,%d},| j&r|,|* |)d |*  |_n	|,|* |)|*  |_|jA|(tH |j||'d d}| jD ]}|||< q|| jd |< | I||) || jd |< | jr| j|| jd |< d }| jd |& }n	| jd |& }d }|| |&|d	}| jr| jd |& |d
< t|| j|&|%|| |&|jd | jdi |}|   s|}|}n|\}}W d    n	1 s%w   Y  |d | }| j!|d | } | j$dd}!|#J|! qt+jK|#dd}!|!S )Neagle3r:   )r   r   r   r   r   cadr   r   r   draft_indexnum_tokens_unpaddednum_tokens_padded)NN)multimodal_embeddingsis_multimodalre   rl   ru   rm   r   num_tokens_across_dpcudagraph_runtime_moder   r.   dim)deepseek_mtp	ernie_mtplongcat_flash_mtppangu_ultra_moe_mtp)
batch_sizelogitsrl   rm   r   r   z^Unsupported attention metadata type for speculative decoding with num_speculative_tokens > 1: z. Supported types are: r   indexr   )Lr   r=   
isinstancemodelr   combine_hidden_statesr   rF   set_inputs_first_passr>   rZ   _get_attention_metadata_builderbuild_for_draftingr[   r\   r]   _pad_batch_across_dpra   dispatchr   rC   rY   embed_input_idsre   ru   r   r-   rm   r	   r+   r   r   model_returns_tuplecompute_logitsrD   rI   argmaxr   rf   rj   rl   r   propose_treerb   catrx   r   typenum_actual_tokensmax_query_lenrU   query_start_loc
from_numpyrV   clonequery_start_loc_cpuseq_lens_seq_lens_cpu_num_computed_tokens_cpur   intr@   where	unsqueeze
zeros_likemasked_fill_minmax_seq_lenkv_cache_spec
block_sizeblock_table_tensorgatherr    r   r~   stack)-r   r   r   r   r   r   r   r   r   r   r   r   r   rZ   attn_metadatadraft_indexer_metadataper_layer_attn_metadata
layer_namenum_tokens_dp_paddedr   r   
batch_descnum_input_tokens	mm_embedsis_mm_embedre   ru   model_kwargsret_hidden_stateslast_hidden_statesrm   sample_hidden_statesr   draft_token_idsrl   draft_token_ids_listbatch_size_dp_paddedbatch_size_across_dpinput_batch_sizetoken_indexexceeds_max_model_lenclamped_positionsr   block_numbers	block_idsr   r   r   proposew  s  










	


	




zSpecDecodeBaseProposer.proposer   c                 C   s  | j sG|d u r|jdd  d }|jd }|dd  | jd |d < || j|< | jdkr5| jdkr5|d }| || || jd |< |||fS | jd usNJ | j	d usUJ |
 }	|j| j }
tdt|
}|
| d | }|jd }|| j|	  }tj|	| j tj| jd}tj|tj| jd}|	|f}|j}|jdd  d }|d ur|| }t| di d|d|d|d| jd	| jd
| jd| j	d|d|d|d|ddd| jd|d| jd| jd| | jr| jd usJ || j|< | j	d | }tj|d| j| jd | | jd | d | jd u r#|  n| j}t|| jd | | jd | |jj | j| j!d}t"|| j| j#|d}|||fS )Nr.   r      r/   target_token_ids_ptrtarget_positions_ptrnext_token_ids_ptrout_input_ids_ptrout_positions_ptrout_is_rejected_token_mask_ptrout_is_masked_token_mask_ptrout_new_token_indices_ptrout_hidden_state_mapping_ptrquery_start_loc_ptrquery_end_loc_ptrpadding_token_idrM   total_input_tokensnum_padding_slots_per_requestshift_input_idsBLOCK_SIZE_TOKENSout)r   new_positionsrr   r   num_new_tokensr@   )NrU   new_slot_mappingr   )$rL   r   r   re   rg   rh   r   rm   rr   rs   r   r   rK   r   r   next_power_of_2rb   r   rJ   rd   r,   r"   rl   rM   r-   rN   r   r   rZ   r   r!   r   r   r@   r%   rU   )r   r   r   r   r   r   r   r   r   r   max_num_tokens_per_requestr)  
num_blockstotal_num_input_tokenstotal_num_output_tokensout_hidden_state_mappinggridr   query_end_locmaskbuilderr/  new_cadr   r   r   r     s   








	


z,SpecDecodeBaseProposer.set_inputs_first_passc                 C   s
   | j dvS )N)mtpdraft_model)r=   r   r   r   r   r   K  s   
z*SpecDecodeBaseProposer.model_returns_tuplesampled_token_idsrequestsgpu_input_batchnum_scheduled_tokensc                 C   sv   |j }g }t|D ]$\}}|r|d }	n|| }
||
 }|j||
  }||}	||	 q	tj|tj| jj	d}|S )aj  
        This function is used to prepare the inputs for speculative decoding.
        It calculates the next token ids for each request based on the sampled
        token ids from the CPU. If a request has no sampled token ids (e.g.,
        during the initial decoding steps), it falls back to using the request
        state to get the next token id.
        r:   r/   )
req_ids	enumeratenum_computed_tokensget_token_idr~   rb   tensorrd   re   r,   )r   r=  r>  r?  r@  rA  r   i	token_idsnext_token_idreq_id	req_stateseq_lenr   r   r   prepare_next_token_ids_cpuN  s   

z1SpecDecodeBaseProposer.prepare_next_token_ids_cpudiscard_request_maskc                    s   j }tj fddt|D tjd| jjd|< | j| | jj}|j\}}	|j	}
|j
tjks6J |j
tjks>J tj|tj|
d}||}|f}t|	}t| |||||j|	||d|d
 ||fS )a  
        This function is used to prepare the inputs for speculative decoding.
        It calculates the next token ids and the number of valid sampled tokens
        for each request, considering the "discarded" requests whose next token
        is not sampled and comes from `request.get_token_id()` instead. This is denoted
        the "backup" token id. It also counts rejected tokens via `sampled_token_ids`.
        c                    s*   g | ]}j |   j|  qS r   )rA  rD  seq_lens_cpuitem)r   rF  r   r?  r>  r   r   
<listcomp>  s    zHSpecDecodeBaseProposer.prepare_next_token_ids_padded.<locals>.<listcomp>r0   Nr/   r   )r)  )num_reqsrT   arrayr   rd   rv   copy_to_gpugpur   r,   r0   rb   rt   r   	new_emptyr   r0  r$   
vocab_sizestride)r   r   r=  r>  r?  rM  rS  backup_tokens_gpur   r   r,   r   valid_sampled_tokens_countr6  r)  r   rP  r   prepare_next_token_ids_paddedo  s<   	


z4SpecDecodeBaseProposer.prepare_next_token_ids_paddedspec_decode_metadatar[  c                 C   s   |j }|j}tj|ftj|d}tj|ftj|d}|f}t| |j||j||| |j}	|	dd |	dd  }
|	d 	 }t
|j|j|	|j|j|j ||
 	 |j 	 |j|jd| d|jd}|||fS )a  
        This function is used to prepare the inputs for speculative decoding
        It updates the common_attn_metadata for speculative decoding,
        but does not consider the rejected tokens. Instead, all tokens
        are included as inputs to the speculator, with the rejected tokens
        used as padding and filtered out later by `token_indices_to_sample`.
        No blocking CPU operations should be introduced in this function.
        r/   r.   Nr:   Tr   r   r   r   r   rS  r   r   r   r   r   causaldcp_local_seq_lens)rS  r,   rb   r   rd   r#   cu_num_draft_tokensr   r   rO  r   r   r   r   rn   rN  r   r   r`  )r   r   r]  r[  rS  r,   r   r   r6  r   new_query_len_per_reqtotal_num_tokensspec_common_attn_metadatar   r   r   prepare_inputs_padded  sN   

	
z,SpecDecodeBaseProposer.prepare_inputs_paddedr   r   rm   c           &   
   C   s&  | j jd d  }t|tsJ | jd }|}	| jd }
|
dkr,|jdd|d}nt	j
||
ddj|d}|g}||dd}t	jd| jj| jjd}t	jd| jj| jjd}t	jd| jj| jjd}||d| jd |d d f  }t| j}t|d D ]}||d  }|| | jk}t	|d||d}|	dkr|j|	dd}|
dkr|j|
dd}t	j||gdd}t	j||gdd}t	j||gdd}|}t||| jd |d   |j|	 || |d}|j||d d}i }| jD ]}|||< qt|j| j|_|j |d |j!j"}|d d ||| f }|| }|j#j$d|d}|| ||  }t%||< |d|_&|j'}|d} | | jd |< |d| jd |< ||d| jd |< | j()|\}!}"|"j*}#t+|| j,|#|!| -|#|j&d	" | j.| jd |# | jd |# | jd |# d d
\}$}W d    n	1 sw   Y  |d | ||dd d |	 d f }|$d | ||dd d |	 d f }%| j./|%0||	 d}| j|d  }
|
dkr|jdd|d}nt	j
||
ddj|d}|1| | j|d  | }	| j|d  }q|S )Nr   r.   r:   r   r2   )r   r   r   r   r   r   )r   r   r   )re   rl   rm   ru   )2r>   attn_groupsget_metadata_builderr   r   r   r   r   r   rb   topkindicesr   re   r,   r0   rl   rm   r   r   r   r@   r   repeat_interleaver   r   rU   r   r   r\   r   r   r   r   r   block_tabler   r    r   r   ra   r   r   r	   r+   r   r   r   reshaper~   )&r   r   r   rl   rm   r   r   tree_attn_metadata_buildertotal_num_draftslevel_num_draftsnum_childrenr  r  draft_hidden_statestree_input_idstree_positionstree_hidden_statesflattened_draft_positionsr   r   draft_positionsr  	query_lenr  r  r  r   query_positionsr  r  r   r   re   r   r  r  r  draft_last_hidden_statesr   r   r   r     s   


 



	

z#SpecDecodeBaseProposer.propose_treenum_draft_tokensc                    sX   fddt |D }tj|tjd}|jj}|j}|j| }|dd |dd  }|| }	|	 }
tj	|j
tjt d}| }tj|
|dd d |d }t|dd |
}| jd| | }t|dd  |
}|| }t|j|d	d
}t|j|d	d
|j|d	d
|||j|j||  |  |j|j| d	|jd}||fS )a+  
        This function is used to prepare the inputs for speculative decoding.
        It updates to the common_attn_metadata to account for the rejected
        tokens (and newly sampled tokens). It also returns the token indices
        of the tokens that should be fed to the speculator.
        c                    s0   g | ]\}}|d kr|d t  |  nd qS )r   r.   )r   )r   rF  nr=  r   r   rQ    s    z9SpecDecodeBaseProposer.prepare_inputs.<locals>.<listcomp>rR  r.   Nr:   )r0   r3   r*  T)non_blockingr^  )rB  rb   rE  rd   r   r,   r   rN  numpyrc   r   r   rT   cumsumr   rV   r   tor   r   rS  rn   rO  r   r   r`  )r   r   r=  rz  num_rejected_tokensr,   r   new_seq_lens_cpurb  new_num_tokens_per_reqnew_num_tokens_per_req_npnew_query_start_loc_cpunew_query_start_loc_nprc  new_query_start_locs_expandedtoken_offsetsold_query_start_locs_expandedtoken_indices_nptoken_indicesrd  r   r|  r   prepare_inputs  sX   



z%SpecDecodeBaseProposer.prepare_inputsr   c                 C   s   t |dr|j}|jjS )Nmodule)r   r  	__class____name__)r   r   r   r   r   get_model_name  s   
z%SpecDecodeBaseProposer.get_model_namec                 C   sL   ddl m} |d t| j| jjd}W d   |S 1 sw   Y  |S )z
        Default method to call get_model(). Can be overridden by subclasses which
        need to customize model loading.
        r   )set_model_tag
eagle_head)r+   r?   N)vllm.compilation.backendsr  r   r+   r;   r<   )r   r  r   r   r   r   
_get_model  s   

z!SpecDecodeBaseProposer._get_modeltarget_modelc           
   
   C   s   t t| jt }t t| jt }|  | _t| jt | }t| jt}| | }t|| | _	t|| _
| j
r[| j
d }||   || | j| j
| j| j| _nd | _| jrztjdgg| jjd}| jj|d d W n tttfy   td d| _Y nw t|rt|dsJ | |dv r|jj| jj_n| |d	kr|jj j| jj_n|jj| jj_t!t"|# }	n|}	| $|	 | %|	 | j&r| j'r| j(d usJ | j()| j*r| j+| jj,-d
| j. n| jj,-| j. d S d S d S )Nr   r.   )r,   )r   zNDraft model does not support multimodal inputs, falling back to text-only modeFconfig)"Qwen2_5_VLForConditionalGenerationQwen3VLForConditionalGeneration"Qwen3VLMoeForConditionalGeneration!HunYuanVLForConditionalGenerationGlmOcrForConditionalGenerationPixtralForConditionalGenerationr1   )/setr   r+   r   keysr   r  r   listr\   r]   get_attn_backendget_builder_clsget_kv_cache_specr,   r[   rY   rb   rE  re   r   r   AttributeError	TypeErrorloggerwarningr   r   r  r  image_token_idimage_token_indexvision_configr   r   get_language_model_maybe_share_embeddings_maybe_share_lm_headrI   r-   rN   r   r_   r   mask_hiddenr   rF   )
r   r  target_attn_layer_namestarget_indexer_layer_namesdraft_attn_layer_namesindexer_layersdraft_indexer_layer_namesfirst_layerdummy_input_idstarget_language_modelr   r   r   
load_model  s   






z!SpecDecodeBaseProposer.load_modelr  c                 C   s   t  jdkrt|dd}|du rtdt|dr|j}nt|dr&|j}ntdd}t| jd	rl| jjs>d
}t	
d n5t|jtjrft| jjjjtjrft|j | jjjj rfd
}t	
d nt	
d nd
}t	
d |rt| jjdr| jj`|| jj_dS dS t	
d dS )a  
        Some draft models may not have their own embedding layers, and some may
        have a duplicate copy of the target model's embedding layers. In these cases,
        we share the target model's embedding layers with the draft model to save
        memory.
        r.   r   Nz,Target model does not have 'model' attributeembed_tokens	embeddingzBTarget model does not have 'embed_tokens' or 'embedding' attributeFhas_own_embed_tokensTzDetected EAGLE model without its own embed_tokens in the checkpoint. Sharing target model embedding weights with the draft model.zDetected EAGLE model with embed_tokens identical to the target model. Sharing target model embedding weights with the draft model.zrDetected EAGLE model with distinct embed_tokens weights. Keeping separate embedding weights from the target model.zPDetected MTP model. Sharing target model embedding weights with the draft model.zRThe draft model's vocab embedding will be loaded separately from the target model.)r   
world_sizegetattrr  r   r  r  r   r  r  infor   weightrb   Tensorequalcpu)r   r  inner_modeltarget_embed_tokensshare_embeddingsr   r   r   r  l  s\   


z.SpecDecodeBaseProposer._maybe_share_embeddingsc                 C   sV  d}t | jdrG| jjsd}td n:t |drAt|jjtj	rAt| jjjtj	rAt
|jj | jjj rAd}td ntd nd}td |rt |drt | jdr^| j`|j| j_t| jd	d
}|rrt|dd
nd
}|d
urt|tjr| n|}|D ]"}t|dd
}|d
urt |dr|`|j|_td qd
S d
S d
S d
S )z
        Some draft models may not have their own LM head, and some may have a
        duplicate copy of the target model's LM head. In these cases, we share
        the target model's LM head with the draft model to save memory.
        Fhas_own_lm_headTzzDetected EAGLE model without its own lm_head in the checkpoint. Sharing target model lm_head weights with the draft model.lm_headz{Detected EAGLE model with lm_head identical to the target model. Sharing target model lm_head weights with the draft model.zkDetected EAGLE model with distinct lm_head weights. Keeping separate lm_head weights from the target model.zNDetected MTP model. Sharing target model lm_head weights with the draft model.r   Nlayersshared_headheadz6Shared target model lm_head with MTP shared_head.head.)r   r   r  r  r  r   r  r  rb   r  r  r  r  nn
ModuleDictvaluesr  )r   r  share_lm_headinnerr  itemslayershr   r   r   r    sb   

z+SpecDecodeBaseProposer._maybe_share_lm_headTFuse_cudagraphsis_graph_capturingc              	   C   s<  t |s| jndD ]}|dkr4| j||d\}}|r&| j|\}}	|	j}
ntj}|}
|d ur4|
|| j< | j	rH|d urH| j	d |v rH| 
|
}n|pKi }td | j|
|||d< | jred }| jd |
 }n	| jd |
 }d }t|| |
|d}| jr| jd |
 |d< | jdi | W d    n1 sw   Y  q	d S )Nr.   r   r   r   r   rm   r   )r   rD   r   ra   r   r   r   r   rC   r\   r   r	   r+   rY   ru   re   dictr   r-   rm   r   )r   r   r  r  r   fwd_idxr  r   r   r  r  slot_mapping_dictre   ru   kwargsr   r   r   	dummy_run  sZ   



z SpecDecodeBaseProposer.dummy_runc                 C   s\   d}| j d }| jjD ]}|D ]}||jv r| } nq|dur# nq|dus,J d|S )zFind and return the attention metadata builders for EAGLE layers.

        Returns:
            The metadata builders for EAGLE layers.

        Raises:
            AssertionError: If no metadata builders are found for EAGLE layers.
        Nr   z;Failed to find attention metadata builder for EAGLE layers.)r\   r>   rf  layer_namesrg  )r   r9  chosen_layerkv_cache_group
attn_groupr   r   r   r   2  s   	


z6SpecDecodeBaseProposer._get_attention_metadata_builderc                 C   s:   | j dkrdS d}t| jjdd}|dur|dd}|S )a5  
        Some eagle3 heads (e.g., nvidia/gpt-oss-120b-Eagle3-v2) do not use auxiliary
        hidden states and directly uses the last layer output just like eagle1.
        They might indicate this by setting "use_aux_hidden_state" to False
        inside the "eagle_config" dict of their hf_config.
        r   FTeagle_configNuse_aux_hidden_state)r=   r  r<   r   get)r   r  r  r   r   r   r^   K  s   
zCSpecDecodeBaseProposer._get_eagle3_use_aux_hidden_state_from_configkv_cache_configc                    sX   i  t |jD ]\}}|jD ]}| |< qqtt fdd| jD dks*J ddS )z
        Validate that all drafting layers belong to the same KVCacheGroup.
        Need this assumption to ensure all drafting layers can use the
        same AttentionMetadata.
        May extend to multiple AttentionMetadata in the future.
        c                    s   g | ]} | qS r   r   )r   r  kv_cache_groupsr   r   rQ  i  s    zGSpecDecodeBaseProposer.validate_same_kv_cache_group.<locals>.<listcomp>r.   z<All drafting layers should belong to the same kv cache groupN)rB  r  r  r   r  r\   )r   r  idr  r  r   r  r   validate_same_kv_cache_group[  s    


	z3SpecDecodeBaseProposer.validate_same_kv_cache_groupr   r   c              	   C   s\   t || jjd| jjtjk|d d d\}}}|rJ d|}|d ur*t|| j 	 }||fS )NF)r   rA   allow_microbatchingallow_dp_paddingr   uniform_decode num_scheduled_tokens_per_requestz'DBO ubatching not implemented for EAGLE)
r'   r+   rA   ra   r   r   r   r   rC   rO  )r   r   r   should_ubatchnum_toks_across_dp_r  r   r   r   r   r  s    
z+SpecDecodeBaseProposer._pad_batch_across_dpN)NNN)TFN)3r  
__module____qualname__r   rb   r,   rt   r   ro   rp   rq   rO   r   r   r  r   r  strr   r   r   r   r   r   r  r  r   r   r(   r)   rL  r\  r   re  r   r  r  Moduler  r  r  r  r  inference_moder  r   r   r^   r   r  r   r   r   r   r   r*   <   sR   
 Y
	

  K	
 



!

:
I	

 .

f`EC=r*   c                       s,   e Zd Z	ddedejf fddZ  ZS )EagleProposerNr+   r,   c                    s   t  j||d|d d S )NT)r-   r>   )superr   )r   r+   r,   r>   r  r   r   r     s   
zEagleProposer.__init__r  )r  r  r  r   rb   r,   r   __classcell__r   r   r  r   r    s    r  r   r   r   c                 C   s   |j r| }| jdd}||fS |jd usJ |j}|js'|tk }t|d|}| |dd | j	dtj
d}t|}|  ||jddd}|js]|jdd}t|||}||fS )Nr:   r   g      ?r.   )r   r0   )
all_greedyr   temperature
all_randomr   rb   r   div_r   softmaxfloat32
empty_likeexponential_div)r   r   probsr   r  	is_greedyqgreedy_token_idsr   r   r   #compute_probs_and_sample_next_token  s$   
r  )Sr   dataclassesr   importlib.utilr   typingr   r~  rT   rb   torch.nnr  vllm.configr   r   r   vllm.distributed.parallel_stater   vllm.forward_contextr	   vllm.loggerr
   /vllm.model_executor.layers.attention_layer_baser    vllm.model_executor.model_loaderr   vllm.model_executor.modelsr   &vllm.model_executor.models.deepseek_v2r   %vllm.model_executor.models.interfacesr   'vllm.model_executor.models.llama_eagle3r   vllm.multimodalr   vllm.platformsr   vllm.triton_utilsr   vllm.utils.platform_utilsr   vllm.v1.attention.backendr   r   #vllm.v1.attention.backends.registryr   $vllm.v1.attention.backends.tree_attnr   r   &vllm.v1.attention.backends.triton_attnr   vllm.v1.cudagraph_dispatcherr   vllm.v1.kv_cache_interfacer   vllm.v1.sample.metadatar   vllm.v1.sample.samplerr   vllm.v1.spec_decode.metadatar   vllm.v1.spec_decode.utilsr    r!   r"   r#   r$   r%   vllm.v1.utilsr&   vllm.v1.worker.dp_utilsr'   vllm.v1.worker.gpu_input_batchr(   r)   r  r  r*   r  r  r   r  r   r   r   r   <module>   sp                Z