۾i3                    @   s   U d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZmZ d dlmZ d dl mZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d d	lmZ d dlm Z  d d
l!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z> d dl?m@Z@mAZA d dlBmCZC d dlDmEZEmFZF d dlGmHZHmIZI d dlJmKZK d dlLmMZM d dlNmOZOmPZP d dlQmRZRmSZS d dlTmUZUmVZV d dlWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZb d dlcmdZdmeZemfZf d dlgmhZh d dlimjZj d d lkmlZlmmZmmnZn d d!lompZp d d"lqmrZr d d#lsmtZt d d$lumvZv d d%lwmxZxmyZymzZz d d&l{m|Z| d d'l}m~Z~ d d(lmZ d d)lmZmZ d d*lmZmZ d d+lmZ d d,lmZ d d-lmZmZ d d.lmZmZmZmZmZmZmZ d d/lmZ d d0lmZmZmZ d d1lmZ d d2lmZ d d3lmZmZmZmZmZmZmZmZmZmZmZ d d4lmZmZmZmZmZmZmZmZmZmZmZ d d5lmZmZ d d6lmZmZ d d7lmZ d d8lmZ d d9lmZ d d:lmZ d d;lmZ d d<lmZ d d=lmZ d d>lmZ d d?lmZ d d@lmZ d dAlmZmZ d dBlmZ d dClmZmZ d dDlmZ d dElmZ d dFlmZmZ d dGlmZ d dHlmZ d dIlmZ d dJlmZmZmZmZ d dKlmZ d dLlmZ dMdNlmZmZmZmZ erd dOlmZ d dPlmZmZ d dQlmZ eCeZeeef Zee dR< ee eB Zee dS< G dTdU dUeZG dVdW dWeZG dXdY dYeZG dZd[ d[eee݃ZeG d\d] d]ZdS )^    N)defaultdict)IterableIteratorSequence)contextmanager)copydeepcopy)	dataclass)reduce)TYPE_CHECKINGAny
NamedTuple	TypeAliascast)tqdm)compilation_counter)CUDAGraphStatCUDAGraphWrapper)set_cudagraph_capturing_enabled)CompilationModeCUDAGraphMode
VllmConfigget_layers_from_vllm_configupdate_config)get_ec_transferhas_ec_transfer)	EplbState)get_kv_transfer_grouphas_kv_transfer_group)copy_kv_blocks)get_dcp_groupget_pp_groupget_tp_groupgraph_captureis_global_first_rank&prepare_communication_buffer_for_model)BatchDescriptorset_forward_context)init_logger)LoRAMappingLoRAMappingType)	AttentionMLAAttention)AttentionLayerBase)RoutedExpertsCapturer)MRotaryEmbeddingXDRotaryEmbedding)TensorizerLoaderget_model_loader)finalize_layerwise_reloadinitialize_layerwise_reload)MultiModalEmbeddingsSupportsMRoPESupportsMultiModalSupportsXDRoPEis_mixture_of_expertssupports_eagle3supports_mropesupports_multimodal_pruningsupports_realtimesupports_transcriptionsupports_xdrope)VllmModelForPoolingis_pooling_modelis_text_generation_model)MULTIMODAL_REGISTRY)MultiModalBudget)BatchedTensorInputsMultiModalKwargsItemPlaceholderRange)group_mm_kwargs_by_modality)PoolingParams)SamplingType)IntermediateTensors)GenerationTaskPoolingTaskSupportedTask)
instrument)&length_from_prompt_token_ids_or_embeds)json_map_leaves)cdivround_up)DeviceMemoryProfiler
format_gib)PytHooks)is_pin_memory_available)get_dtype_sizekv_cache_dtype_str_to_dtype)AttentionBackendAttentionCGSupportAttentionMetadataAttentionMetadataBuilderAttentionTypeCommonAttentionMetadata
MultipleOf)GDNAttentionMetadataBuilder)"create_fast_prefill_custom_backendget_dcp_local_seq_lens+reorder_batch_to_split_decodes_and_prefills)NewRequestData)CudagraphDispatcher)AttentionSpecChunkedLocalAttentionSpecCrossAttentionSpecEncoderOnlyAttentionSpecFullAttentionSpecKVCacheConfigKVCacheGroupSpecKVCacheSpec	MambaSpecSlidingWindowSpecUniformTypeKVCacheSpecs)EMPTY_MODEL_RUNNER_OUTPUTAsyncModelRunnerOutputDraftTokenIdsECConnectorOutputKVConnectorOutputLogprobsListsLogprobsTensorsModelRunnerOutputPoolerOutputSamplerOutput&make_empty_encoder_model_runner_output)PoolingMetadataPoolingStates)LogitsProcessorsbuild_logitsprocs)LogitsProcessor)SamplingMetadata)RejectionSampler)Sampler)DraftModelProposer)EagleProposer)MedusaProposer)SpecDecodeMetadata)SuffixDecodingProposer)apply_grammar_bitmask)CpuGpuBufferrecord_function_or_nullcontext)mamba_utils) check_attention_cp_compatibilityget_total_cp_world_size)coordinate_batch_across_dp)ECConnectorModelRunnerMixin)CachedRequestState
InputBatch)UBatchWrapper)KVConnectorModelRunnerMixin)LoRAModelRunnerMixin)UBatchSlicescheck_ubatch_thresholdsmaybe_create_ubatch_slicessplit_attn_metadata)is_residual_scattered_for_sp)lock_workspace   )AttentionGroup(add_kv_sharing_layers_to_kv_cache_groupsbind_kv_cachesanity_check_mm_encoder_outputs)TensorizerConfig)GrammarOutputSchedulerOutputNgramProposerAttnMetadataDictPerLayerAttnMetadatac                   @   sJ   e Zd ZdedejdedB dee dej	j
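# Illustrative shapes of the two alias forms above (hypothetical layer names,
# not taken from any real model):
#
#   single batch:  {"model.layers.0.attn": metadata, "model.layers.1.attn": metadata}
#   two ubatches:  [{"model.layers.0.attn": metadata_ubatch0, ...},
#                   {"model.layers.0.attn": metadata_ubatch1, ...}]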
defdd	Zd
efddZdS )AsyncGPUModelRunnerOutputmodel_runner_outputsampled_token_idslogprobs_tensorsNinvalid_req_indicesasync_output_copy_stream
vocab_sizec                 C   s   || _ || _t | _|| _|| _|| _tj	 }tj
|' || | jjddd| _| jr6| j nd | _| j  W d    d S 1 sIw   Y  d S NcpuTnon_blocking)_model_runner_output_invalid_req_indicestorchEventasync_copy_ready_event_sampled_token_idsr   _logprobs_tensorscudacurrent_streamstreamwait_streamtosampled_token_ids_cputo_cpu_nonblocking_logprobs_tensors_cpurecord)selfr   r   r   r   r   r   default_stream r   S/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py__init__   s$   	


"z"AsyncGPUModelRunnerOutput.__init__returnc                 C   s   | j jd }| j  | `| `|dkr1| j  }| jD ]}||   qd}| j	dur0| j	
 }ntj| j | j| j| j	d\}}| j}||_||_|S )zCopy the device tensors to the host and return a ModelRunnerOutput.

        This function blocks until the copy is finished.
        r   Nr   )r   shaper   synchronizer   r   tolistr   clearr   tolistsr   parse_outputr   r   r   logprobs)r   max_gen_lenvalid_sampled_token_idsilogprobs_listsoutputr   r   r   
get_output   s,   





z$AsyncGPUModelRunnerOutput.get_output)__name__
__module____qualname__ry   r   Tensorrx   listintr   Streamr   r   r   r   r   r   r      s    
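# The class above defers the device-to-host copy of the sampled tokens: the
# copy is launched on a dedicated CUDA stream and only waited on when
# get_output() is called.  A minimal, self-contained sketch of that pattern
# (illustrative only, not part of the runner; the function name and sizes are
# made up for the example):
def _example_deferred_d2h_copy() -> list[int]:
    if not torch.cuda.is_available():
        return []
    sampled = torch.randint(0, 32_000, (8, 1), device="cuda")
    default_stream = torch.cuda.current_stream()
    copy_stream = torch.cuda.Stream()
    ready = torch.cuda.Event()
    with torch.cuda.stream(copy_stream):
        # The copy must wait for whatever produced `sampled` on the default
        # stream before it starts.
        copy_stream.wait_stream(default_stream)
        sampled_cpu = sampled.to("cpu", non_blocking=True)
        ready.record()
    # ...unrelated host-side work can proceed here...
    ready.synchronize()  # block only once the host values are actually needed
    return sampled_cpu.flatten().tolist()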
class AsyncGPUPoolingModelRunnerOutput(AsyncModelRunnerOutput):
    def __init__(
        self,
        model_runner_output: ModelRunnerOutput,
        raw_pooler_output: PoolerOutput,
        finished_mask: list[bool],
        async_output_copy_stream: torch.cuda.Stream,
    ):
        self._model_runner_output = model_runner_output
        self.async_copy_ready_event = torch.cuda.Event()
        self._raw_pooler_output = raw_pooler_output

        # Copy every pooled tensor to the host on the side stream, then keep
        # only the outputs of requests that actually finished.
        default_stream = torch.cuda.current_stream()
        with torch.cuda.stream(async_output_copy_stream):
            async_output_copy_stream.wait_stream(default_stream)
            raw_pooler_output_cpu = json_map_leaves(
                lambda x: None if x is None else x.to("cpu", non_blocking=True),
                self._raw_pooler_output,
            )
            self.async_copy_ready_event.record()
            self._model_runner_output.pooler_output = [
                out if include else None
                for out, include in zip(raw_pooler_output_cpu, finished_mask)
            ]

    def get_output(self) -> ModelRunnerOutput:
        """Copy the device tensors to the host and return a ModelRunnerOutput.

        This function blocks until the copy is finished.
        """
        self.async_copy_ready_event.synchronize()
        del self._raw_pooler_output
        return self._model_runner_output
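# The ExecuteModelState dataclass below carries forward-pass results across the
# split between execute_model() and a later sample_tokens() call when sampling
# is deferred (execute_model() returns None and stashes its state).  Sketch of
# that two-phase shape (hypothetical call sites, not the engine's real control
# flow):
#
#     out = runner.execute_model(scheduler_output)    # may return None
#     if out is None:
#         out = runner.sample_tokens(grammar_output)  # consumes the stashed state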
@dataclass
class ExecuteModelState:
    """Ephemeral cached state transferred between execute_model() and
    sample_tokens(), after execute_model() returns None."""

    scheduler_output: SchedulerOutput
    logits: torch.Tensor
    spec_decode_metadata: SpecDecodeMetadata | None
    spec_decode_common_attn_metadata: CommonAttentionMetadata | None
    hidden_states: torch.Tensor
    sample_hidden_states: torch.Tensor
    aux_hidden_states: list[torch.Tensor] | None
    ec_connector_output: ECConnectorOutput | None
    cudagraph_stats: CUDAGraphStat | None
    slot_mappings: dict[str, torch.Tensor] | list[dict[str, torch.Tensor]] | None


class GPUModelRunner(
    LoRAModelRunnerMixin, KVConnectorModelRunnerMixin, ECConnectorModelRunnerMixin
):
    def __init__(self, vllm_config: VllmConfig, device: torch.device):
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.compilation_config = vllm_config.compilation_config
        self.lora_config = vllm_config.lora_config
        self.load_config = vllm_config.load_config
        self.parallel_config = vllm_config.parallel_config
        self.scheduler_config = vllm_config.scheduler_config
        self.speculative_config = vllm_config.speculative_config
        self.observability_config = vllm_config.observability_config
        self.device = device
        # The remainder of the constructor (input batch, CPU/GPU staging
        # buffers, spec-decode drafter, CUDA graph dispatcher, multimodal
        # budget, LoRA and connector state, etc.) is not recoverable from the
        # dump.
        ...

    def update_max_model_len(self, max_model_len: int) -> None:
        self.max_model_len = max_model_len
        # When a drafter is configured without its own max_model_len, its
        # effective maximum follows the updated value as well.
        ...

    def reset_mm_cache(self) -> None:
        """
        Clear the multi-modal cache that was used during profiling,
        but no longer needed during inference.
        """
        ...

    def reset_encoder_cache(self) -> None:
        """Clear the GPU-side encoder cache storing vision embeddings.

        This should be called when model weights are updated to ensure
        stale embeddings computed with old weights are not reused.
        """
        self.encoder_cache.clear()

    def init_fp8_kv_scales(self) -> None:
        """
        Re-initialize the KV cache and FP8 scales after waking from sleep.
        1. Zero out the KV cache tensors to remove garbage data from re-allocation.
        2. Reset Attention layer scaling factors (_k_scale, _v_scale) to 1.0.
          If these are left at 0.0 (default after wake_up), all KV cache values
          become effectively zero, causing gibberish output.
        """
        ...

    def _get_positions(self, num_tokens: int | torch.Tensor) -> torch.Tensor:
        ...

    def _make_buffer(
        self, *size, dtype: torch.dtype, numpy: bool = True
    ) -> CpuGpuBuffer:
        return CpuGpuBuffer(
            *size,
            dtype=dtype,
            device=self.device,
            pin_memory=self.pin_memory,
            with_numpy=numpy,
        )

    def _init_model_kwargs(self, num_tokens: int) -> dict[str, Any]:
        ...
    def _may_reorder_batch(self, scheduler_output: SchedulerOutput) -> None:
        """
        Update the order of requests in the batch based on the attention
        backend's needs. For example, some attention backends (namely MLA) may
        want to separate requests based on if the attention computation will be
        compute-bound or memory-bound.

        Args:
            scheduler_output: The scheduler output.
        """
        if len(self.kv_cache_config.kv_cache_groups) == 0:
            return
        if self.reorder_batch_threshold is not None:
            reorder_batch_to_split_decodes_and_prefills(
                self.input_batch,
                scheduler_output,
                decode_threshold=self.reorder_batch_threshold,
            )

    def _init_device_properties(self) -> None:
        """Initialize attributes from torch.cuda.get_device_properties"""
        self.device_properties = torch.cuda.get_device_properties(self.device)
        self.num_sms = self.device_properties.multi_processor_count

    def _sync_device(self) -> None:
        torch.cuda.synchronize()

    def _update_states(self, scheduler_output: SchedulerOutput) -> None:
        """Update the cached states and the persistent batch with the scheduler
        output.

        The updated states are used by the `_prepare_inputs` function to create
        the input GPU tensors for the model.

        The SamplingMetadata is updated and copied to the GPU if there is a
        new/resumed/paused/finished request in the batch.
        """
        ...

    def _update_states_after_model_execute(
        self, num_accepted_tokens: torch.Tensor
    ) -> None:
        """Update the cached states after model execution.

        This is used for MTP/EAGLE for hybrid models, as in linear attention,
        only the last token's state is kept. In MTP/EAGLE, for draft tokens
        the state are kept util we decide how many tokens are accepted for
        each sequence, and a shifting is done during the next iteration
        based on the number of accepted tokens.
        """
        ...

    def _update_streaming_request(
        self, req_id: str, new_req_data: NewRequestData
    ) -> CachedRequestState:
        """Updates streaming session request from `scheduled_new_reqs`.

        Removes the request from InputBatch (if present), updates the cached
        state, and prepares it for re-addition to the batch.

        NOTE: prompt_token_ids includes intermediate output tokens - tokens
        previously generated but now are input context (part of the prompt).
        """
        ...

    def _init_mrope_positions(self, req_state: CachedRequestState) -> None:
        model = self.get_model()
        assert supports_mrope(model), "M-RoPE support is not implemented."
        assert req_state.prompt_token_ids is not None, (
            "M-RoPE requires prompt_token_ids to be available."
        )
        ...

    def _init_xdrope_positions(self, req_state: CachedRequestState) -> None:
        model = self.get_model()
        assert req_state.prompt_token_ids is not None, (
            "XD-RoPE requires prompt_token_ids to be available."
        )
        assert supports_xdrope(model), "XD-RoPE support is not implemented."
        ...

    def _extract_mm_kwargs(
        self, scheduler_output: SchedulerOutput
    ) -> BatchedTensorInputs:
        ...

    def _dummy_mm_kwargs(self, num_seqs: int) -> BatchedTensorInputs:
        ...
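    # Worked trace of the trick used by _get_cumsum_and_arange below, for
    # num_tokens = [2, 5, 3] (values recomputed here for illustration):
    #   cu_num_tokens               = [2, 7, 10]
    #   cu_num_tokens - num_tokens  = [0, 2, 7]           # start offset per request
    #   repeat(offsets, num_tokens) = [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
    #   arange_np[:10] - repeated   = [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]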
    def _get_cumsum_and_arange(
        self,
        num_tokens: np.ndarray,
        cumsum_dtype: np.dtype | None = None,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Get the cumulative sum and batched arange of the given array.
        # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
        # Equivalent to but faster than:
        # np.concatenate([np.arange(n) for n in num_tokens])
        """
        cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype)
        total_num_tokens = cu_num_tokens[-1]
        # Offset of each request's first token within the flattened batch,
        # repeated once per token of that request.
        cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
        arange = self.arange_np[:total_num_tokens] - cumsums_offsets
        return cu_num_tokens, arange

    def _prepare_input_ids(
        self,
        scheduler_output: SchedulerOutput,
        total_num_scheduled_tokens: int,
        cu_num_tokens: np.ndarray,
    ) -> None:
        """Prepare the input IDs for the current batch.

        Carefully handles the `prev_sampled_token_ids` which can be cached
        from the previous engine iteration, in which case those tokens on the
        GPU need to be copied into the corresponding slots into input_ids."""
        ...

    def _get_encoder_seq_lens(
        self,
        scheduler_output: SchedulerOutput,
        kv_cache_spec: KVCacheSpec,
        num_reqs: int,
        for_cudagraph_capture: bool = False,
    ) -> tuple[torch.Tensor | None, np.ndarray | None]:
        ...

    def _prepare_inputs(
        self, scheduler_output: SchedulerOutput
    ) -> tuple[torch.Tensor, SpecDecodeMetadata | None]:
        """
        :return: tuple[
            logits_indices, spec_decode_metadata,
        ]
        """
        ...

    def _build_attention_metadata(
        self,
        max_query_len: int,
        num_tokens_padded: int,
        num_reqs_padded: int,
        ubatch_slices: UBatchSlices | None = None,
        logits_indices: torch.Tensor | None = None,
        cascade_attn_prefix_lens: list[list[int]] | None = None,
        num_scheduled_tokens: dict[str, int] | None = None,
        slot_mappings: dict[int, torch.Tensor] | None = None,
        for_cudagraph_capture: bool = False,
    ) -> tuple[PerLayerAttnMetadata, CommonAttentionMetadata | None]:
        """
        :return: tuple[attn_metadata, spec_decode_common_attn_metadata]
        """
        ...

    def _compute_cascade_attn_prefix_lens(
        self,
        num_scheduled_tokens: np.ndarray,
        num_common_prefix_blocks: list[int],
    ) -> list[list[int]] | None:
        """
        :return: Optional[cascade_attn_prefix_lens]
            cascade_attn_prefix_lens is 2D: ``[kv_cache_group_id][attn_group_idx]``,
            None if we should not use cascade attention
        """
        ...
    def _compute_cascade_attn_prefix_len(
        self,
        num_scheduled_tokens: np.ndarray,
        num_common_prefix_blocks: int,
        kv_cache_spec: KVCacheSpec,
        attn_metadata_builder: AttentionMetadataBuilder,
    ) -> int:
        """Compute the length of the common prefix for cascade attention.

        NOTE(woosuk): The common prefix length returned by this function
        represents the length used specifically for cascade attention, not the
        actual number of tokens shared between requests. When cascade attention
        is disabled (use_cascade=False), this function returns 0 even if
        requests share common tokens. Additionally, the common prefix length is
        truncated to a multiple of the block size and may be further truncated
        due to implementation details explained below.

        Args:
            num_scheduled_tokens: Number of tokens scheduled per request.
            num_common_prefix_blocks: Number of shared KV cache blocks.

        Returns:
            int: Length of common prefix in tokens.
        """
        ...

    def _calc_mrope_positions(self, scheduler_output: SchedulerOutput) -> None:
        ...

    def _calc_xdrope_positions(self, scheduler_output: SchedulerOutput) -> None:
        ...

    def _calc_spec_decode_metadata(
        self,
        num_draft_tokens: np.ndarray,
        cu_num_scheduled_tokens: np.ndarray,
    ) -> SpecDecodeMetadata:
        ...

    def _prepare_kv_sharing_fast_prefill(
        self, logits_indices: torch.Tensor
    ) -> torch.Tensor:
        ...

        Args:
            scheduler_output: The scheduler output containing scheduled encoder
                inputs.

        Returns:
            A tuple of (mm_hashes, mm_kwargs, mm_lora_refs) where:
            - mm_hashes: List of multimodal hashes for each item
            - mm_kwargs: List of multimodal kwargs for each item
            - mm_lora_refs: List of (req_id, placeholder_range) for each item
        N)scheduled_encoder_inputsr   r  rk  rF   rG   r  rg  r  rM  r  
identifierrN  r  )r   r   r  	mm_hashesrP  mm_lora_refsr  encoder_input_idsr(  mm_input_idr  r   r   r   _batch_mm_inputs_from_scheduler  s"   





	z.GPUModelRunner._batch_mm_inputs_from_schedulerc           #   
      s    |\}}}|sg S t jo jjo|j}tt j} jr j	
 rg }g }t }	g }
|D ]@\}} jj| }t jj| } j| }|| ||g|  |
| |dkrq jj|}|d urq|	| q1tt|t|dtjd} j	|	| t jdr fdd|
D }ttj|tj dtj|tj d}tt|! t|dtj"d} j	|	| g }d}t#| j$ j%d	D ]\}}} j&r*|d
kr*|dkr*t't(j)  }t*|D ]?}|||  } +|||| d$ t,t#|g j$ j%d	\}}}|j-di |}|| W d    n	1 s!w   Y  q|} n! +|||| |j-di |} W d    n	1 sFw   Y  t.| |d ||  ||7 }qt/||D ]\}!}"|" j0|!< t12d|!  3 j0|! q`|S )Nr   T)
is_prefillr  get_num_mm_connector_tokensc                    s   g | ]} j |qS r   )r)  r'  )r   r  r   r   r   r   1	  s    
z6GPUModelRunner._execute_mm_encoder.<locals>.<listcomp>r  )index_mappingprompt_mappingr&  r  rL  videor   expected_num_itemszFinish execute for mm hash %sr   )4r%  r   r*  enable_mm_processor_statsr  r   r7   r)  r%  lora_managersupports_tower_connector_lorar  ro  r  r   request_lora_mappingget_num_mm_encoder_tokensget_num_embedsr  r  lora_id_to_lora_requestr  r  r)   rk  r*   TOWERset_active_adaptersr  r  r_  r  r|  r   	CONNECTORrH   r  r  r2  r   r   r   r  timed_encoder_operationnextembed_multimodalr   r   rX  loggerdebugmaybe_save_ec_to_connector)#r   r   r!  rP  r"  should_timer)  prompt_lora_mappingtoken_lora_mappinglora_requestsencoder_token_countsr  r  r  lora_idr  r  tower_mappingpost_op_countsconnector_token_mappingconnector_mappingencoder_outputscurrent_item_idxrN  	num_itemsrU  curr_group_outputs_lst	video_idxvideo_mm_kwargs_itemrT  micro_batch_mm_inputsmicro_batch_outputscurr_group_outputsr"  r   r   r   r   _execute_mm_encoder  s   












z"GPUModelRunner._execute_mm_encoderr   shift_computed_tokensc                 C   sV  |j }d| j | _| j| j }ttj  }|j}d|d |< d}d}d}	| jjD ]}
g }|j	|
 }| j
|
 }|j| }|jD ]}|j}|j}|j}||| krQ n|| |krXq>t|| d}t|| | |}||k snJ |||\}}||kr{q>|j}| j|d }|d usJ d| d|j }d ur||| }||| }n||| }|| | }|d u rd||| || < n||| ||   |O  < || q>| jr| jr|jd usJ d}| jj|j||j|jd\}}}|j| ||_| | ||7 }q(|!|}|r| "| | j!| |	r'| #| | j$!| ||fS )Nr   Fr   zEncoder cache miss for .T)r}  multimodal_embeddingsr  r  )%rd  r  r  r   r   r   r   ro  r  r  rg  r  r  r  offsetr  rl  r  get_embeds_indices_in_ranger   rX  r  is_embedr  r2  rI  r  r)  recompute_mrope_positionsr  rj  rH  r  rh  r  r  r  )r   r   rQ  rd  is_mm_embed_buf	mm_embedsis_mm_embedreq_start_idxshould_sync_mrope_positionsshould_sync_xdrope_positionsr  mm_embeds_reqr  r(  r  r  r  r  num_encoder_tokens	start_idxr4  curr_embeds_startcurr_embeds_endr"  encoder_outputrV  mm_embeds_itemreq_start_posnew_mrope_positions	new_deltar   r   r   _gather_mm_embeddings	  s   










z$GPUModelRunner._gather_mm_embeddingsc                 C   s2   t | ds	tdt| jttfr| j S | jS )Nr)  z2Cannot get model before model has been initialized)r  ra  r  r)  r   r   unwrapr   r   r   r   r  	  s
   

zGPUModelRunner.get_modelc                 C   sX   |   }tt  }t|r|d t|r!|jrdgS |d t|r*|d |S )Ngeneratetranscriptionrealtime)r  r   rL   rB   r  r>   supports_transcription_onlyr=   )r   r)  supported_tasksr   r   r   get_supported_generation_tasks
  s   



z-GPUModelRunner.get_supported_generation_tasksc                 C   sZ   |   }t|s
g S t|j }d|v r+t| jjdd}|dkr+|d t	
d |S )Nscore
num_labelsr   r   z.Score API is only enabled for num_labels == 1.)r  rA   r   r  get_supported_tasksr  r"  r  remover:  
debug_once)r   r)  rn  rq  r   r   r   get_supported_pooling_tasks
  s   

z*GPUModelRunner.get_supported_pooling_tasks.c                 C   sF   t t  }| jjdkr||   | jjdkr||   t|S )Nrj  r  )r   rN   r"  r/  r  ro  ru  rk  )r   tasksr   r   r   rr  '
  s   
z"GPUModelRunner.get_supported_tasksr  	sync_selfc                    s   | j d usJ | jjjt| j |rC|d usJ | D ]$\}}|dko' }|r. n}| j | d | j|d | dd qt fdd| j  D S )NresidualTr   c                    s:   i | ]\}}||d kr r|d  n|d qS )rx  Nr   r   kvis_rsr  tpr   r   
<dictcomp>H
  s    
zFGPUModelRunner.sync_and_slice_intermediate_tensors.<locals>.<dictcomp>)r  r  r'  tensor_parallel_sizer   r  rj  rK   )r   r  r  rw  rz  r{  is_scatteredcopy_lenr   r|  r   #sync_and_slice_intermediate_tensors1
  s    
z2GPUModelRunner.sync_and_slice_intermediate_tensorsis_dummy
is_profilec                 C   sJ   | j jsdS | jdusJ |  }t|sJ | jj||| j jjd dS )zN
    def eplb_step(self, is_dummy: bool = False, is_profile: bool = False) -> None:
        """
        Step for the EPLB (Expert Parallelism Load Balancing) state.
        """
        if not self.parallel_config.enable_eplb:
            return
        assert self.eplb_state is not None
        model = self.get_model()
        assert is_mixture_of_experts(model)
        self.eplb_state.step(
            model,
            is_dummy,
            is_profile,
            log_stats=self.parallel_config.eplb_config.log_balancedness,
        )

    def _pool(
        self,
        hidden_states: torch.Tensor,
        num_scheduled_tokens: int,
        num_scheduled_tokens_np: np.ndarray,
    ) -> ModelRunnerOutput:
        ...

    def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int:
        ...

    def _prepare_mm_inputs(
        self, num_scheduled_tokens: int
    ) -> tuple[torch.Tensor | None, torch.Tensor]:
        ...

    def _preprocess(
        self,
        scheduler_output: SchedulerOutput,
        num_input_tokens: int,
        intermediate_tensors: IntermediateTensors | None = None,
    ) -> tuple:
        ...

    def _sample(
        self,
        logits: torch.Tensor | None,
        spec_decode_metadata: SpecDecodeMetadata | None,
    ) -> SamplerOutput:
        ...

    def _bookkeeping_sync(
        self,
        scheduler_output: SchedulerOutput,
        sampler_output: SamplerOutput,
        logits: torch.Tensor | None,
        hidden_states: torch.Tensor,
        num_scheduled_tokens: int,
    ) -> tuple:
        ...

    @contextmanager
    def synchronize_input_prep(self):
        if self.prepare_inputs_event is None:
            yield
            return
        self.prepare_inputs_event.synchronize()
        try:
            yield
        finally:
            self.prepare_inputs_event.record()

    def _model_forward(
        self,
        input_ids: torch.Tensor | None = None,
        positions: torch.Tensor | None = None,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **model_kwargs: Any,
    ) -> torch.Tensor:
        """Helper method to call the model forward pass.

        This method can be overridden by subclasses for model execution.
        Motivation: We can inspect only this method versus
        the whole execute_model, which has additional logic.

        Args:
            input_ids: Input token IDs
            positions: Token positions
            intermediate_tensors: Tensors from previous pipeline stages
            inputs_embeds: Input embeddings (alternative to input_ids)
            **model_kwargs: Additional model arguments

        Returns:
            Model output tensor
        """
        return self.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
            **model_kwargs,
        )

    def _is_uniform_decode(
        self,
        max_num_scheduled_tokens: int,
        num_reqs: int,
        num_scheduled_tokens: int,
        force_uniform_decode: bool | None = None,
    ) -> bool:
        """
        Checks if it's a decode batch with same amount scheduled tokens
        across all requests.
        """
        if force_uniform_decode is None:
            return (
                max_num_scheduled_tokens == self.uniform_decode_query_len
                and num_scheduled_tokens == num_reqs * max_num_scheduled_tokens
            )
        return force_uniform_decode

    def _determine_batch_execution_and_padding(
        self,
        num_scheduled_tokens: int,
        max_num_scheduled_tokens: int,
        num_reqs: int,
        num_encoder_reqs: int,
        allow_microbatching: bool = True,
        force_eager: bool = False,
        force_has_lora: bool | None = None,
        force_num_active_loras: int | None = None,
        force_uniform_decode: bool | None = None,
    ) -> tuple:
        ...

    def _register_layerwise_nvtx_hooks(self) -> None:
        """
        Register layerwise NVTX hooks if --enable-layerwise-nvtx-tracing is enabled
        to trace detailed information of each layer or module in the model.
        """
        ...
        Build slot mappings in both formats needed by the system.

        Args:
            num_tokens_padded: Total number of tokens (padded)
            num_reqs_padded: Total number of requests (padded)
            num_tokens_unpadded: Actual number of tokens (unpadded)
            ubatch_slices: Optional ubatch slicing info for DBO

        Returns:
            A tuple of:
            - slot_mappings_by_gid: dict[int, torch.Tensor] for attention metadata
            - slot_mappings_by_layer: dict[str, torch.Tensor] or list for ForwardContext
        """
        ...


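    # A small sketch, under assumed simplified types, of the two bookkeeping
    # views the docstring above describes: one slot-mapping tensor per KV-cache
    # group id, and the same tensors re-keyed by layer name so the forward
    # context can look them up directly. `group_slots` and `group_layers` are
    # hypothetical inputs, not vLLM APIs.
    @staticmethod
    def _slot_mapping_views_sketch(
        group_slots: dict[int, "torch.Tensor"],
        group_layers: dict[int, list[str]],
    ) -> tuple[dict[int, "torch.Tensor"], dict[str, "torch.Tensor"]]:
        by_layer = {
            layer_name: group_slots[gid]
            for gid, layer_names in group_layers.items()
            for layer_name in layer_names
        }
        return group_slots, by_layer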


    def execute_model(self, scheduler_output, intermediate_tensors=None):
        # error: "State error: sample_tokens() must be called after
        # execute_model() returns None."
        # error: "RoutedExpertsCapturer not initialized."
        # assert: "--kv-sharing-fast-prefill produces incorrect logprobs for
        # prompt tokens, please disable it when the requests need prompt logprobs"
        # debug: "Running batch with cudagraph_mode: %s, batch_descriptor: %s,
        # should_ubatch: %s, num_tokens_across_dp: %s, ubatch_slices: %s,
        # ubatch_slices_padded: %s"
        # Profiled ranges: "gpu_model_runner: preprocess",
        # "gpu_model_runner: forward", "gpu_model_runner: postprocess".
        ...

    def sample_tokens(self, grammar_output):
        # Profiled ranges: "gpu_model_runner: sample", "gpu_model_runner: draft",
        # "gpu_model_runner: bookkeep", "gpu_model_runner: eplb",
        # "gpu_model_runner: ModelRunnerOutput",
        # "gpu_model_runner: AsyncGPUModelRunnerOutput",
        # "gpu_model_runner: set_async_sampled_token_ids".
        ...



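    # A hedged illustration (not vLLM's actual control flow) of the state rule
    # encoded in the error string above: execute_model() may defer sampling, and
    # sample_tokens() must then be called before the next execute_model(). The
    # attribute name `_pending_execute` is an assumption used only for this sketch.
    def _check_execute_sample_ordering_sketch(self, starting_execute: bool) -> None:
        pending = getattr(self, "_pending_execute", False)
        if starting_execute and pending:
            raise RuntimeError(
                "State error: sample_tokens() must be called after "
                "execute_model() returns None."
            )
        # execute_model() sets the flag; sample_tokens() clears it.
        self._pending_execute = starting_execute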


	

	








    def _pp_broadcast_prev_sampled_token_ids(self, sampled_token_ids) -> None:
        """Broadcast sampled token ids (GPU) from last PP stage"""
        # assert: "PP+async expects sampled_token_ids to have shape [num_reqs, 1]"
        ...

    def _pp_receive_prev_sampled_token_ids_to_input_batch(self) -> None:
        """Receive sampled token ids broadcast from last PP stage"""
        ...

    def take_draft_token_ids(self):
        ...

    def _copy_draft_token_ids_to_cpu(self, *args, **kwargs) -> None:
        ...

    def _get_draft_token_ids_cpu(self):
        ...

    def _copy_valid_sampled_token_count(self, *args, **kwargs) -> None:
        ...

    def _get_valid_sampled_token_count(self):
        ...

    def propose_draft_token_ids(self, *args, **kwargs):
        # error: "No spec decode metadata for medusa"
        # assert: "sampled_token_ids should be a python list when padded-batch
        # is disabled."
        # assert: "sampled_token_ids should be a torch.Tensor when padded-batch
        # is enabled."
        ...

    def update_config(self, overrides) -> None:
        # assert: "Config `<name>` not supported. Allowed configs: <allowed>"
        ...

    def load_model(self, eep_scale_up: bool = False) -> None:
        """
        Args:
            eep_scale_up: the model loading is for elastic EP scale up.
        """
        # info: "Starting to load model %s..." / "Loading drafter model..."
        # info: "EPLB is enabled for model %s." /
        #       "EPLB is enabled for drafter model %s."
        # error: "Model does not support EAGLE3 interface but
        # aux_hidden_state_outputs was requested"
        # info: "Using auxiliary layers from speculative config: %s"
        # OOM help: "Failed to load model - not enough GPU memory. Try lowering
        # --gpu-memory-utilization to free memory for weights, increasing
        # --tensor-parallel-size, or using --quantization. See
        # https://docs.vllm.ai/en/latest/configuration/conserving_memory/ for
        # more tips."
        # info: "Model loading took %s GiB memory and %.6f seconds"
        ...

    def _get_eagle3_aux_layers_from_config(self):
        """Extract Eagle3 auxiliary layer indices from speculative config.

        These indices specify which hidden states from the base model should
        be used as auxiliary inputs for the Eagle3 drafter model during
        speculative decoding.

        Returns:
            Tuple of layer indices if found in draft model config,
            None otherwise.
        """
        # Reads `eagle_aux_hidden_state_layer_ids` from the draft model config.
        ...

    def reload_weights(
        self,
        weights_iterator=None,
        weights_path=None,
        is_checkpoint_format=True,
    ) -> None:
        """
        Reload weights from a weights iterator or from disk

        :param weights_iterator: weights to load into model
        :param weights_path: path to load weights from if weights_iterator is not
            provided. Use path of original model if neither is provided.
        :param is_checkpoint_format: set to False if weights have already been processed
            into kernel format (repacking, renaming, etc.)
        """
        # warning: "Reloading from disk means that weights will be in checkpoint
        # format. Please use `is_checkpoint_format=True` to avoid weight
        # reloading errors"
        # error: "Model reloading with `<load_format>` format" is not supported
        # by loaders without `get_all_weights`.
        # info: "Reloading weights inplace..." /
        #       "Reloading and processing weights took %.2f seconds"
        # warning: "Following weights were not loaded from checkpoint: %s"
        ...

    def save_tensorized_model(self, tensorizer_config) -> None:
        ...

    def _get_prompt_logprobs_dict(self, *args, **kwargs):
        # Computes prompt logprobs for in-progress prefill requests and clears
        # completed entries from the in-progress dict.
        ...

    def _get_nans_in_logits(self, logits):
        # Returns a per-request count of NaN values in the logits, or an empty
        # dict on IndexError.
        ...

    @contextmanager
    def maybe_randomize_inputs(self, input_ids=None, inputs_embeds=None):
        """
        Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
        This is to help balance expert-selection
         - during profile_run
         - during DP rank dummy run
        """
        # debug: "Randomizing dummy input_ids for DP Rank" /
        #        "Randomizing dummy inputs_embeds for DP Rank"
        ...

    def _get_mm_dummy_batch(self, modality, max_items_per_batch):
        """Dummy data for profiling and precompiling multimodal models."""
        # assert: "Item should not already be cached"
        ...

    def _dummy_run(
        self,
        num_tokens,
        cudagraph_runtime_mode=None,
        force_attention=False,
        uniform_decode=False,
        skip_eplb=False,
        is_profile=False,
        create_mixed_batch=False,
        remove_lora=True,
        num_active_loras=0,
    ):
        """
        Run a dummy forward pass to warm up/profile run or capture the
        CUDA graph for the model.

        Args:
            num_tokens: Number of tokens to run the dummy forward pass.
            cudagraph_runtime_mode: used to control the behavior.
                - if not set will determine the cudagraph mode based on using
                    the self.cudagraph_dispatcher.
                - CUDAGraphMode.NONE: No cudagraph, for warm up and profile run
                - CUDAGraphMode.PIECEWISE: Piecewise cudagraph.
                - CUDAGraphMode.FULL: Full cudagraph, attention metadata is
                    needed.
            force_attention: If True, always create attention metadata. Used to
                warm up attention backend when mode is NONE.
            uniform_decode: If True, the batch is a uniform decode batch.
            skip_eplb: If True, skip EPLB state update.
            is_profile: If True, this is a profile run.
            create_mixed_batch: If True, create a mixed batch with both decode
                (1 token) and prefill (multiple tokens) requests.
            remove_lora: If False, dummy LoRAs are not destroyed after the run
            num_active_loras: Number of distinct active LoRAs to capture for.
                LoRA is activated when num_active_loras > 0.
        """
        # assert: "Cudagraph runtime mode mismatch in dummy_run. Expected
        # <mode>, but got <mode>."
        ...

    def _dummy_sampler_run(self, hidden_states):
        # OOM help: "CUDA out of memory occurred when warming up sampler with
        # <n> dummy requests. Please try lowering `max_num_seqs` or
        # `gpu_memory_utilization` when initializing the engine."
        ...

    def _dummy_pooler_run_task(self, hidden_states, task):
        # OOM help: "CUDA out of memory occurred when warming up pooler
        # (task=<task>) with <n> dummy requests. Please try lowering
        # `max_num_seqs` or `gpu_memory_utilization` when initializing the
        # engine."
        ...

    def _dummy_pooler_run(self, hidden_states):
        # error: "Model <name> does not support any pooling tasks. See
        # https://docs.vllm.ai/en/latest/models/pooling_models.html to learn
        # more."
        ...

    def profile_run(self) -> None:
        # debug: "Skipping memory profiling for multimodal encoder and encoder
        # cache." / "Skipping encoder profiling for embedding-only mode (all
        # modality limits=0 with enable_mm_embeds=True)."
        # info: "Encoder cache will be initialized with a budget of %s tokens,
        # and profiled with %s %s items of the maximum feature size."
        ...

    def capture_model(self):
        # info_once: "Skipping CUDA graph capture. To turn on CUDA graph
        # capture, ensure `cudagraph_mode` was not manually set to `NONE`"
        # info: "Graph capturing finished in %.0f secs, took %.2f GiB"
        # Capture freezes the Python GC (VLLM_ENABLE_CUDAGRAPH_GC) while the
        # CUDA graphs for all batch descriptors are recorded.
        ...

    def _capture_cudagraphs(self, batch_descriptors, cudagraph_runtime_mode, uniform_decode):
        # assert: "Invalid cudagraph runtime mode: <mode>"
        # tqdm description: "Capturing CUDA graphs ({}, {})" with "decode" or
        # "mixed prefill-decode".
        ...

    def initialize_attn_backend(self, kv_cache_config) -> None:
        """
        Initialize the attention backends and attention metadata builders.
        """
        # assert: "Attention backends are already initialized"
        ...

    def initialize_metadata_builders(self, kv_cache_config, kernel_block_sizes) -> None:
        """
        Create the metadata builders for all KV cache groups and attn groups.
        """
        ...

    def _check_and_update_cudagraph_mode(self, attention_backends, kv_cache_config) -> None:
        """
        Resolve the cudagraph_mode when there are multiple attention
        groups with potential conflicting CUDA graph support.
        Then initialize the cudagraph_dispatcher based on the resolved
        cudagraph_mode.
        """
        # Typical resolution messages:
        # "CUDAGraphMode.<mode> is not supported with <backend> backend
        #  (support: <level>); please try cudagraph_mode=PIECEWISE, and make
        #  sure compilation mode is VLLM_COMPILE"
        # "... setting cudagraph_mode=FULL_AND_PIECEWISE" /
        # "... setting cudagraph_mode=FULL_DECODE_ONLY"
        # "... setting cudagraph_mode=PIECEWISE because attention is compiled
        #  piecewise" / "... setting cudagraph_mode=NONE because attention is
        #  not compiled piecewise"
        # "... is not supported with spec-decode for attention backend <name>"
        ...

    def calculate_reorder_batch_threshold(self) -> None:
        """
        Choose the minimum reorder batch threshold from all attention groups.
        Backends should be able to support a lower threshold than what they
        request; they just may have a performance penalty due to that backend
        treating decodes as prefills.
        """
        # Takes the minimum of the thresholds requested by all attention
        # groups, treating None as "no preference".
        ...

    @staticmethod
    def select_common_block_size(kv_manager_block_size, attn_groups):
        """
        Select a block size that is supported by all backends and is a factor of
        kv_manager_block_size.

        If kv_manager_block_size is supported by all backends, return it directly.
        Otherwise, return the max supported size.

        Args:
            kv_manager_block_size: Block size of KV cache
            attn_groups: List of attention groups

        Returns:
            The selected block size

        Raises:
            ValueError: If no valid block size found
        """
        # Inner helper docstring: "Check if the block size is supported by all
        # backends."
        # error: "Unknown supported size: <size>" /
        #        "No common block size for <backends>."
        ...
      C   s  dd |j D }g }t| j| j}t|j D ]1\}}t|jtr!qt||| t	  }t|jt
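    # A self-contained sketch of the selection rule described in the docstring
    # above: keep kv_manager_block_size if every backend supports it, otherwise
    # fall back to the largest supported size that divides it. The
    # `supported_sizes` argument (a set of ints per backend) is an assumed
    # simplification of the real per-backend kernel-block-size queries.
    @staticmethod
    def _select_common_block_size_sketch(
        kv_manager_block_size: int,
        supported_sizes: list[set[int]],
    ) -> int:
        def ok(size: int) -> bool:
            return all(size in sizes for sizes in supported_sizes)

        if ok(kv_manager_block_size):
            return kv_manager_block_size
        candidates = sorted(
            {s for sizes in supported_sizes for s in sizes}, reverse=True
        )
        for size in candidates:
            if kv_manager_block_size % size == 0 and ok(size):
                return size
        raise ValueError(f"No common block size for {kv_manager_block_size}.")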
rB| jjr7|nd|jj }	t||	}|| q|| jjgksV|| jjgkr| jjdks`J dt| j|| j| j| j| j |||t| jj| jj| jj| jd| _dS dS )a]  

    def may_reinitialize_input_batch(self, kv_cache_config, kernel_block_sizes) -> None:
        """
        Re-initialize the input batch if the block sizes are different from
        `[self.cache_config.block_size]`. This usually happens when there
        are multiple KV cache groups.

        Args:
            kv_cache_config: The KV cache configuration.
            kernel_block_sizes: The kernel block sizes for each KV cache group.
        """
        # assert: "Cannot re-initialize the input batch when CPU weight
        # offloading is enabled. See
        # https://github.com/vllm-project/vllm/pull/18298 for more details."
        ...

    def _allocate_kv_cache_tensors(self, kv_cache_config):
        """
        Initializes the KV cache buffer with the correct size. The buffer needs
        to be reshaped to the desired shape before being used by the models.

        Args:
            kv_cache_config: The KV cache config
        Returns:
            dict[str, torch.Tensor]: A map between layer names to their
            corresponding memory buffer for KV cache.
        """
        # assert: "Some layers are not correctly initialized"
        ...

    def _attn_group_iterator(self):
        ...

    def _kv_cache_spec_attn_group_iterator(self):
        ...

    def _prepare_kernel_block_sizes(self, kv_cache_config):
        """
        Generate kernel_block_sizes that matches each block_size.

        For attention backends that support virtual block splitting,
        use the supported block sizes from the backend.
        For other backends (like Mamba), use the same block size (no splitting).

        Args:
            kv_cache_config: The KV cache configuration.

        Returns:
            list[int]: List of kernel block sizes for each cache group.
        """
        # error: "unknown kv cache spec <spec>"
        ...

    def _reshape_kv_cache_tensors(
        self,
        kv_cache_config,
        kv_cache_raw_tensors,
        kernel_block_sizes,
    ):
        """
        Reshape the KV cache tensors to the desired shape and dtype.

        Args:
            kv_cache_config: The KV cache config
            kv_cache_raw_tensors: The KV cache buffer of each layer, with
                correct size but uninitialized shape.
            kernel_block_sizes: The kernel block sizes for each KV cache group.
        Returns:
            Dict[str, torch.Tensor]: A map between layer names to their
            corresponding memory buffer for KV cache.
        """
        ...

    def _update_hybrid_attention_mamba_layout(self, kv_caches) -> None:
        """
        Update the layout of attention layers from (2, num_blocks, ...) to
        (num_blocks, 2, ...).

        Args:
            kv_caches: The KV cache buffer of each layer.
        """
        # assert: "Fail to determine whether the layout is (2, num_blocks, ...)
        # or (num_blocks, 2, ...) for a tensor of shape <shape>"
        ...

    def initialize_kv_cache_tensors(self, kv_cache_config, kernel_block_sizes):
        """
        Initialize the memory buffer for KV cache.

        Args:
            kv_cache_config: The KV cache config
            kernel_block_sizes: The kernel block sizes for each KV cache group.

        Returns:
            Dict[str, torch.Tensor]: A map between layer names to their
            corresponding memory buffer for KV cache.
        """
        # debug: "%s reuses KV cache of %s"
        ...

    def maybe_add_kv_sharing_layers_to_kv_cache_groups(self, kv_cache_config) -> None:
        """
        Add layers that re-use KV cache to KV cache group of its target layer.
        Mapping of KV cache tensors happens in `initialize_kv_cache_tensors()`
        """
        ...

    def initialize_kv_cache(self, kv_cache_config) -> None:
        """
        Initialize KV cache based on `kv_cache_config`.
        Args:
            kv_cache_config: Configuration for the KV cache, including the KV
            cache size of each layer
        """
        ...

    def init_routed_experts_capturer(self) -> None:
        # info: "Initializing routed experts capturer,
        # enable_return_routed_experts: %s"
        ...

    def _bind_routed_experts_capturer(self, capturer) -> None:
        # Registers a capture callback on every FusedMoE router in the model.
        ...

    def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
        """
        Add encoder-only layers to the KV cache config.
        """
        # assert: "Only support one encoder-only attention spec now"
        ...

    def get_kv_cache_spec(self):
        """
        Generates the KVCacheSpec by parsing the kv cache format from each
        Attention module in the static forward context.
        Returns:
            KVCacheSpec: A dictionary mapping layer names to their KV cache
            format. Layers that do not need KV cache are not included.
        """
        ...

    def _to_list(self, sampled_token_ids):
        # Copies the sampled token ids to a pinned CPU buffer and returns them
        # as a Python list.
        ...

    def get_encoder_timing_stats(self):
        """
        Get encoder timing stats for all requests and clear the registry.

        Returns:
            Dictionary mapping request_id to stats dict.
        """
        ...

    @contextmanager
    def timed_encoder_operation(self, should_time, group_lora_refs, current_item_idx, num_items):
        """
        Context manager to time encoder forward operations.

        Args:
            should_time: Whether timing is enabled
            group_lora_refs: Full list of (request_id, pos_info) tuples
            current_item_idx: Starting index for this group
            num_items: Number of items in this group
        """
        # Accumulates per-request EncoderTimingStats (encoder_forward_time,
        # num_encoder_calls), splitting the elapsed time evenly across the
        # requests in the group.
        ...



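    # A hedged, standalone sketch of the timing pattern described above: wrap an
    # encoder forward, synchronize, and attribute an equal share of the elapsed
    # time to every request id in the group. `stats` here is a plain dict used
    # in place of the runner's registry; all names are illustrative.
    @staticmethod
    @contextmanager
    def _timed_encoder_sketch(stats: dict[str, float], request_ids: list[str]):
        import time

        import torch

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.perf_counter()
        try:
            yield
        finally:
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            per_request = (time.perf_counter() - start) / max(len(request_ids), 1)
            for req_id in request_ids:
                stats[req_id] = stats.get(req_id, 0.0) + per_request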
"


z&GPUModelRunner.timed_encoder_operation)r   N)r   r   r   Nr   )F)	NNNNFFNNN)r   r   r  r  )NNNN)TFNNNr   )NNT)r  r   r   N)
NFFTFFFTFr   )r   r   r   r   r   r  r   r   r  r  r  inference_moder  r   r  SymIntr  r   r   r{  r  r  rv  r  r:  r   rF  r  re   r   r   r  r  rE   rV  r\  r  ndarrayrk  rc  r  r  rn   r  r   r  r   r   r   r_   r  r   r]   r  r  r  r  r  rF   rG   r%  rP  rh  nnModuler  rL   ro  rM   ru  rN   rr  rK   r  r  rv   ry   rs   r  r  r  ru   r  r{   r  rw   rx   r  r   r  r  staticmethodr  r   r&   r   r  r  r  r*  rD  r5  r3  rt   rP  r-  r  r8  r  r   r,  r   rO   r{  r~  r   r  r  r  r  r  rZ  r  r  rz   r  r  r&  r8  r0  rl   rX  r\  r  r  rZ   rm   rT  r[  r   r  r  r  r   rw  r  r  r  r  r  r  r  r  r.   r  r  r  r  r  r  r7  r   r   r   r   r  I  s   
  +
" x
0
"


~

-
 a	

 u
$
a1/
O

+
  
m


<


t







 
})"
J  6
 C

"!
	"

 - -
T


i

,
	
  ON
7KF
NQ !G< +h73  &r  c                   @   sH   e Zd ZU dZdZeed< 	 dZeed< 	 de	e
eeB f fddZd	S )
r  z7Per-request timing statistics for encoder forward pass.g        r  r   r  r   c                 C   s   | j | jdS )Nr  r  r  r   r   r   r   r    s   zEncoderTimingStats.to_dictN)r   r   r   r  r  r  r  r  r   r  r  r  r   r   r   r   r  w  s   
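# Minimal sketch of how the stats object accumulates per-request encoder
# timings and what ``to_dict`` reports; the helper name and the timing values
# are illustrative assumptions only.
def _encoder_timing_stats_example() -> dict[str, float | int]:
    stats = EncoderTimingStats()
    for elapsed in (0.012, 0.034):  # hypothetical per-call encoder times (seconds)
        stats.encoder_forward_time += elapsed
        stats.num_encoder_calls += 1
    # -> approximately {"encoder_forward_time": 0.046, "num_encoder_calls": 2}
    return stats.to_dict()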
 4$	44D*
                                              \