o
    
۾i                     @   s$  d dl Z d dlZd dlmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? d dl@mAZAmBZBmCZC d dlDmEZE d dlFmGZG d dlHmIZI d dlJmKZK d dlLmMZM d dlNmOZO d dlPmQZQ d dlRmSZS d d lTmUZU d d!lVmWZW d d"lXmYZY d d#lZm[Z[ ee\Z]G d$d% d%e[Z^dS )&    N)deepcopy)Any)
VllmConfig)CUDAGraphMode)&prepare_communication_buffer_for_model)set_forward_context)init_logger)get_model_loader)MULTIMODAL_REGISTRY)DeviceMemoryProfiler
format_gib)STR_DTYPE_TO_TORCH_DTYPE)GrammarOutputSchedulerOutput)KVCacheConfig)DraftTokenIdsModelRunnerOutput)AsyncOutput)build_attn_metadatabuild_slot_mappings_by_layerget_kv_cache_specinit_attn_backendinit_kv_cache)BlockTables)async_copy_to_gpu)CudaGraphManager)get_cudagraph_and_dp_paddingmake_num_tokens_across_dp)
InputBatchInputBuffers combine_sampled_and_draft_tokensexpand_idx_mappingget_num_sampled_and_rejectedpost_updateprepare_pos_seq_lensprepare_prefill_inputs)NO_OP_KV_CONNECTORKVConnectorget_kv_connector)	LoraState)EncoderRunner)
MRopeState)SamplerOutput)PromptLogprobsWorker)Sampler)init_speculator)rejection_sample)DraftTokensHandler)RequestState)StructuredOutputsWorker)LoRAModelRunnerMixinc                   @   s  e Zd ZdedejfddZdeddfdd	Ze	de
e fd
dZdSddZdejfddZdd ZdeddfddZdeddfddZe dddedede
ejejf fddZe dejddfdd Ze dSd!d"ZdSd#d$ZdSd%d&Zd'edefd(d)Ze defd*d+ZdSd,d-Z d.e!ddfd/d0Z"d.e!ddfd1d2Z#d.e!ddfd3d4Z$d.e!ddfd5d6Z%d.e!d7edefd8d9Z&e d:e'ee(e f dede
e(ej ejf fd;d<Z)dejded=e*dB de
e+ejejf fd>d?Z,ded@ejdAejdBejddf
dCdDZ-e dedEejdFe(ej dB dAejdBejdejfdGdHZ.e 		I	IdTd.e!dJe/dB dKedLede0dB f
dMdNZ1e d=e*dB de2e0B fdOdPZ3de4dB fdQdRZ5dS )UGPUModelRunnervllm_configdevicec                 C   s@  || _ |j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j	| _	|| _
| jj| _| j| _| jjdkr@t| jj | _d| _| j | _| jj| _| jj| _| jj| _| j | _t| _| j| j| _| jryt| j| j| j| j
d| _| jj| _| jrt | j| j| j| j
d| _!| jj"| _#t$j%&| j
| _'t$j%( | _)| jd urd| _*| jj+| _,t-| j | j
| _.n	d| _*d| _,d | _.t/| j| j| j| j,| j| j
d| _0t1| j| j| j
d| _2t3| j| j| j
| jj4| j,d	 d
| _5t6| j| _7t8| j | j| j
| _9t:| j| j,d	  | j| j
d| _;t<| jd| _=t>| j
| _?t@| _Ad S )NautoF)max_num_tokenshidden_sizedtyper7   )max_num_reqsr9   max_model_lenr7   Tr   )r<   r=   max_num_batched_tokensnum_speculative_steps
vocab_sizer7   )r<   r9   r7      )r<   r@   r7   logprobs_modenum_speculative_tokens)max_num_logitsr@   r7   )r<   )Br6   model_configcache_configcompilation_configlora_configload_configparallel_configscheduler_configspeculative_configobservability_configr7   r;   kv_cache_dtypecache_dtyper   is_pooling_modelget_vocab_sizer@   r=   r>   r9   max_num_seqsr<   get_inputs_embeds_sizeinputs_embeds_sizer
   mm_registrysupports_multimodal_inputssupports_mm_inputsr*   encoder_runner
uses_mroper+   mrope_statesasync_schedulinguse_async_schedulingtorchcudaStreamoutput_copy_streamEventoutput_copy_eventdo_spec_decoderC   r?   r/   
speculatorr2   
req_statesr   input_buffersr.   rB   samplerr-   prompt_logprobs_workerr   cudagraph_managerr3   structured_outputs_workerr)   
lora_stater1   draft_tokens_handlerr&   kv_connector)selfr6   r7    ro   S/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/gpu/model_runner.py__init__G   s   








zGPUModelRunner.__init__r=   returnNc                 C   s   || _ || j_ d S N)r=   re   )rn   r=   ro   ro   rp   update_max_model_len   s   z#GPUModelRunner.update_max_model_lenc                   C   s   dS )N)generatero   ro   ro   ro   rp   get_supported_tasks      z"GPUModelRunner.get_supported_tasksc                 O   s   t  }t 6}t| jj}td |j| j| jj	d| _
| jr,| | j
| j| j| _
| jr6| j| j
 W d    n1 s@w   Y  t  }|j| _tdt|j||  t| j
 | jrrt| jdd }|d urtt| d S d S d S )NzLoading model from scratch...)r6   rE   z*Model loading took %s GiB and %.6f secondsmodel)timeperf_counterr   r	   r6   rI   loggerinfo
load_modelrE   rx   rH   load_lora_modelr7   rc   rd   consumed_memorymodel_memory_usager   r   getattr)rn   argskwargstime_before_loadmmodel_loadertime_after_loadspeculator_modelro   ro   rp   r}      s<   

zGPUModelRunner.load_modelc                 C   s   | j S rs   )rx   rn   ro   ro   rp   	get_model   s   zGPUModelRunner.get_modelc                 C   s
   t | jS rs   )r   r6   r   ro   ro   rp   r         
z GPUModelRunner.get_kv_cache_speckv_cache_configc                 C   s   t |}|| _dd |jD }t|| j| j| j| jd| _t	| j| j
| j\| _| _| jr8| j| j| j| j g | _t| j| jj| j| j| j}t| j
|| _g | _d S )Nc                 S   s   g | ]}|j jqS ro   )kv_cache_spec
block_size).0kv_cache_groupro   ro   rp   
<listcomp>   s    z6GPUModelRunner.initialize_kv_cache.<locals>.<listcomp>)block_sizesr<   r>   r=   r7   )r   r   kv_cache_groupsr   r<   r9   r=   r7   block_tablesr   r6   attn_backendsattn_metadata_buildersrc   rd   set_attn	kv_cachesr   rG   static_forward_contextr(   rm   attn_groups)rn   r   r   kv_caches_dictro   ro   rp   initialize_kv_cache   s>   
z"GPUModelRunner.initialize_kv_cacheinput_batchc                 C   sj   | j |j}| j |j}t|| j}t| j|j|j|j	t
|j|j| j||| jd
}||_||_d S )N
r   num_reqs
num_tokensquery_start_loc_gpuquery_start_loc_cpuseq_lensmax_seq_lenr   slot_mappingsr   )r   get_dummy_block_tablesr   get_dummy_slot_mappingsr   r   r   r   r   query_start_locr]   
from_numpyquery_start_loc_npr   r=   attn_metadatar   )rn   r   r   r   slot_mappings_by_layerr   ro   ro   rp   prepare_dummy_attn_metadata  s*   

z*GPUModelRunner.prepare_dummy_attn_metadataT	skip_attnr   r   c                O   s   t || j}|| g| }|d  || 7  < t||ksJ dd t|D }t }||_||_| j	d | j
|d|d | j	d | jd usMJ | j\}	}
}|	|
j }|	|fS )Nc                 S   s   i | ]
\}}d | |qS )_dummy_req_ro   )r   inro   ro   rp   
<dictcomp>'  s    z-GPUModelRunner._dummy_run.<locals>.<dictcomp>T)	dummy_runskip_attn_for_dummy_runF)minr<   sum	enumerater   
make_emptytotal_num_scheduled_tokensnum_scheduled_tokensrm   set_disabledexecute_modelexecute_model_statelogits_indices)rn   r   r   r   r   r   num_tokens_per_requestr   dummy_scheduler_outputhidden_statesr   _sample_hidden_statesro   ro   rp   
_dummy_run  s&   
zGPUModelRunner._dummy_runr   c           	   	   C   s   |j d }| j|}tj|tj| jd}tj|tjd}tj|tj	| jd}tj|tj| jd}tj|tj| jd}| 
||||||| d S )Nr   r;   r7   r;   )shaperx   compute_logitsr]   arangeint32r7   npzerosint64rg   )	rn   r   r   logitsidx_mappingidx_mapping_npposdummy_input_idsexpanded_local_posro   ro   rp   _dummy_sampler_run;  s$   

z!GPUModelRunner._dummy_sampler_runc                 C   sd   | j | jdd\}}| | | jr%t| jj| j}| jj| jd d |d t	j
  ~~t  d S )NTr   )r   r   num_tokens_across_dp)r   r9   r   rc   r   rJ   data_parallel_sizerd   	run_modelr]   r^   synchronizegccollect)rn   r   r   r   ro   ro   rp   profile_runS  s"   



zGPUModelRunner.profile_runc                 C      | j r
| j  d S d S rs   )rW   rX   reset_mm_cacher   ro   ro   rp   r   g     zGPUModelRunner.reset_mm_cachec                 C   r   rs   )rW   rX   reset_encoder_cacher   ro   ro   rp   r   k  r   z"GPUModelRunner.reset_encoder_cacher   c                 C   s   |S rs   ro   )rn   r   ro   ro   rp   _get_num_input_tokenso  rw   z$GPUModelRunner._get_num_input_tokensc           	   
   C   s   | j  std dS t }t  tj	
  tj	 d }| | j4 d }| jr0| jj}d }| jr9| jj}| j j| j| j||| j| j| jd | jrS| j  W d    n1 s]w   Y  t }tj	 d }|| }|| }td||d  |S )NzrSkipping CUDA graph capture. To turn on CUDA graph capture, ensure `cudagraph_mode` was not manually set to `NONE`r   )rx   rf   mrope_positionsinputs_embedsr   r   r   z4Graph capturing finished in %.0f secs, took %.2f GiBi   @)ri   needs_capturer{   warningry   rz   r   r   r]   r^   empty_cachemem_get_infomaybe_setup_dummy_lorasrH   rY   rZ   r   rW   rX   r   capturerx   rf   r   r   r   rc   rd   capture_modelr|   )	rn   
start_timestart_free_gpu_memoryr   r   end_timeend_free_gpu_memoryelapsed_timecuda_graph_sizero   ro   rp   r   s  sN   

	
zGPUModelRunner.capture_modelc                 C   s:   t dd | j D r| j| jdd tj  d S d S )Nc                 s   s    | ]	}d |  v V  qdS )
FLASHINFERN)get_name)r   bro   ro   rp   	<genexpr>  s    z4GPUModelRunner.warmup_for_prefill.<locals>.<genexpr>Fr   )allr   valuesr   r9   r]   r^   r   r   ro   ro   rp   warmup_for_prefill  s   z!GPUModelRunner.warmup_for_prefillscheduler_outputc                 C   s^   |j }|j}|r||}|D ]}| j| | jr | j| | j| | j| qd S rs   )	finished_req_idspreempted_req_idsunionre   remove_requestrW   rX   rh   rk   )rn   r   r   r   req_idro   ro   rp   finish_requests  s   
zGPUModelRunner.finish_requestsc                 C   s&   | j r|jD ]
}| j| qd S d S rs   )rW   free_encoder_mm_hashesrX   free_encoder_cache)rn   r   mm_hashro   ro   rp   free_states  s
   
zGPUModelRunner.free_statesc                 C   s6  |j D ]q}|jd usJ |jd usJ |jd usJ |j}t|j}| jj|||j|jd | jj	| }| j
r?| j||j | jrO| jj|| j|j|jd | jj||jdd | j|||j | j|||j | j|||j q|j r| j  | j| jjj| jjj| jj | jr| j  d S d S d S )N)r   
prompt_lenprefill_token_idsnum_computed_tokens)mm_featuresT	overwrite)scheduled_new_reqsprompt_token_idsr  sampling_paramsr   lenre   add_requestr  req_id_to_indexrW   rX   r  rY   rZ   init_prefill_mrope_positionsrx   r   append_block_ids	block_idsrg   rh   rk   lora_requestapply_staged_writesgpuprefill_lenr   r  )rn   r   new_req_datar   r  	req_indexro   ro   rp   add_requests  sV   


zGPUModelRunner.add_requestsc                 C   sH   |j }t|j|jD ]\}}|d ur!| jj| }| jj||dd q
d S )NFr  )scheduled_cached_reqszipnew_block_idsreq_idsre   r  r   r  )rn   r   reqsreq_new_block_idsr   r  ro   ro   rp   update_requests  s   zGPUModelRunner.update_requestsnum_tokens_after_paddingc           !         s  |j }|dks	J |j}t|}t||jd}t|j|}tj|tj|d}t| j	j
j|}	tj|	tj|d}
t|
| jd}|j  sjd}|}tj|d tjd}tj|d | jtjd}|}tj|tj| jd}nKtj fd	d
|D tjd}t| }|| }|d }tj|d tjd}d|d< tj||dd  d t|| jd}| jd }t||||\}}| j|}tj| jd tjd}d|d< tj||d|d  d |||d d < t|| jjd |d |d  }t|}| jjd |d  }t| jj| j	j ||| j	j!j"| j	j#j"| j	j$j" t%||| j	j$j"| jj&| jj' | jj'd | }| j(r?| j)*||| j	j#j"| j	j$j" t+| jj|| j	j,||| j	j#j"| j	j-||	}| j.||| jj&d | }t/|| j0}t1| j2||||| jj'| j3||| j0d
}| jjd | }| jj&d | }d } | j(r| j)j4} | d d d |f } t5d$i d|d|d|d|
d|d|d|d|d|d|d|d|d|d|d|d| dd d|d|d |d!|d"|d#|j6S )%Nr   )key)r;   count)r7   rA   r   )r7   r;   r   c                    s   g | ]
}t  |d qS )ro   )r  get)r   r   draft_tokensro   rp   r     s    z1GPUModelRunner.prepare_inputs.<locals>.<listcomp>outr   r  r   r   r   expanded_idx_mappingr   r   r   r!  num_draft_tokensr   r   r   	input_ids	positionsr   r   r   r   r   cu_num_logitscu_num_logits_nphas_structured_output_reqsro   )7r   r   r  sortedr$  mapr   fromiterr   re   r  r   r7   scheduled_spec_decode_tokensr   r]   r   arrayintr   emptycumsumr?   r!   r   gather_block_tablesr<   rf   r   r   r%   r+  next_prefill_tokensr  r  r  r  r$   r,  r   rY   rZ   prepare_mrope_positionsr    last_sampled_tokensr&  compute_slot_mappingsr   r   r   r   r=   r   r   has_structured_output_requests)!rn   r   r!  r   num_tokens_per_reqr   r  numtoks_iterr   idx_mapping_iterr   r   total_num_draft_tokenstotal_num_logitsr.  r-  r)  r   r*  
num_logitsmax_expand_lenr   r   r   r   r   r   r   r   r   r+  r,  r   ro   r%  rp   prepare_inputs  s4  


		
zGPUModelRunner.prepare_inputsscheduled_encoder_inputsc              	   C   sb   | j |\}}| j | j|| | j |j|j|j|j| j	j
j|j | j	j|j \}}||fS rs   )rX   prepare_mm_inputsexecute_mm_encoderrx   gather_mm_embeddingsr  r   r   r   re   r  r   r   num_computed_prefill_tokens)rn   rF  r   	mm_hashes	mm_kwargs	mm_embedsis_mm_embedro   ro   rp   get_mm_embeddings  s   z GPUModelRunner.get_mm_embeddingsgrammar_outputc              	   C   s   ||j  }|j|j  }|j|j  }| j|}|d ur&| j|||j|j | 	||j
|j|j|||j}|jdkrFtj|jtj| jd}	nt|j||j| j\}
}	|
|_t|	|j|j|j| jjj\}	}||	|fS )Nr   r   )r   r,  r+  rx   r   rj   apply_grammar_bitmaskstructured_output_request_idsgrammar_bitmaskrg   r)  r   r.  r   r*  r]   onesr   r   r7   r0   sampled_token_idsr-  r?   r"   r   r   re   r  r  )rn   r   r   rP  r   
sample_posr+  r   sampler_outputnum_sampledsampled_tokensnum_rejectedro   ro   rp   sample  sN   



zGPUModelRunner.samplerY  rX  rZ  c              	   C   sd   t |j| jjj| jj| jjj||||j	 |j
}| jj}||  |j7  < tj|| jjj|d d S )Nr'  )r#   r   re   r  r  r;  rg   penalties_stateoutput_bin_countsr   r   rJ  r   r   minimumr  )rn   r   rY  rX  rZ  r   computed_prefillro   ro   rp   postprocess  s    
zGPUModelRunner.postprocesslast_hidden_statesaux_hidden_statesc                 C   sF   | j d usJ | j |||||| jj| jj| jjjj| jjj	j	}|S rs   )
rd   proposere   r;  r9  rg   sampling_statestemperaturer  seeds)rn   r   ra  rb  rX  rZ  r&  ro   ro   rp   propose_draft  s   	

zGPUModelRunner.propose_draftFintermediate_tensorsr   r   c                 C   s*  |d u sJ |s.|  | | | | | | | | j  |jdkr.| j|}|S | j	
|j|j }t|j|| jj| jj\}}}	|dkrT| j|}|S |s| ||}
| jro| j|
j|
j|
j}| j|  | jr| |j|
\}}| j| j|
j||}|d |
j |
_ n(t!|| j"}t#j$||| j%| j&d}
| j'r| j(j)d d d |f |
_)|s| *|
 |r| j+| | j	,|
j}n@|
j-}| j'r|
j)d usJ |
j)}t.|
j/| j0|
jt1j2|	|
j3d | j+| | j|
j||
j d}W d    n	1 sw   Y  | j4|}||
|f| _5d S )Nr   )r   r   rf   r7   )r   cudagraph_runtime_moder   slot_mapping)r+  r,  r   )6r   r  r  r   r   r  r   rm   
no_forwardri   get_cudagraph_sizer   r   r   rJ   r   data_parallel_rankrE  rH   rk   make_lora_inputsr  r   _set_active_lorasrW   rO  rF  rX   get_inputs_embedsrx   r+  r!  r   r   r<   r   
make_dummyrf   r7   rY   rZ   r   r   pre_forwardrunr,  r   r   r6   r   NONEr   post_forwardr   )rn   r   rh  r   r   empty_outputcudagraph_sizeuse_cudagraphr!  r   r   lora_inputsrM  rN  r   r   r   r,  kv_connector_outputro   ro   rp   r     s   







	zGPUModelRunner.execute_modelc              
   C   s   | j d usJ | j \}}}d | _ | |||\}}}| j| jj||| jjj| jj	j| jj
| jjj| jj}t|jdd t|jD d ||d}	t|	||| j| jd}
| ||j|| | jrv| ||d ||}|| jj|j< | j|| | jr{|
S |
 S )Nc                 S   s   i | ]\}}||qS ro   ro   )r   r   r   ro   ro   rp   r     s    z0GPUModelRunner.sample_tokens.<locals>.<dictcomp>)r  r  rU  prompt_logprobs_dictrz  )model_runner_outputrW  num_sampled_tokenscopy_stream
copy_event)r   r[  rh   compute_prompt_logprobsrx   r   re   r  r  r  r  r  r   rJ  r   r  r   r   r`   rb   r`  rU  rc   rg  r&  r   rl   set_draft_tokensr\   
get_output)rn   rP  r   r   rz  rW  rX  rZ  r{  r|  async_outputr&  ro   ro   rp   sample_tokens  s\   
	
zGPUModelRunner.sample_tokensc                 C   s
   | j  S rs   )rl   get_draft_tokensr   ro   ro   rp   take_draft_token_ids  r   z#GPUModelRunner.take_draft_token_ids)rr   N)NFF)6__name__
__module____qualname__r   r]   r7   rq   r5  rt   staticmethodtuplestrrv   r}   nnModuler   r   r   r   r   r   inference_modeboolTensorr   r   r   r   r   r   r   r   r   r   r  r  r   rE  dictlistrO  r   r,   r[  r`  rg  r   r   r   r   r  r   r  ro   ro   ro   rp   r5   F   s    
k
(


,0

 )
8
t>r5   )_r   ry   copyr   typingr   numpyr   r]   torch.nnr  vllm.configr   vllm.config.compilationr   vllm.distributed.parallel_stater   vllm.forward_contextr   vllm.loggerr    vllm.model_executor.model_loaderr	   vllm.multimodalr
   vllm.utils.mem_utilsr   r   vllm.utils.torch_utilsr   vllm.v1.core.sched.outputr   r   vllm.v1.kv_cache_interfacer   vllm.v1.outputsr   r   vllm.v1.worker.gpu.async_utilsr   vllm.v1.worker.gpu.attn_utilsr   r   r   r   r   vllm.v1.worker.gpu.block_tabler   vllm.v1.worker.gpu.buffer_utilsr   "vllm.v1.worker.gpu.cudagraph_utilsr   vllm.v1.worker.gpu.dp_utilsr   r   vllm.v1.worker.gpu.input_batchr   r   r    r!   r"   r#   r$   r%   vllm.v1.worker.gpu.kv_connectorr&   r'   r(   vllm.v1.worker.gpu.lora_utilsr)   $vllm.v1.worker.gpu.mm.encoder_runnerr*   !vllm.v1.worker.gpu.mm.mrope_utilsr+    vllm.v1.worker.gpu.sample.outputr,   (vllm.v1.worker.gpu.sample.prompt_logprobr-   !vllm.v1.worker.gpu.sample.samplerr.   vllm.v1.worker.gpu.spec_decoder/   /vllm.v1.worker.gpu.spec_decode.rejection_sampler0   $vllm.v1.worker.gpu.spec_decode.utilsr1   vllm.v1.worker.gpu.statesr2   %vllm.v1.worker.gpu.structured_outputsr3   &vllm.v1.worker.lora_model_runner_mixinr4   r  r{   r5   ro   ro   ro   rp   <module>   sR   (
