o
    
۾i                     @   s   d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ e
eZG d
d deZedd ZedefddZdS )    )contextmanager)AnyN)
VllmConfig)init_logger)	get_model)
instrument)CpuGpuBuffer)GPUModelRunnerc                       s   e Zd Zdedejf fddZdddZed	d
dde	ddfddZ
dejfddZedd
dddZdddZdddZdedeeejdB f fddZ  ZS )CPUModelRunnervllm_configdevicec                    sp   t   t || W d    n1 sw   Y  |tdks#J | jd u s,J dd| _d| _|   d S )Ncpuzspec decode is not supported.F)	_torch_cuda_wrappersuper__init__torchr   speculative_configuse_cuda_graphcascade_attn_enabled_postprocess_tensors)selfr   r   	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/cpu_model_runner.pyr      s   zCPUModelRunner.__init__returnNc                 C   s   dt dtdd fdd}t|  D ]}t|tr|j|_qt| j	 D ]\}}|
dr>t|tjr>|| j||d d  q$| jjjD ]}t| D ]}t|trW|j|_qLqDd S )Nobjcpu_attr_namer   c                 S   s`   t | |d }t | |d }|d ur,|d ur.t|tjsJ t|tjs$J t| || d S d S d S N)getattr
isinstancer   Tensorsetattr)r   r   device_attr_name
cpu_tensordevice_tensorr   r   r   replace_tensor"   s   z;CPUModelRunner._postprocess_tensors.<locals>.replace_tensor_cpu_tensori)r   strvarsvaluesr    r   r   gpuinput_batchitemsendswithr   r!   block_tableblock_tables)r   r&   vkr/   r   r   r   r       s    

z#CPUModelRunner._postprocess_tensorszLoading (CPU))	span_nameFeep_scale_upc                 C   sB   t d| jj t| jd| _| jr| | j| j| j| _d S d S )NzStarting to load model %s...)r   )	loggerinfomodel_configmodelr   r   lora_configload_lora_modelr   )r   r4   r   r   r   
load_model7   s
   zCPUModelRunner.load_modelc                 C   s   | j S r   )r8   r   r   r   r   r   ?   s   zCPUModelRunner.get_modelzWarmup (CPU)c                 C   s^   t d t| j | ttd| j| jj	 W d    n1 s#w   Y  t d d S )Nz'Warming up model for the compilation...   zWarming up done.)
r5   r6    _set_global_compilation_settingsr   
_dummy_runminmaxmax_num_reqsscheduler_configmax_num_batched_tokensr<   r   r   r   warming_up_modelB   s   

zCPUModelRunner.warming_up_modelc                 C      d S r   r   r<   r   r   r   _init_device_propertiesP      z&CPUModelRunner._init_device_propertiesc                 C   rF   r   r   r<   r   r   r   _sync_deviceS   rH   zCPUModelRunner._sync_device
num_tokensc                 C   s   dS )N)r   Nr   )r   rJ   r   r   r   get_dp_paddingV   s   zCPUModelRunner.get_dp_paddingr   N)F)__name__
__module____qualname__r   r   r   r   r   r   boolr;   nnModuler   rE   rG   rI   inttupler!   rK   __classcell__r   r   r   r   r
      s    


(r
   c                  c   sf    G dd d} G dd d}t j}t jj}z| t _|t j_d V  W |t _|t j_d S |t _|t j_w )Nc                   @      e Zd ZdddZdS )z._torch_cuda_wrapper.<locals>._EventPlaceholderr   Nc                 _   s   dd | _ dd | _d S )Nc                   S   rF   r   r   r   r   r   r   <lambda>_       zI_torch_cuda_wrapper.<locals>._EventPlaceholder.__init__.<locals>.<lambda>c                   S   rF   r   r   r   r   r   r   rW   `   rX   )recordsynchronizer   argskwargsr   r   r   r   ^   s   
z7_torch_cuda_wrapper.<locals>._EventPlaceholder.__init__rL   rM   rN   rO   r   r   r   r   r   _EventPlaceholder]       r_   c                   @   rV   )z/_torch_cuda_wrapper.<locals>._StreamPlaceholderr   Nc                 _   rF   r   r   r[   r   r   r   r   c   rH   z8_torch_cuda_wrapper.<locals>._StreamPlaceholder.__init__rL   r^   r   r   r   r   _StreamPlaceholderb   r`   ra   )r   EventcudaStream)r_   ra   
cuda_eventcuda_streamr   r   r   r   [   s   
r   configc                 c   sP    dd l m  m} | jj}|j}z|ddrd|_d V  W ||_d S ||_w )Nr   max_autotuneFT)torch._inductor.config	_inductorrg   compilation_configinductor_compile_configfreezingget)rg   torch_inductor_configinductor_configfreezing_valuer   r   r   r>   q   s   r>   )
contextlibr   typingr   r   torch.nnrQ   vllm.configr   vllm.loggerr    vllm.model_executor.model_loaderr   vllm.tracingr   vllm.v1.utilsr   vllm.v1.worker.gpu_model_runnerr	   rM   r5   r
   r   r>   r   r   r   r   <module>   s    H
