o
    
۾i                     @   s  d dl mZmZ d dlmZmZmZ d dlmZm	Z	 d dl
mZmZ d dlmZmZ d dlmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( erzd dl%m)Z) G dd deZ*dS )    )ABCabstractmethod)AsyncGeneratorIterableMapping)TYPE_CHECKINGAny)ModelConfig
VllmConfig)WeightTransferInitRequestWeightTransferUpdateRequest)
PromptTypeStreamingInput)LoRARequest)PoolingRequestOutputRequestOutput)IOProcessor)PoolingParams)BaseRenderer)
DictPrompt	TokPrompt)SamplingParams)SupportedTask)EngineCoreRequest)InputProcessor)	PauseModec                   @   sZ  e Zd ZU dZeed< eed< eed< edB ed< e	e
defdd	Ze	e
defd
dZe	e
defddZe	e
defddZe	e
defddZe
ddddddddeeB eB eB eedf B dedededB dedB deeef dB deeef dB dededB dee df fddZ!e
				dddeeB eB de"dededB deeef dB dedeeef dB dee#df fd d!Z$e
dee%e B ddfd"d#Z&e
defd$d%Z'e
ded&d'Z(e
ded(d)Z)e
ded*d+Z*e
ded,d-Z+e
ded.d/Z,e
ded0d1Z-e
	2dfd3ed4edefd5d6Z.e
dgd8eddfd9d:Z/e
dhd;e0e dB ddfd<d=Z1e
defd>d?Z2e
dedefd@dAZ3e
dBd2dCdDdEdFdGedHeddfdIdJZ4e
dedKdLZ5e
defdMdNZ6	OdidPedQeddfdRdSZ7		T	djdUedVe8dB dWe9dXedB fdYdZZ:de9e;d[f fd\d]Z<d^e=ddfd_d`Z>dae?ddfdbdcZ@dS )kEngineClientz$Protocol class for Clients to Enginevllm_configmodel_configinput_processorNio_processorreturnc                 C      d S N selfr$   r$   H/home/ubuntu/.local/lib/python3.10/site-packages/vllm/engine/protocol.pyrenderer%      zEngineClient.rendererc                 C   r"   r#   r$   r%   r$   r$   r'   
is_running)   r)   zEngineClient.is_runningc                 C   r"   r#   r$   r%   r$   r$   r'   
is_stopped-   r)   zEngineClient.is_stoppedc                 C   r"   r#   r$   r%   r$   r$   r'   errored1   r)   zEngineClient.erroredc                 C   r"   r#   r$   r%   r$   r$   r'   
dead_error5   r)   zEngineClient.dead_errorr   )prompt_textlora_requesttokenization_kwargstrace_headersprioritydata_parallel_rankpromptsampling_params
request_idr.   r/   r0   r1   r2   r3   c          
      C      dS )zGenerate outputs for a request.Nr$   )
r&   r4   r5   r6   r.   r/   r0   r1   r2   r3   r$   r$   r'   generate9   s   zEngineClient.generatepooling_paramsc                 C   r7   )z4Generate outputs for a request from a pooling model.Nr$   )r&   r4   r9   r6   r/   r1   r2   r0   r$   r$   r'   encodeN   s   zEngineClient.encodec                       dS )zAbort a request.

        Args:
            request_id: The unique id of the request,
                        or an iterable of such ids.
        Nr$   )r&   r6   r$   r$   r'   abort\      zEngineClient.abortc                       d S r#   r$   r%   r$   r$   r'   is_tracing_enabledf      zEngineClient.is_tracing_enabledc                    r>   r#   r$   r%   r$   r$   r'   do_log_statsi   r@   zEngineClient.do_log_statsc                    r;   )zRaise if unhealthyNr$   r%   r$   r$   r'   check_healthl      zEngineClient.check_healthc                    r;   )zStart profiling the engineNr$   r%   r$   r$   r'   start_profileq   rC   zEngineClient.start_profilec                    r;   )zStop profiling the engineNr$   r%   r$   r$   r'   stop_profilev   rC   zEngineClient.stop_profilec                    r;   )zReset the multi-modal cacheNr$   r%   r$   r$   r'   reset_mm_cache{   rC   zEngineClient.reset_mm_cachec                    r;   )zReset the encoder cacheNr$   r%   r$   r$   r'   reset_encoder_cache   rC   z EngineClient.reset_encoder_cacheFreset_running_requestsreset_connectorc                    r;   )zDReset the prefix cache and optionally any configured connector cacheNr$   )r&   rH   rI   r$   r$   r'   reset_prefix_cache   s   zEngineClient.reset_prefix_cache   levelc                    r;   )zSleep the engineNr$   )r&   rL   r$   r$   r'   sleep   rC   zEngineClient.sleeptagsc                    r;   )zWake up the engineNr$   )r&   rN   r$   r$   r'   wake_up   rC   zEngineClient.wake_upc                    r;   )z$Check whether the engine is sleepingNr$   r%   r$   r$   r'   is_sleeping   rC   zEngineClient.is_sleepingc                    r;   )z<Load a new LoRA adapter into the engine for future requests.Nr$   )r&   r/   r$   r$   r'   add_lora   rC   zEngineClient.add_lorar<   T)modewait_for_inflight_requestsclear_cacherR   r   rS   rT   c                   r;   )a~  Pause new generation/encoding requests.

        Args:
            mode: How to handle in-flight requests:
                - ``"abort"``: Abort all in-flight requests immediately
                  and return partial results with "abort" reason (default).
                - ``"wait"``: Wait for in-flight requests to complete.
                - ``"keep"``: Freeze requests in queue; they resume on
                  :meth:`resume_generation`.
            wait_for_inflight_requests: DEPRECATED. Use ``mode="wait"`` instead.
            clear_cache: DEPRECATED. Whether to clear KV and prefix caches
                after draining.
        Nr$   )r&   rR   rS   rT   r$   r$   r'   pause_generation   s   zEngineClient.pause_generationc                    r;   )z.Resume accepting generation/encoding requests.Nr$   r%   r$   r$   r'   resume_generation   rC   zEngineClient.resume_generationc                    r;   )z.Return whether the engine is currently paused.Nr$   r%   r$   r$   r'   	is_paused   rC   zEngineClient.is_paused,  new_data_parallel_sizedrain_timeoutc                       t )zScale the engineNotImplementedError)r&   rY   rZ   r$   r$   r'   scale_elastic_ep      zEngineClient.scale_elastic_epr$   methodtimeoutargskwargsc                    r[   )z0Perform a collective RPC call to the given path.r\   )r&   r`   ra   rb   rc   r$   r$   r'   collective_rpc   r=   zEngineClient.collective_rpc.c                    r[   )zGet supported tasksr\   r%   r$   r$   r'   get_supported_tasks      z EngineClient.get_supported_tasksinit_requestc                    r[   )z+Initialize weight transfer for RL training.r\   )r&   rg   r$   r$   r'   init_weight_transfer_engine   r_   z(EngineClient.init_weight_transfer_enginerequestc                    r[   )z&Batched weight update for RL training.r\   )r&   ri   r$   r$   r'   update_weights   rf   zEngineClient.update_weights)NNr   N)r!   N)FF)rK   r#   )rX   )Nr$   N)A__name__
__module____qualname____doc__r
   __annotations__r	   r   r   propertyr   r   r(   boolr*   r+   r,   BaseExceptionr-   r   r   r   r   r   r   r   strr   dictr   r   intr   r8   r   r   r:   r   r<   r?   rA   rB   rD   rE   rF   rG   rJ   rM   listrO   rP   rQ   rU   rV   rW   r^   floattuplerd   r   re   r   rh   r   rj   r$   r$   r$   r'   r      s8  
 




		
	


r   N)+abcr   r   collections.abcr   r   r   typingr   r   vllm.configr	   r
   %vllm.distributed.weight_transfer.baser   r   vllm.inputs.datar   r   vllm.lora.requestr   vllm.outputsr   r   vllm.plugins.io_processorsr   vllm.pooling_paramsr   vllm.renderersr   vllm.renderers.inputsr   r   vllm.sampling_paramsr   
vllm.tasksr   vllm.v1.enginer   vllm.v1.engine.input_processorr   r   r   r$   r$   r$   r'   <module>   s&   