o
    -i                     @   s   d dl mZmZ d dlmZmZmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z  G dd deZ!dS )    )ABCabstractmethod)AsyncGeneratorIterableMapping)Any)ModelConfig
VllmConfig)
PromptType)LoRARequest)PoolingRequestOutputRequestOutput)IOProcessor)PoolingParams)RendererLike)SamplingParams)SupportedTask)EngineCoreRequest)InputProcessorc                   @   s  e Zd ZU dZeed< eed< eed< edB ed< e	e
defdd	Ze	e
defd
dZe	e
defddZe	e
defddZe	e
defddZe
ddddddddeeB dedededB dedB deeef dB deeef dB dededB deedf fddZe
					dZdededededB deeef dB ded edB deeef dB dee df fd!d"Z!e
dee"e B ddfd#d$Z#e
defd%d&Z$e
d[d'd(Z%e
d[d)d*Z&e
d[d+d,Z'e
d[d-d.Z(e
d[d/d0Z)e
	1d\d2ed3edefd4d5Z*e
d]d7eddfd8d9Z+e
d^d:e,e dB ddfd;d<Z-e
defd=d>Z.e
dedefd?d@Z/e
d1dAdBdCedDeddfdEdFZ0e
d[dGdHZ1e
defdIdJZ2	Kd_dLedMeddfdNdOZ3		P	d`dQedRe4dB dSe5dTedB fdUdVZ6de5e7dWf fdXdYZ8dS )aEngineClientz$Protocol class for Clients to Enginevllm_configmodel_configinput_processorNio_processorreturnc                 C      d S N selfr   r   Q/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/engine/protocol.pyrenderer      zEngineClient.rendererc                 C   r   r   r   r   r   r   r    
is_running!   r"   zEngineClient.is_runningc                 C   r   r   r   r   r   r   r    
is_stopped%   r"   zEngineClient.is_stoppedc                 C   r   r   r   r   r   r   r    errored)   r"   zEngineClient.erroredc                 C   r   r   r   r   r   r   r    
dead_error-   r"   zEngineClient.dead_errorr   )prompt_textlora_requesttokenization_kwargstrace_headersprioritydata_parallel_rankpromptsampling_params
request_idr'   r(   r)   r*   r+   r,   c          
      C      dS )zGenerate outputs for a request.Nr   )
r   r-   r.   r/   r'   r(   r)   r*   r+   r,   r   r   r    generate1   s   zEngineClient.generatepooling_paramstruncate_prompt_tokensc	           	      C   r0   )zGenerate outputs for a request from a pooling model.

        NOTE: truncate_prompt_tokens is deprecated in v0.14.
        TODO: Remove this argument in v0.15.
        Nr   )	r   r-   r2   r/   r(   r*   r+   r3   r)   r   r   r    encodeB   s   zEngineClient.encodec                       dS )zAbort a request.

        Args:
            request_id: The unique id of the request,
                        or an iterable of such ids.
        Nr   )r   r/   r   r   r    abortU      zEngineClient.abortc                       d S r   r   r   r   r   r    is_tracing_enabled_      zEngineClient.is_tracing_enabledc                    r8   r   r   r   r   r   r    do_log_statsb   r:   zEngineClient.do_log_statsc                    r5   )zRaise if unhealthyNr   r   r   r   r    check_healthe      zEngineClient.check_healthc                    r5   )zStart profiling the engineNr   r   r   r   r    start_profilej   r=   zEngineClient.start_profilec                    r5   )zStop profiling the engineNr   r   r   r   r    stop_profileo   r=   zEngineClient.stop_profilec                    r5   )zReset the multi-modal cacheNr   r   r   r   r    reset_mm_cachet   r=   zEngineClient.reset_mm_cacheFreset_running_requestsreset_connectorc                    r5   )zDReset the prefix cache and optionally any configured connector cacheNr   )r   rA   rB   r   r   r    reset_prefix_cachey   s   zEngineClient.reset_prefix_cache   levelc                    r5   )zSleep the engineNr   )r   rE   r   r   r    sleep   r=   zEngineClient.sleeptagsc                    r5   )zWake up the engineNr   )r   rG   r   r   r    wake_up   r=   zEngineClient.wake_upc                    r5   )z$Check whether the engine is sleepingNr   r   r   r   r    is_sleeping   r=   zEngineClient.is_sleepingc                    r5   )z<Load a new LoRA adapter into the engine for future requests.Nr   )r   r(   r   r   r    add_lora   r=   zEngineClient.add_loraT)wait_for_inflight_requestsclear_cacherK   rL   c                   r5   )a\  Pause new generation/encoding requests.

        Args:
            wait_for_inflight_requests: When ``True`` waits for in-flight requests
                to finish before pausing. When ``False`` (default), aborts in-flight
                requests immediately.
            clear_cache: Whether to clear KV and prefix caches after draining.
        Nr   )r   rK   rL   r   r   r    pause_generation   s   zEngineClient.pause_generationc                    r5   )z.Resume accepting generation/encoding requests.Nr   r   r   r   r    resume_generation   r=   zEngineClient.resume_generationc                    r5   )z.Return whether the engine is currently paused.Nr   r   r   r   r    	is_paused   r=   zEngineClient.is_paused,  new_data_parallel_sizedrain_timeoutc                       t )zScale the engineNotImplementedError)r   rQ   rR   r   r   r    scale_elastic_ep   s   zEngineClient.scale_elastic_epr   methodtimeoutargskwargsc                    rS   )z0Perform a collective RPC call to the given path.rT   )r   rW   rX   rY   rZ   r   r   r    collective_rpc   r7   zEngineClient.collective_rpc.c                    rS   )zGet supported tasksrT   r   r   r   r    get_supported_tasks   s   z EngineClient.get_supported_tasks)NNr   NN)r   N)FF)rD   r   )rP   )Nr   N)9__name__
__module____qualname____doc__r	   __annotations__r   r   r   propertyr   r   r!   boolr#   r$   r%   BaseExceptionr&   r   r
   r   strr   dictr   r   intr   r   r1   r   r   r4   r   r6   r9   r;   r<   r>   r?   r@   rC   rF   listrH   rI   rJ   rM   rN   rO   rV   floattupler[   r   r\   r   r   r   r    r      s  
 	

	

	
	

r   N)"abcr   r   collections.abcr   r   r   typingr   vllm.configr   r	   vllm.inputs.datar
   vllm.lora.requestr   vllm.outputsr   r   vllm.plugins.io_processorsr   vllm.pooling_paramsr   vllm.renderersr   vllm.sampling_paramsr   
vllm.tasksr   vllm.v1.enginer   vllm.v1.engine.input_processorr   r   r   r   r   r    <module>   s   