o
    -i)                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZmZmZmZ d d	lmZmZ d d
lmZ d dlmZmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) e!e*Z+G dd deZ,dS )    N)AsyncGenerator)Sequence)Request)EngineClient)RequestLogger)ChatCompletionLogProbChatCompletionLogProbsChatCompletionLogProbsContent)ErrorResponsePromptTokenUsageInfoRequestResponseMetadata	UsageInfo)OpenAIServingclamp_prompt_logprobs)OpenAIServingModels)GenerateRequestGenerateResponseGenerateResponseChoice)TokensPrompt)init_logger)Logprob)RequestOutput)SamplingParams)as_listc                       s   e Zd ZdZdddddddedededB ded	ed
ededef fddZ	dde	de
dB deeB fddZde	deedf dedededeeB fddZ	ddee deeeef dB  dedB defddZ  ZS ) ServingTokensz;Provides Tokens IN <> Tokens OUT functionality to vLLM API.F)force_no_detokenizereturn_tokens_as_token_idslog_error_stackenable_prompt_tokens_detailsenable_log_outputsengine_clientmodelsrequest_loggerNr   r   r   r   r   c          	         s>   t  j|||||d || _|| _|| _|rtd d S d S )N)r    r!   r"   r   r   zPTokens-only mode is enabled, skipping detokenization step for incoming requests.)super__init__r   r   r   loggerinfo)	selfr    r!   r"   r   r   r   r   r   	__class__ b/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/entrypoints/serve/disagg/serving.pyr$   -   s   zServingTokens.__init__requestraw_requestreturnc              
      s  |  |I d H }|d urtd| |S | jjr| jjd }| j|dd}| j|}d| 	||j
 }t|d}|rA||j_t|jd}|jd urPd |d< t|dr_|jd ur_|j|d< d }	z4|j}
| jrkd	|
_| j|t|jd|
|d
 |d u r~d n| |jI d H }| jj||
||||jd}	W n ty } z| t|W  Y d }~S d }~ww z|	d usJ | ||	|||I d H W S  ty } z| t|W  Y d }~S d }~ww )NzError with model %sT)supports_default_mm_loraszgenerate-tokens-)
request_id)prompt_token_idsmulti_modal_data
cache_saltF)paramslora_request)r5   trace_headerspriority)_check_modelr%   errorr    errored
dead_error_maybe_get_adaptersr!   
model_name_base_request_idr0   r   staterequest_metadatar   	token_idsfeatureshasattrr3   sampling_paramsr   
detokenize_log_inputs_get_trace_headersheadersgenerater7   
ValueErrorcreate_error_responsestrserve_tokens_full_generator)r'   r,   r-   error_check_retr5   r=   r0   r@   engine_promptresult_generatorrD   r6   er*   r*   r+   serve_tokensI   sp   



	
	
zServingTokens.serve_tokensrP   r0   r=   r@   c              
      s  t t }d }|j}z|2 z3 d H W }	|	}q6 W n' tjy)   | d Y S  tyA }
 z| t|
W  Y d }
~
S d }
~
ww |d usHJ g }d}|jD ]=}|j	}|j
}|j
rl|d usbJ d| j|||j
d}nd }t|j||jrx|jndt|j	d}|| |t|j	7 }qO|jd usJ t|j}|jd ur|t|j7 }t|||| d}| jr|jrt|jd|_||_t|||||t|j|jd	}| jr| jr|D ]"}d }|jt|jk r|j|j j	}|r| jj |d
||jddd q|S )NzClient disconnectedr   zDid not output logprobs)rA   top_logprobsnum_output_top_logprobsstop)indexlogprobsfinish_reasonrA   )prompt_tokenscompletion_tokenstotal_tokens)cached_tokens)idcreatedmodelchoicesusageprompt_logprobskv_transfer_params F)r0   outputsoutput_token_idsrX   is_streamingdelta)!inttimerD   asyncioCancelledErrorrK   rJ   rL   re   rA   rW   _create_tokens_logprobsr   rV   rX   r   appendlenr1   encoder_prompt_token_idsr   r   num_cached_tokensr   prompt_tokens_detailsfinal_usage_infor   r   rb   rc   r   r"   log_outputs)r'   r,   rP   r0   r=   r@   created_time	final_resrD   resrQ   r`   num_generated_tokensoutputrA   out_logprobsrW   choice_datanum_prompt_tokensra   responsechoicerf   r*   r*   r+   rM      s   



	z)ServingTokens.serve_tokens_full_generatorrA   rS   rT   c           	   
      s   g }t |D ]?\}}d| || }|du s||du r'|td q|| }|tt|jd fddt | D d qt|dS )	zCreate OpenAI-style logprobs.z	token_id:N)token    c                    s4   g | ]\}} r| k rt t|d  jddqS )   r   )r   logprob)r   maxr   ).0iprT   r   r*   r+   
<listcomp>  s    z9ServingTokens._create_tokens_logprobs.<locals>.<listcomp>)r   r   rS   )content)	enumerategetrn   r	   r   r   itemsr   )	r'   rA   rS   rT   logprobs_contentr   token_idstep_top_logprobs
step_tokenr*   r   r+   rm      s*   



z%ServingTokens._create_tokens_logprobs)N)__name__
__module____qualname____doc__r   r   r   boolr$   r   r   r   r
   rR   r   r   rL   r   rM   GenericSequenceri   dictr   r   rm   __classcell__r*   r*   r(   r+   r   *   sn    	

P

fr   )-rk   rj   collections.abcr   r   r   fastapir   vllm.engine.protocolr   vllm.entrypoints.loggerr   0vllm.entrypoints.openai.chat_completion.protocolr   r   r	   'vllm.entrypoints.openai.engine.protocolr
   r   r   r   &vllm.entrypoints.openai.engine.servingr   r   &vllm.entrypoints.openai.models.servingr   &vllm.entrypoints.serve.disagg.protocolr   r   r   vllm.inputs.datar   vllm.loggerr   vllm.logprobsr   vllm.outputsr   vllm.sampling_paramsr   vllm.utils.collection_utilsr   r   r%   r   r*   r*   r*   r+   <module>   s(   