o
    ie*                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZmZmZmZ d d	lmZmZ d d
lmZ d dlmZmZmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) e!e*Z+G dd deZ,dS )    N)AsyncGenerator)Sequence)Request)EngineClient)RequestLogger)ChatCompletionLogProbChatCompletionLogProbsChatCompletionLogProbsContent)ErrorResponsePromptTokenUsageInfoRequestResponseMetadata	UsageInfo)OpenAIServingclamp_prompt_logprobs)OpenAIServingModels)GenerateRequestGenerateResponseGenerateResponseChoice)TokensPrompt)init_logger)Logprob)RequestOutput)SamplingParams)as_listc                       s   e Zd ZdZdddddddedededB ded	ed
ededef fddZ	dde	de
dB deeB fddZde	deedf dedededeeB fddZ	ddee deeeef dB  dedB defddZ  ZS ) ServingTokensz;Provides Tokens IN <> Tokens OUT functionality to vLLM API.F)force_no_detokenizereturn_tokens_as_token_idslog_error_stackenable_prompt_tokens_detailsenable_log_outputsengine_clientmodelsrequest_loggerNr   r   r   r   r   c          	         s>   t  j|||||d || _|| _|| _|rtd d S d S )N)r    r!   r"   r   r   zPTokens-only mode is enabled, skipping detokenization step for incoming requests.)super__init__r   r   r   loggerinfo)	selfr    r!   r"   r   r   r   r   r   	__class__ [/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/entrypoints/serve/disagg/serving.pyr$   -   s   zServingTokens.__init__requestraw_requestreturnc              
      s  |  |I d H }|d urtd| |S | jjr| jjd }| j|dd}| j|}d| 	||j
 }t|d}|rA||j_| j||jd dI d H }t|dksUJ |d }	d }
z?|j}| jred	|_| j|t|jd
||d |d u rxd n| |jI d H }|| j}| }| jj|	||||||jd}
W n ty } z| t|W  Y d }~S d }~ww z|
d usJ |  ||
|||I d H W S  ty } z| t|W  Y d }~S d }~ww )NzError with model %sT)supports_default_mm_loraszgenerate-tokens-)
request_id)prompt_inputprompt_embeds   r   F)prompt_token_ids)paramslora_request)r6   tokenization_kwargstrace_headerspriority)!_check_modelr%   errorr    errored
dead_error_maybe_get_adaptersr!   
model_name_base_request_idr0   r   staterequest_metadata_preprocess_completion	token_idslensampling_paramsr   
detokenize_log_inputsr   _get_trace_headersheadersbuild_tok_paramsmodel_configget_encode_kwargsgenerater9   
ValueErrorcreate_error_responsestrserve_tokens_full_generator)r'   r,   r-   error_check_retr6   r?   r0   rB   engine_promptsengine_promptresult_generatorrF   r8   
tok_paramsr7   er*   r*   r+   serve_tokensI   sz   

	


zServingTokens.serve_tokensrV   r0   r?   rB   c              
      s  t t }d }|j}z|2 z3 d H W }	|	}q6 W n' tjy)   | d Y S  tyA }
 z| t|
W  Y d }
~
S d }
~
ww |d usHJ g }d}|jD ]?}|j	}|j
}|j
d urn|d usdJ d| j|||j
d}nd }t|j||jrz|jndt|j	d}|| |t|j	7 }qO|jd usJ t|j}|jd ur|t|j7 }t|||| d}| jr|jrt|jd|_||_t|||||t|j|jd	}| jr| jr|D ]"}d }|jt|jk r|j|j j	}|r| jj |d
||jddd q|S )NzClient disconnectedr   zDid not output logprobs)rD   top_logprobsnum_output_top_logprobsstop)indexlogprobsfinish_reasonrD   )prompt_tokenscompletion_tokenstotal_tokens)cached_tokens)idcreatedmodelchoicesusageprompt_logprobskv_transfer_params F)r0   outputsoutput_token_idsr_   is_streamingdelta)!inttimerF   asyncioCancelledErrorrP   rO   rQ   rl   rD   r^   _create_tokens_logprobsr   r]   r_   r   appendrE   r4   encoder_prompt_token_idsr   r   num_cached_tokensr   prompt_tokens_detailsfinal_usage_infor   r   ri   rj   r   r"   log_outputs)r'   r,   rV   r0   r?   rB   created_time	final_resrF   resrX   rg   num_generated_tokensoutputrD   out_logprobsr^   choice_datanum_prompt_tokensrh   responsechoicerm   r*   r*   r+   rR      s   




	z)ServingTokens.serve_tokens_full_generatorrD   rZ   r[   c           	   
      s   g }t |D ]?\}}d| || }|du s||du r'|td q|| }|tt|jd fddt | D d qt|dS )	zCreate OpenAI-style logprobs.z	token_id:N)token    c                    s>   g | ]\}} d ur|t  dk rtt |d jddqS )Nr3   r   )r   logprob)maxr   r   ).0ipr[   r   r*   r+   
<listcomp>  s    z9ServingTokens._create_tokens_logprobs.<locals>.<listcomp>)r   r   rZ   )content)	enumerategetru   r	   r   r   itemsr   )	r'   rD   rZ   r[   logprobs_contentr   token_idstep_top_logprobs
step_tokenr*   r   r+   rt      s*   



z%ServingTokens._create_tokens_logprobs)N)__name__
__module____qualname____doc__r   r   r   boolr$   r   r   r   r
   rY   r   r   rQ   r   rR   GenericSequencerp   dictr   r   rt   __classcell__r*   r*   r(   r+   r   *   sn    	

U

fr   )-rr   rq   collections.abcr   r   r   fastapir   vllm.engine.protocolr   vllm.entrypoints.loggerr   0vllm.entrypoints.openai.chat_completion.protocolr   r   r	   'vllm.entrypoints.openai.engine.protocolr
   r   r   r   &vllm.entrypoints.openai.engine.servingr   r   &vllm.entrypoints.openai.models.servingr   &vllm.entrypoints.serve.disagg.protocolr   r   r   vllm.inputs.datar   vllm.loggerr   vllm.logprobsr   vllm.outputsr   vllm.sampling_paramsr   vllm.utils.collection_utilsr   r   r%   r   r*   r*   r*   r+   <module>   s(   