o
    iRn                     @   sd  d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZ d d	lmZmZmZmZ d d
lmZmZmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 e(e:Z;G dd deZ<dS )    N)AsyncGeneratorAsyncIterator)Sequence)cast)Request)EngineClient)RequestLogger)CompletionLogProbsCompletionRequestCompletionResponseCompletionResponseChoiceCompletionResponseStreamChoiceCompletionStreamResponse)ErrorResponsePromptTokenUsageInfoRequestResponseMetadata	UsageInfo)GenerationErrorOpenAIServingclamp_prompt_logprobs)OpenAIServingModels)get_max_tokensshould_include_usage)VLLMValidationError)init_logger)Logprob)RequestOutput)	TokPrompt)BeamSearchParamsSamplingParams)TokenizerLike)merge_async_iterators)as_list)%validate_logits_processors_parametersc                       sZ  e Zd ZddddddedededB deded	ed
ef fddZdede	e
 eB fddZ	d)dededB deedf eB eB fddZdede	e
 deeeef  dedededededB dedeedf fddZde	e dedededededB dedefdd Z	!	d*d"ee d#eeeef dB  d$ededB d%ed&edB defd'd(Z  ZS )+OpenAIServingCompletionF)return_tokens_as_token_idsenable_prompt_tokens_detailsenable_force_include_usagelog_error_stackengine_clientmodelsrequest_loggerNr%   r&   r'   r(   c                   s<   t  j|||||d | jj| _|| _|| _| j | _d S )N)r)   r*   r+   r%   r(   )super__init__model_configlogits_processorsr&   r'   get_diff_sampling_paramdefault_sampling_params)selfr)   r*   r+   r%   r&   r'   r(   	__class__ `/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/entrypoints/openai/completion/serving.pyr-   3   s   
	z OpenAIServingCompletion.__init__requestreturnc                    s   |  |I dH }|dur|S | jjr| jj|jdur!| dS |jr.|jdur.| dS |jdur=|jdur=| dS z| j	||j
|jdI dH }W |S  ttttjfyn } ztd | |W  Y d}~S d}~ww )z
        render completion request by validating and preprocessing inputs.

        Returns:
            A list of engine_prompts on success,
            or an ErrorResponse on failure.
        Nz!suffix is not currently supportedz'Echo is unsupported with prompt embeds.z5prompt_logprobs is not compatible with prompt embeds.)prompt_inputprompt_embedsz$Error in preprocessing prompt inputs)_check_modelr)   errored
dead_errorsuffixcreate_error_responseechor:   prompt_logprobs_preprocess_completionprompt
ValueError	TypeErrorRuntimeErrorjinja2TemplateErrorlogger	exception)r2   r7   error_check_retengine_promptser5   r5   r6   render_completion_requestN   s4   


	
z1OpenAIServingCompletion.render_completion_requestraw_requestc           "         s  |  |I dH }t|tr|S |}d| ||j }tt }t|d}|r-||j_	z| 
|}W n tttfyS }	 ztd | |	W  Y d}	~	S d}	~	ww | |}
g }zt|D ]\}}| |}t| j|j| || j}|jr||| j}n||| jj| j}t| j| | d| }| j ||||d |du rdn| !|j"I dH }t|t#r| j$|||||d}n)|%| j}|& }| j'j(|||||||j)|
d}| j*j+||||||j)|||
d		}|,| q`W n ty }	 z| |	W  Y d}	~	S d}	~	ww t-| }| j./|}t0|}|j1o|j }| j2j3}|r4| j4|||||||||d
	S dg| }zF|2 z3 dH W \}}|||< q<6 t|D ]\}}|dusZJ |j5du rj|| }| ||_5qOt6t7t8 |}| 9|||||||} W n> t:j;y   | d Y S  t<y }	 z| =|	W  Y d}	~	S d}	~	w ty }	 z| |	W  Y d}	~	S d}	~	ww |j1r| >  dt?t@df f fdd}!|! S | S )aq  Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)
        Nzcmpl-)
request_idz"Error preparing request components-)paramslora_request)rC   rP   rR   rS   trace_headers)rS   tokenization_kwargsrT   prioritydata_parallel_rank)rS   rT   rV   prompt_textrU   rW   )num_prompts	tokenizerrequest_metadatazClient disconnectedr8   c                     s   d  dV  dV  d S )Ndata: 

data: [DONE]

r5   r5   response_jsonr5   r6   fake_stream_generator1  s   
zHOpenAIServingCompletion.create_completion.<locals>.fake_stream_generator)ArN   
isinstancer   _base_request_idrP   inttimer   stater[   _maybe_get_adaptersrD   rE   rF   rI   rJ   r?   _get_data_parallel_rank	enumerate_extract_prompt_textr   max_model_len
max_tokens_extract_prompt_lenr1   use_beam_searchto_beam_search_paramsto_sampling_paramsr.   logits_processor_patternr#   r/   _log_inputs_get_trace_headersheadersr   beam_searchbuild_tok_paramsget_encode_kwargsinput_processorprocess_inputsrV   r)   generateappendr!   r*   
model_namelenstreamrendererrZ   completion_stream_generatorrC   r   listr   %request_output_to_completion_responseasyncioCancelledErrorr   %_convert_generation_error_to_responsemodel_dump_jsonr   str)"r2   r7   rO   resultrL   rP   created_timer[   rS   rM   rW   
generatorsiengine_promptrX   rl   sampling_paramsrequest_id_itemrT   	generator
tok_paramsrU   engine_requestresult_generatorr|   rY   r~   rZ   final_res_batchres	final_resfinal_res_batch_checkedresponsera   r5   r_   r6   create_completion{   s  




	
M
	z)OpenAIServingCompletion.create_completionrL   r   rP   r   r|   rY   rZ   r[   c
           .      C  s  |j d u rdn|j }
dg|
 | }dg|
 | }dg|
 | }dg| }d }d}|j}t|| j\}}zf|2 z$3 d H W \}}|j}|j}|rN|j}d}|j}|d u r^|| }| |}|d urht	|||< |j
D ]}|j||
  }d }|jd us}J |jr|| s|d usJ |jrd}|d usJ |jdkr|}|}|}n||j }g ||j}g |pg |jpg }|}d||< n|j}|j}|j}|| s|jr|}d||< |s|s|| sqk|jd ur|d usJ d| j|||j||| |jd} nd } ||  t	|j7  < ||  t	|j7  < |j}!|j}"| |!| t|||t||| |!|"||jr3t|jnd dgd	}#|rO|| }$|| }%t|$|%|$|% d
|#_|#jdd}&d|& dV  qkq86 t|}'t|}(t|'|(|'|( d
})| jr}|r}t|d|)_ |rt|||g |)d}*|*jddd}+d|+ dV  |)|	_!W n@ t"y }, zd| #|, dV  W Y d },~,n(d },~,w t$y }, zt%&d | '|,}-d|- dV  W Y d },~,nd },~,ww dV  d S )N   r   FT Did not output logprobs)	token_idstop_logprobsnum_output_top_logprobsrZ   initial_text_offsetreturn_as_token_id)indextextlogprobsfinish_reasonstop_reasonprompt_token_idsr   )idcreatedmodelchoicesprompt_tokenscompletion_tokenstotal_tokens)exclude_unsetr\   r]   cached_tokens)r   r   r   r   usage)r   exclude_nonez%Error in completion stream generator.r^   )(nstream_optionsr   r'   r   rA   num_cached_tokensrC   rj   r}   outputsr   rl   r@   return_token_idsr   r   r   _create_completion_logprobsr%   r   r   _raise_if_errorr   r   r"   r   r   r   sumr&   r   prompt_tokens_detailsfinal_usage_infor   /_convert_generation_error_to_streaming_response	ExceptionrI   rJ   create_streaming_error_response).r2   r7   rL   r   rP   r   r|   rY   rZ   r[   num_choicesprevious_text_lensprevious_num_tokens
has_echoednum_prompt_tokensr   first_iterationr   include_usageinclude_continuous_usage
prompt_idxr   r   rA   rX   r   outputr   prompt_token_ids_to_return
delta_textdelta_token_idsout_logprobsr   r   r   chunkr   r   r`   total_prompt_tokenstotal_completion_tokensr   final_usage_chunkfinal_usage_datarM   datar5   r5   r6   r   9  s  






	|
 


z3OpenAIServingCompletion.completion_stream_generatorr   c                 C   s  g }d}	d}
d }d }|D ]}|}|j }|d usJ t|j}|j}|jD ]}| |j| |jd us4J |jrw|j	r<d}|d usBJ |jdkrN|}|}|}n2g ||j
}|jd u r]d }n|d uscJ |jd usjJ g ||j}||j }n	|j
}|j}|j}|jd ur|d usJ d| j||||j|jd}nd }tt||||j|j|j|j	r|nd |j	rt|j
nd d}|| |
t|j
7 }
q$|	t|7 }	qt|	|
|	|
 d}| jr|r|jrt|jd|_||_|r|d j}t||||||dS )	Nr   r   r   )r   r   rZ   r   r   )r   r   r   r   r   rA   r   r   r   r   )r   r   r   r   r   kv_transfer_params)r   r   rA   rC   r   r   r   rl   r@   r   r   r   r   r   r%   r   r}   r   r"   r{   r   r&   r   r   r   r   r   r   )r2   r   r7   rP   r   r|   rZ   r[   r   r   num_generated_tokensr   last_final_resr   r   rA   rX   r   r   r   output_textr   choice_datar   r5   r5   r6   r     s   







z=OpenAIServingCompletion.request_output_to_completion_responser   r   r   r   r   r   c                    s:  g }g }g }	g }
d}|dur|nj t|D ]}\}}|| }|du rKr+d| }ndu r6tdddd|}|	| |d |
d n0|| }j||d}t|jd	}|	| || |
 fd
dt| D  t	|dkr|| n	||d |  t	|}qt
|||	|
dS )z*Create logprobs for OpenAI Completion API.r   Nz	token_id:z:Unable to get tokenizer because `skip_tokenizer_init=True`skip_tokenizer_initT)	parametervaluer       c                    s@   i | ]\}} |krj |d  |d dt|d  jdqS )r   r   r   r   )_get_decoded_tokenmaxlogprob).0r   top_lpr   r2   should_return_as_token_idrZ   r5   r6   
<dictcomp>  s    	zGOpenAIServingCompletion._create_completion_logprobs.<locals>.<dictcomp>)text_offsettoken_logprobstokensr   )r%   ri   r   decoder{   r   r   r   itemsr}   r	   )r2   r   r   r   rZ   r   r   out_text_offsetout_token_logprobs
out_tokensout_top_logprobslast_token_lenr   token_idstep_top_logprobstoken
step_tokentoken_logprobr5   r   r6   r   g  sb   






	
z3OpenAIServingCompletion._create_completion_logprobs)N)r   N) __name__
__module____qualname__r   r   r   boolr-   r
   r   r   r   rN   r   r   r   r   r   r   tuplerd   r   r    r   r   r   GenericSequencedictr   r	   r   __classcell__r5   r5   r3   r6   r$   2   s    	

0
 ?	


 <	
yr$   )=r   re   collections.abcr   r   r   r   typingr   rG   fastapir   vllm.engine.protocolr   vllm.entrypoints.loggerr   +vllm.entrypoints.openai.completion.protocolr	   r
   r   r   r   r   'vllm.entrypoints.openai.engine.protocolr   r   r   r   &vllm.entrypoints.openai.engine.servingr   r   r   &vllm.entrypoints.openai.models.servingr   vllm.entrypoints.utilsr   r   vllm.exceptionsr   vllm.loggerr   vllm.logprobsr   vllm.outputsr   vllm.renderers.inputsr   vllm.sampling_paramsr   r   vllm.tokenizersr    vllm.utils.async_utilsr!   vllm.utils.collection_utilsr"   vllm.v1.sample.logits_processorr#   r   rI   r$   r5   r5   r5   r6   <module>   s4    