o
    -iqv                     @   sx  d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZ d d	lmZmZmZmZ d d
lmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= e.e>Z?G dd deZ@dS )    N)AsyncGeneratorAsyncIterator)Sequence)cast)Request)EngineClient)RequestLogger)CompletionLogProbsCompletionRequestCompletionResponseCompletionResponseChoiceCompletionResponseStreamChoiceCompletionStreamResponse)ErrorResponsePromptTokenUsageInfoRequestResponseMetadata	UsageInfo)GenerationErrorOpenAIServingclamp_prompt_logprobs)OpenAIServingModels)RenderConfig)get_max_tokensshould_include_usage)VLLMValidationError)EmbedsPromptTokensPromptis_embeds_prompt)init_logger)Logprob)RequestOutput)BeamSearchParamsSamplingParams)TokenizerLike)merge_async_iterators)as_list)%validate_logits_processors_parametersc                       s  e Zd ZddddddedededB deded	ed
ef fddZdede	e
eB  eB fddZ	d,dededB deedf eB eB fddZdede	e
eB  deeeef  dedededededB dedeedf fddZde	e dedededededB dedefdd Z	!	d-d"ee d#eeeef dB  d$ededB d%ed&edB defd'd(Z	d,ded)edB de fd*d+Z!  Z"S ).OpenAIServingCompletionF)return_tokens_as_token_idsenable_prompt_tokens_detailsenable_force_include_usagelog_error_stackengine_clientmodelsrequest_loggerNr(   r)   r*   r+   c                   s<   t  j|||||d | jj| _|| _|| _| j | _d S )N)r,   r-   r.   r(   r+   )super__init__model_configlogits_processorsr)   r*   get_diff_sampling_paramdefault_sampling_params)selfr,   r-   r.   r(   r)   r*   r+   	__class__ g/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/entrypoints/openai/completion/serving.pyr0   4   s   
	z OpenAIServingCompletion.__init__requestreturnc                    s   |  |I dH }|dur|S | jjr| jj|jdur!| dS |jr.|jdur.| dS |jdur=|jdur=| dS z| 	 }|j
|j|j| |dI dH }W |S  ttttjfyu } ztd | |W  Y d}~S d}~ww )z
        render completion request by validating and preprocessing inputs.

        Returns:
            A list of engine_prompts on success,
            or an ErrorResponse on failure.
        Nz!suffix is not currently supportedz'Echo is unsupported with prompt embeds.z5prompt_logprobs is not compatible with prompt embeds.)prompt_or_promptsprompt_embedsconfigz$Error in preprocessing prompt inputs)_check_modelr,   errored
dead_errorsuffixcreate_error_responseechor=   prompt_logprobs_get_completion_rendererrender_prompt_and_embedsprompt_build_render_config
ValueError	TypeErrorRuntimeErrorjinja2TemplateErrorlogger	exception)r5   r:   error_check_retrendererengine_promptser8   r8   r9   render_completion_requestO   s6   


	
z1OpenAIServingCompletion.render_completion_requestraw_requestc           $         s  |  |I dH }t|tr|S |}d| ||j }tt }t|d}|r-||j_	z| 
|}W n tttfyS }	 ztd | |	W  Y d}	~	S d}	~	ww | |}
g }zt|D ]\}}| |\}}}d}|durwt|}n|durt|}nt| jdu ri | _t| j||| jd}|jr||| j}n||| jj| j}t| j| | d| }| j ||||d |du rdn| !|j"I dH }t#t$t%B |}t|t&r| j'|||||d}n"| j(||||||j)|
d	I dH \}}| j*j+||||||j)|||
d
	}|,| q`W n ty& }	 z| |	W  Y d}	~	S d}	~	ww t-| }| j./|}t|}|j0o<|j }| j1j2}|rR| j3|||||||||d	S dg| }zM|2 z3 dH W \}}|||< qZ6 t|D ]#\}} | dusxJ | j4du r|| }t5|rdn|6d| _4qmt#t7t8 |}!| 9|!||||||}"W n> t:j;y   | d Y S  t<y }	 z| =|	W  Y d}	~	S d}	~	w ty }	 z| |	W  Y d}	~	S d}	~	ww |j0r|">  dt?t@df f fdd}#|# S |"S )aq  Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)
        Nzcmpl-)
request_idz"Error preparing request components)max_model_lenr:   input_lengthr4   -)paramslora_request)rH   rW   r[   r\   trace_headers)r\   r]   prioritydata_parallel_rank)r\   r]   r^   prompt_texttokenization_kwargsr_   )num_prompts	tokenizerrequest_metadatarH   zClient disconnectedr;   c                     s   d  dV  dV  d S )Ndata: 

data: [DONE]

r8   r8   response_jsonr8   r9   fake_stream_generatorD  s   
zHOpenAIServingCompletion.create_completion.<locals>.fake_stream_generator)ArU   
isinstancer   _base_request_idrW   inttimer   staterd   _maybe_get_adaptersrJ   rK   rL   rO   rP   rC   _get_data_parallel_rank	enumerate_get_prompt_componentslenNotImplementedErrorr4   r   rX   use_beam_searchto_beam_search_paramsto_sampling_paramsr1   logits_processor_patternr&   r2   _log_inputs_get_trace_headersheadersr   r   r   r!   beam_search_process_inputsr^   r,   generateappendr$   r-   
model_namestreamrR   rc   completion_stream_generatorrH   r   getlistr    %request_output_to_completion_responseasyncioCancelledErrorr   %_convert_generation_error_to_responsemodel_dump_jsonr   str)$r5   r:   rV   resultrS   rW   created_timerd   r\   rT   r_   
generatorsiengine_promptr`   prompt_token_idsr=   rY   
max_tokenssampling_paramsrequest_id_itemr]   	generatorengine_requestra   result_generatorr   rb   r   rc   final_res_batchres	final_resfinal_res_batch_checkedresponserj   r8   rh   r9   create_completion}   s.  






		

Z
	z)OpenAIServingCompletion.create_completionrS   r   rW   r   r   rb   rc   rd   c
           .      C  s  |j d u rdn|j }
dg|
 | }dg|
 | }dg|
 | }dg| }d }d}|j}t|| j\}}zm|2 z+3 d H W \}}|j}|j}|rN|j}d}|j}|d u rd|| }t|r_d n|	d}|d urnt
|||< |jD ]}|j||
  }d }|jd usJ |jr|| s|d usJ |jrd}|d usJ |jdkr|}|}|}n||j }g ||j}g |pg |jpg }|}d||< n|j}|j}|j}|| s|jr|}d||< |s|s|| sqq|jd ur|d usJ d| j|||j||| |jd} nd } ||  t
|j7  < ||  t
|j7  < |j}!|j}"| |!| t|||t||| |!|"||jr:t|jnd d	gd
}#|rV|| }$|| }%t|$|%|$|% d|#_|#jdd}&d|& dV  qqq86 t|}'t|}(t|'|(|'|( d})| jr|rt |d|)_!|rt|||g |)d}*|*jddd}+d|+ dV  |)|	_"W n@ t#y }, zd| $|, dV  W Y d },~,n(d },~,w t%y }, zt&'d | (|,}-d|- dV  W Y d },~,nd },~,ww dV  d S )N   r   FTrH    Did not output logprobs)	token_idstop_logprobsnum_output_top_logprobsrc   initial_text_offsetreturn_as_token_id)indextextlogprobsfinish_reasonstop_reasonr   r   )idcreatedmodelchoicesprompt_tokenscompletion_tokenstotal_tokens)exclude_unsetre   rf   cached_tokens)r   r   r   r   usage)r   exclude_nonez%Error in completion stream generator.rg   ))nstream_optionsr   r*   r   rE   num_cached_tokensrH   r   r   rt   outputsr   r   rD   return_token_idsr   r   r   _create_completion_logprobsr(   r   r   _raise_if_errorr   r   r%   r   r   r   sumr)   r   prompt_tokens_detailsfinal_usage_infor   /_convert_generation_error_to_streaming_response	ExceptionrO   rP   create_streaming_error_response).r5   r:   rS   r   rW   r   r   rb   rc   rd   num_choicesprevious_text_lensprevious_num_tokens
has_echoednum_prompt_tokensr   first_iterationr   include_usageinclude_continuous_usage
prompt_idxr   r   rE   r`   r   outputr   prompt_token_ids_to_return
delta_textdelta_token_idsout_logprobsr   r   r   chunkr   r   ri   total_prompt_tokenstotal_completion_tokensr   final_usage_chunkfinal_usage_datarT   datar8   r8   r9   r   L  s&  




	 
 


z3OpenAIServingCompletion.completion_stream_generatorr   c                 C   s  g }d}	d}
d }d }|D ]}|}|j }|d usJ t|j}|j}|jD ]}| |j| |jd us4J |jrw|j	r<d}|d usBJ |jdkrN|}|}|}n2g ||j
}|jd u r]d }n|d uscJ |jd usjJ g ||j}||j }n	|j
}|j}|j}|jd ur|d usJ d| j||||j|jd}nd }tt||||j|j|j|j	r|nd |j	rt|j
nd d}|| |
t|j
7 }
q$|	t|7 }	qt|	|
|	|
 d}| jr|r|jrt|jd|_||_|r|d j}t||||||dS )	Nr   r   r   )r   r   rc   r   r   )r   r   r   r   r   rE   r   r   r   r   )r   r   r   r   r   kv_transfer_params)r   r   rE   rH   r   r   r   r   rD   r   r   r   r   r   r(   r   rt   r   r%   r   r   r)   r   r   r   r   r   r   )r5   r   r:   rW   r   r   rc   rd   r   r   num_generated_tokensr   last_final_resr   r   rE   r`   r   r   r   output_textr   choice_datar   r8   r8   r9   r     s   







z=OpenAIServingCompletion.request_output_to_completion_responser   r   r   r   r   r   c                    s:  g }g }g }	g }
d}|dur|nj t|D ]}\}}|| }|du rKr+d| }ndu r6tdddd|}|	| |d |
d n0|| }j||d}t|jd	}|	| || |
 fd
dt| D  t	|dkr|| n	||d |  t	|}qt
|||	|
dS )z*Create logprobs for OpenAI Completion API.r   Nz	token_id:z:Unable to get tokenizer because `skip_tokenizer_init=True`skip_tokenizer_initT	parametervaluer       c                    s@   i | ]\}} |krj |d  |d dt|d  jdqS )r   r   r   r   )_get_decoded_tokenmaxlogprob).0r   top_lpr   r5   should_return_as_token_idrc   r8   r9   
<dictcomp>  s    	zGOpenAIServingCompletion._create_completion_logprobs.<locals>.<dictcomp>)text_offsettoken_logprobstokensr   )r(   rr   r   decoder   r   r   r   itemsrt   r	   )r5   r   r   r   rc   r   r   out_text_offsetout_token_logprobs
out_tokensout_top_logprobslast_token_lenr   token_idstep_top_logprobstoken
step_tokentoken_logprobr8   r   r9   r   ~  sb   






	
z3OpenAIServingCompletion._create_completion_logprobsmax_input_lengthc                 C   sn   |j d ur|j | jkrtd|j  d| j dd|j d| j|j p"d }t||j|j|jt|jo3|j	 dS )Nz'max_tokens' (z=) cannot be greater than the model's maximum context length (z).r   r   r   )
max_lengthtruncate_prompt_tokensadd_special_tokens
cache_saltneeds_detokenization)
r   rX   r   r   r   r   r   boolrD   r   )r5   r:   r   max_input_tokens_lenr8   r8   r9   rI     s    
z,OpenAIServingCompletion._build_render_config)N)r   N)#__name__
__module____qualname__r   r   r   r  r0   r
   r   r   r   r   rU   r   r   r   r   r   r   tuplerm   r    r#   r   r   r   GenericSequencedictr   r	   r   r   rI   __classcell__r8   r8   r6   r9   r'   3   s    	
1
 P
	


 @	
y
Zr'   )Ar   rn   collections.abcr   r   r   r  typingr   rM   fastapir   vllm.engine.protocolr   vllm.entrypoints.loggerr   +vllm.entrypoints.openai.completion.protocolr	   r
   r   r   r   r   'vllm.entrypoints.openai.engine.protocolr   r   r   r   &vllm.entrypoints.openai.engine.servingr   r   r   &vllm.entrypoints.openai.models.servingr   vllm.entrypoints.rendererr   vllm.entrypoints.utilsr   r   vllm.exceptionsr   vllm.inputs.datar   r   r   vllm.loggerr   vllm.logprobsr   vllm.outputsr    vllm.sampling_paramsr!   r"   vllm.tokenizersr#   vllm.utils.async_utilsr$   vllm.utils.collection_utilsr%   vllm.v1.sample.logits_processorr&   r  rO   r'   r8   r8   r8   r9   <module>   s6    