o
    iܿ                     @   sn  U d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZmZ d d	lmZ d dl m!Z! d d
l"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7m8Z8m9Z9m:Z: d dl;m<Z< d dl=m>Z>m?Z?m@Z@mAZA d dlBmCZCmDZD d dlEmFZF d dlGmHZHmIZImJZJ d dlKmLZLmMZMmNZN d dlOmPZPmQZQmRZRmSZS d dlTmUZUmVZVmWZWmXZX d dlYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ d dl`maZambZb d dlcmdZdmeZemfZfmgZg d dlhmiZimjZj d dlkmlZl d dlmmnZnmoZompZp d d lqmrZr d d!lsmtZtmuZu d d"lvmwZw d d#lxmyZy d d$lzm{Z{m|Z|m}Z} d d%l~mZ d d&lmZmZmZ d d'lmZ d d(lmZmZmZmZ d d)lmZmZ d d*lmZ d d+lmZ d d,lmZmZmZ d d-lmZ d d.lmZmZ G d/d0 d0eZereZG d1d2 d2eZG d3d4 d4eeZe4efB edB eRB eMB eZB e]B eWB Zeed5< e1eeB eQB eLB eVB Zeed6< eHeJB Zeed7< eeB eB eDB eUB eaB Zeed8< e5e2B eSB ePB eIB egB eXB eNB e^B ebB Zeed9< ed:ed;Ze
d<d=G d>d? d?ee ZG d@dA dAZdBeudB dCeudB fdDdEZdS )F    N)AsyncGeneratorCallableMapping)	dataclassfield)
HTTPStatus)AnyClassVarGenericProtocol	TypeAliasTypeVar)Request)ToolChoiceFunction)
ConfigDictTypeAdapter)Headers)BeamSearchSequencecreate_sort_beams_key_function)ModelConfig)EngineClient)ChatCompletionMessageParamChatTemplateContentFormatOptionConversationMessage)RequestLogger)"ChatCompletionNamedToolChoiceParamChatCompletionRequestChatCompletionResponse)CompletionRequestCompletionResponse)	ErrorInfoErrorResponseFunctionCallFunctionDefinition)OpenAIServingModels)ConversationContextHarmonyContextParsableContextStreamingHarmonyContext)ResponseInputOutputItemResponsesRequest)construct_input_messages)TranscriptionRequestTranscriptionResponseTranslationRequest)ClassificationChatRequestClassificationCompletionRequestClassificationResponse)EmbeddingBytesResponseEmbeddingChatRequestEmbeddingCompletionRequestEmbeddingResponse)IOProcessorRequestPoolingChatRequestPoolingCompletionRequestPoolingResponse)RerankRequestScoreDataRequestScoreQueriesDocumentsRequestScoreRequestScoreResponseScoreTextRequest)GenerateRequestGenerateResponse)DetokenizeRequestTokenizeChatRequestTokenizeCompletionRequestTokenizeResponse)get_max_tokenssanitize_messageVLLMValidationError)
PromptTypeSingletonPromptTokensPrompt)init_logger)LogprobPromptLogprobs)LoRARequest)MultiModalDataDict)CompletionOutputPoolingRequestOutputRequestOutput)PoolingParams)
ChatParamsTokenizeParamsmerge_kwargs)	TokPrompt)extract_prompt_componentsextract_prompt_lenparse_model_promptprompt_to_seq)BeamSearchParamsSamplingParams)TokenizerLike)
ToolParser)contains_trace_headersextract_trace_headerslog_tracing_disabled_warning)random_uuid)collect_from_async_generatormerge_async_iteratorsc                       s(   e Zd ZdZddef fddZ  ZS )GenerationErrorz?raised when finish_reason indicates internal server error (500)Internal server errormessagec                    s   t  | tj| _d S N)super__init__r   INTERNAL_SERVER_ERRORstatus_code)selfrj   	__class__ \/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/entrypoints/openai/engine/serving.pyrm      s   zGenerationError.__init__)ri   )__name__
__module____qualname____doc__strrm   __classcell__rs   rs   rq   rt   rh      s    rh   c                   @   s   e Zd ZdedefddZdS )RendererRequestmodel_configreturnc                 C      t rk   NotImplementedError)rp   r|   rs   rs   rt   build_tok_params   s   z RendererRequest.build_tok_paramsN)ru   rv   rw   r   rW   r   rs   rs   rs   rt   r{      s    r{   c                   @   s&   e Zd ZdedB dedefddZdS )RendererChatRequestdefault_templateNdefault_template_content_formatr}   c                 C   r~   rk   r   )rp   r   r   rs   rs   rt   build_chat_params   s   z%RendererChatRequest.build_chat_params)ru   rv   rw   ry   r   rV   r   rs   rs   rs   rt   r      s    r   CompletionLikeRequestChatLikeRequestSpeechToTextRequest
AnyRequestAnyResponseRequestT)boundT)kw_onlyc                   @   s   e Zd ZU eed< dZedB ed< eed< eed< edd dZ	e
ed	< dZedB ed
< dZee dB ed< dZeee
ef df dB ed< eedZee ed< eddZdS )ServeContextrequestNraw_request
model_name
request_idc                   C   s   t t S rk   )inttimers   rs   rs   rt   <lambda>   s    zServeContext.<lambda>)default_factorycreated_timelora_requestengine_promptsresult_generatorfinal_res_batchT)arbitrary_types_allowed)ru   rv   rw   r   __annotations__r   r   ry   r   r   r   r   rP   r   listrY   r   r   tuplerS   r   r   r|   rs   rs   rs   rt   r      s   
 r   c                       sr  e Zd ZU dZee ed< ddddedede	dB d	e
d
e
f
 fddZ		ddededededB deeef dB deedf fddZdededB fddZdedeeB fddZdedeeB fddZdedeeeB df fddZdededB fddZdedeeB fd d!ZdededB fd"d#ZdededB fd$d%Zd&ej dfd'ee!B d(ed)ed*edB def
d+d,Z"d&ej dfd'ee!B d(ed)ed*edB def
d-d.Z#d/edB deddfd0d1Z$d2e%defd3d4Z&d2e%defd5d6Z'd7e(dedB fd8d9Z)d7e(dedB fd:d;Z*	dd7e(d<e
dedB fd=d>Z+d7e(de,e fd?d@Z-d7e.dAe/e0 dBede1fdCdDZ2dEedB dFe3ee4f dB dGe
dedB fdHdIZ5e6		ddJe3ee4f dB dKe3ee4f dB de3ee4f fdLdMZ7d7e8dNee/e B e/e0 B e/e/e0  B dB dOe9e/e9 B dB de/e fdPdQZ:		dd7e;dRe/e< dSedB dTe=dUe3ee4f dB dVe/e3ee4f  dB dWe>e?ge@f dB deAe/eB e/e f fdXdYZCde.fdZd[ZDde.fd\d]ZEde.fd^d_ZFd7eGdRe/eH dVe/e3ee4f  dB dWe>e?ge@f dB d`edB dae=fdbdcZI		d	ddedeedfeJdgeKdheLdedB die0deeef dB fdjdkZMdedleNeB deJeB eB dB dedB ddf
dmdnZOdoePdeeef dB fdpdqZQe6	ddreRdB dsedB dedB fdtduZSe6dreRdB de0dB fdvdwZTe6	dd7eGeUB dxe?dB dye
dze>e?ge@f dB d{edB deAe/eV dB edB f fd|d}ZWe6	dd~eXde0dxe?dB de
def
ddZYdedB de
fddZZ  Z[S )OpenAIServingu   
    A short string prepended to every request’s ID (e.g. "embd", "classify")
    so you can easily tell “this ID came from Embedding vs Classification.”
    request_id_prefixF)return_tokens_as_token_idslog_error_stackengine_clientmodelsrequest_loggerNr   r   c                   s^   t    || _|| _|| _|| _|| _| jj| _| jj| _| jj	| _	| jj
| _
| j
j| _d S rk   )rl   rm   r   r   r   r   r   input_processorio_processorrendererr|   max_model_len)rp   r   r   r   r   r   rq   rs   rt   rm      s   
	



zOpenAIServing.__init__promptr   paramsr   trace_headersr}   c           1        s2  |j }|j}|j}|j}	|j}
|j}| j}|j}|d u r$tdddd|j	}t
|tr4d|v r4td|d}|dg }|d	}d }t|t||
}d
| }t|d|	d}t|dg |||dg}g }t|D ]^}tdd |D  \}}g }| dt  }tt||D ]"\}\}}| d| } tt| jj||| ||d}!||! qdd tj| I d H D }"g }#g }$g }%t|"D ]P\}}&||  |&jd jdkrt||t ddg d d ddgd|d dV    d S |&jd j!d ur|&jd j!d }'|$"t#|'$  |%" fdd|'% D  qt&'|$}$t&'|%}%|stt&(|$|kd }(|(D ]@})||)|   |"|)|  }&|&jd j!d usFJ |&jd j!d }*|t|rZ j)|g n j) j!|*g t*|%|) d|d q-t&j+ |%|(< t&,t&-|%|d | }+|+D ]D})||)|   |"|)|  }&t.|$|) },|&jd j!d usJ |&jd j!d }*|#t j)|,g  j!|*g  j/t*|%|)  j0 j1d q|#}qk|"| t2||dd}-|-d | }.|.D ]#}/|/j)d |kr|s|/j)d }0n|/j)d  }0|3|0|/_4qt||fddt|.D d|d dV  d S ) Nz:You cannot use beam search when `skip_tokenizer_init=True`skip_tokenizer_initT	parametervalueencoder_promptz$Encoder-decoder prompt not supportedr   prompt_token_idsmulti_modal_data      )logprobs
max_tokenstemperaturer   )tokenscum_logprobr   r   mm_processor_kwargsr   c                 S   s&   g | ]}t |j|j|jd |jfqS ))r   r   r   )rL   r   r   r   r   ).0beamrs   rs   rt   
<listcomp>?  s    	z-OpenAIServing.beam_search.<locals>.<listcomp>-z-beam-)r   r   c                 S   s   g | ]}|d  qS )r   rs   )r   xrs   rs   rt   r   `  s    error )indextext	token_idscumulative_logprobr   finish_reason)r   r   outputsfinishedr   prompt_logprobsc                    s   g | ]} j |j qS rs   )r   logprob)r   obj)current_beamrs   rt   r     s    
stop)r   r   r   r   stop_reason)r   r   r   r   r   r   )keyreversec                    sH   g | ] \}}t |j|j|j d  ||j|jd ur|jnd|jdqS )Nlength)r   r   r   r   r   r   r   )rR   r   r   r   r   r   r   )r   ir   )tokenized_lengthrs   rt   r     s    
)5
beam_widthr   
ignore_eosr   length_penaltyinclude_stop_str_in_outputr   	tokenizerrI   eos_token_id
isinstancedictr   getlenr   r_   r   rangezipre   	enumerateasynciocreate_taskrf   r   generateappendgatherr   r   rT   rR   r   extendr   keysvaluesnparraywherer   floatinfargpartitionnegativer   r   r   r   sorteddecoder   )1rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   prompt_textr   r   r   sort_beams_keylogprobs_numbeam_search_params	all_beams	completed_prompts_batchlora_req_batchtasksrequest_id_batchr   individual_promptlora_reqrequest_id_itemtaskoutput	new_beamsall_beams_token_idall_beams_logprobresultr   eos_idxidxlogprobs_entrytopn_idxtoken_idsorted_completed
best_beamsr   r   rs   )r   r   rt   beam_search   s8  



	










zOpenAIServing.beam_searchctxc                    s   dS )z
        Default preprocessing hook. Subclasses may override
        to prepare `ctx` (classification, embedding, etc.).
        Nrs   rp   r  rs   rs   rt   _preprocess  s   zOpenAIServing._preprocessc                 C   s
   |  dS )z
        Default response builder. Subclass may override this method
        to return the appropriate response object.
        zunimplemented endpoint)create_error_responser  rs   rs   rt   _build_response  s   
zOpenAIServing._build_responsec                    s,   |  |2 z	3 d H W }|  S 6 | dS )Nz!No response yielded from pipeline)	_pipeliner  )rp   r  responsers   rs   rt   handle  s
   
zOpenAIServing.handlec                 C  s   |  |jI dH  }r|V  | | }r|V  | |I dH }t|tr)|V  | |I dH }t|tr9|V  | |I dH }t|trI|V  | |V  dS )z;Execute the request processing pipeline yielding responses.N)	_check_modelr   _validate_requestr  r   r!   _prepare_generators_collect_batchr  )rp   r  r   preprocess_retgenerators_retcollect_retrs   rs   rt   r    s   


zOpenAIServing._pipelinec                 C   s.   t |jdd }|d ur|| jkr| dS d S )Ntruncate_prompt_tokenszetruncate_prompt_tokens value is greater than max_model_len. Please, select a smaller truncation size.)getattrr   r   r  )rp   r  r   rs   rs   rt   r    s   
zOpenAIServing._validate_requestc                 C   s    t |jds| dS |j S )Nto_pooling_paramsz0Request type does not support pooling parameters)hasattrr   r  r"  r  rs   rs   rt   _create_pooling_params%  s
   
z$OpenAIServing._create_pooling_paramsc           
         s   g }zd|j du rdn	| |j jI dH }| |}t|tr"|W S |jdu r-| dW S t|jD ]-\}}|j	 d| }| j
||||jd | jj||||j|t|jddd}|| q2t| |_W dS  ty~ }	 z| |	W  Y d}	~	S d}	~	ww )z2Schedule the request and get the result generator.NEngine prompts not availabler   r   r   priorityr   )r   r   r'  )r   _get_trace_headersheadersr$  r   r!   r   r  r   r   _log_inputsr   r   encoder!  r   r   rg   r   	Exception)
rp   r  
generatorsr   pooling_paramsr   engine_promptr  	generatorers   rs   rt   r  0  sF   



	
z!OpenAIServing._prepare_generatorsc              
      s   zF|j du r| dW S t|j }dg| }|jdu r"| dW S |j2 z3 dH W \}}|||< q%6 d|v r=| dW S dd |D |_W dS  ty^ } z| |W  Y d}~S d}~ww )z0Collect batch results from the result generator.Nr%  zResult generator not availablez*Failed to generate results for all promptsc                 S   s   g | ]}|d ur|qS rk   rs   )r   resrs   rs   rt   r   y  s    z0OpenAIServing._collect_batch.<locals>.<listcomp>)r   r  r   r   r   r,  )rp   r  num_promptsr   r   r2  r1  rs   rs   rt   r  a  s*   




zOpenAIServing._collect_batchBadRequestErrorrj   err_typero   paramc           	      C   s   d }t |trT|}ddlm} t ||rd}tj}|j}n3t |ttt	t
fr.d}tj}d }n"t |tr;d}tj}d }n|jjdkrId}tj}d }nd}tj}d }t|}| jrkt \}}}|d urgt  nt  ttt|||j|ddS )	Nr   rH   r4  r   TemplateErrorInternalServerError)rj   typecoder6  )r   )r   r,  vllm.exceptionsrI   r   BAD_REQUESTr   
ValueError	TypeErrorRuntimeErrorOverflowErrorr   NOT_IMPLEMENTEDrr   ru   rn   ry   r   sysexc_info	traceback	print_excprint_stackr!   r    rG   r   )	rp   rj   r5  ro   r6  excrI   exc_typer   rs   rs   rt   r    sJ   



z#OpenAIServing.create_error_responsec                 C   s    t | j||||d }|S )Nrj   r5  ro   r6  )jsondumpsr  
model_dump)rp   rj   r5  ro   r6  json_strrs   rs   rt   create_streaming_error_response  s   z-OpenAIServing.create_streaming_error_responser   c                 C   s    |dkrt d| tddS )z:Raise GenerationError if finish_reason indicates an error.r   z:Request %s failed with an internal error during generationri   N)loggerr   rh   )rp   r   r   rs   rs   rt   _raise_if_error  s   zOpenAIServing._raise_if_errorr1  c                 C      | j t|d|jdS )z)Convert GenerationError to ErrorResponse.r8  r5  ro   )r  ry   ro   rp   r1  rs   rs   rt   %_convert_generation_error_to_response  
   z3OpenAIServing._convert_generation_error_to_responsec                 C   rQ  )z4Convert GenerationError to streaming error response.r8  rR  )rN  ry   ro   rS  rs   rs   rt   /_convert_generation_error_to_streaming_response  rU  z=OpenAIServing._convert_generation_error_to_streaming_responser   c                    s   d }|  |jrd S |j| jjv rd S tjr<|jr<| j|jI d H  }r<t|tr-d S t|t	r<|j
jtjjkr<|}|pL| jd|j ddtjddS )NThe model `` does not exist.NotFoundErrormodelrI  )_is_model_supportedrZ  r   lora_requestsenvs VLLM_ALLOW_RUNTIME_LORA_UPDATINGresolve_lorar   rP   r!   r   r:  r   r<  r   r  	NOT_FOUND)rp   r   error_responseload_resultrs   rs   rt   r    s0   
zOpenAIServing._check_modelc                 C   sN   |  |}t }| jj D ]}|j|v r|| qt|dkr%| S dS )z;Determine if there are any active default multimodal loras.r   N)	_get_message_typessetr   r\  r   	lora_nameaddr   pop)rp   r   message_typesdefault_mm_loraslorars   rs   rt   _get_active_default_mm_loras  s   


z*OpenAIServing._get_active_default_mm_lorassupports_default_mm_lorasc                 C   sX   |j | jjv r| jj|j  S |r| |}|d ur|S | |j r#d S td|j  d)NrW  rX  )rZ  r   r\  rk  r[  r=  )rp   r   rl  default_mm_lorars   rs   rt   _maybe_get_adapters  s   
z!OpenAIServing._maybe_get_adaptersc                 C   s   t  }t|ds
|S |j}|du st|ttfr|S |D ])}t|trEd|v rEt|d trE|d D ]}d|v rD||d 	dd  q2q|S )zRetrieve the set of types from message content dicts up
        until `_`; we use this to match potential multimodal data
        with default per modality loras.
        messagesNcontentr9  r   r   )
rd  r#  ro  r   ry   bytesr   r   rf  split)rp   r   rh  ro  rj   content_dictrs   rs   rt   rc  /  s"   
z OpenAIServing._get_message_types	input_ids
input_textc                 C   sF  t |}t|tttttttt	frB|| j
kr<tdtdtdtdt	di}|t|d}td| j
 d| d| dd|d	t||d
S t|tttfrPt||d
S t|tr\|jpZ|j}nt|dd }|| j
krvtd| j
 d| dd|d	|d ur|| | j
krtd| d| j
 d| d| d| j
 d| dd|d	t||d
S )Nscoreclassificationzembedding generationz'This model's maximum context length is z  tokens. However, you requested z tokens in the input for z(. Please reduce the length of the input.input_tokensr   )r   r   r   z# tokens. However, your request has z> input tokens. Please reduce the length of the input messages.z6'max_tokens' or 'max_completion_tokens' is too large: z). This model's maximum context length is z tokens and your request has z input tokens (z > z - z).)r   r   r3   r4   r;   r?   r<   r:   r0   r/   r   r   r9  rI   rL   rD   rC   rB   r   max_completion_tokensr   r!  )rp   r   rt  ru  	token_num
operations	operationr   rs   rs   rt   _validate_inputH  s   


	
zOpenAIServing._validate_inputrequest_chat_templatechat_template_kwargstrust_request_chat_templatec                 C   s,   |s|d us|r| dd ur| dS d S )Nchat_templatezChat template is passed with request, but --trust-request-chat-template is not set. Refused request with untrusted chat template.)r   r  )rp   r~  r  r  rs   rs   rt   _validate_chat_template  s   z%OpenAIServing._validate_chat_templaterequest_chat_template_kwargsdefault_chat_template_kwargsc                 C   s   | pi } |du r
| S || B S )zIHelper to merge server-default and request-specific chat template kwargs.Nrs   )r  r  rs   rs   rt   #_prepare_extra_chat_template_kwargs  s   z1OpenAIServing._prepare_extra_chat_template_kwargsprompt_inputprompt_embedsc                    s   | j }| j tttB   }|d ur|t| |d ur$|t|  fdd|D } }|j||fdddD dI d H S )Nc                    s$   g | ]}t |tr|nt |qS rs   )r   rq  r\   )r   r   )r|   rs   rt   r     s    z8OpenAIServing._preprocess_completion.<locals>.<listcomp>c                    &   i | ]}t  |d  d ur|qS rk   r!  r   kr   vrs   rt   
<dictcomp>  
    z8OpenAIServing._preprocess_completion.<locals>.<dictcomp>r   
cache_saltprompt_extras)	r   r|   r   rK   rq  r   r]   r   render_cmpl_async)rp   r   r  r  r   promptsparsed_prompts
tok_paramsrs   )r|   r   r  rt   _preprocess_completion  s&   

z$OpenAIServing._preprocess_completionro  r   r   default_template_kwargs
tool_dictstool_parserc                    s   ddl m} | j}	t|t|t|	j|d} | j}
 	||
|}|	j|g||
 fdddD dI d H \\}\}|d urft dd	}|d	krft ttB sZd
}t||	 }||j d ||gfS )Nr   )MistralTokenizer)toolstokenizec                    r  rk   r  r  r  rs   rt   r    r  z2OpenAIServing._preprocess_chat.<locals>.<dictcomp>r  r  tool_choicenonezPTool usage is only supported for Chat Completions API or Responses API requests.r   )vllm.tokenizers.mistralr  r   rX   r   r   r   r   r|   r   with_defaultsrender_chat_asyncr!  r   r*   r   get_tokenizeradjust_request)rp   r   ro  r   r   r  r  r  r  r   r  chat_paramsconversationr/  r  msgr   rs   r  rt   _preprocess_chat  sD   


zOpenAIServing._preprocess_chatc                 C      t | j|S rk   )rZ   r|   rp   r   rs   rs   rt   _extract_prompt_components     z(OpenAIServing._extract_prompt_componentsc                 C   s   |  |jS rk   )r  r   r  rs   rs   rt   _extract_prompt_text  r  z"OpenAIServing._extract_prompt_textc                 C   r  rk   )r[   r|   r  rs   rs   rt   _extract_prompt_len  r  z!OpenAIServing._extract_prompt_lenr  chat_template_content_formatc           
   	      s2   t |d}| j||||d ||dI d H \}}	|	S )N)request_input)r   r   r  r  r  )r+   r  )
rp   r   ro  r  r  r  r  new_messagesr   r   rs   rs   rt   _render_next_turn!  s   		zOpenAIServing._render_next_turnr   r/  sampling_paramsr  contextr'  c	              
   C  sb  |  |}	|}
d}	 | d| }| j||||d | }| jj|||||||d}| jj|||||||	|d}|2 z3 d H W }|| |V  q<6 | sRd S |	 I d H }|
| t|ttfrw| }t|d}| jt| |_n1t|tr| |j|jj|j|j|j|jI d H }|d }|  |}	t| j|jj| || j|_|
d }|d7 }q)	Nr   Tr   r&  )r   tokenization_kwargsr   r'  )r   r   r'  r   r  )r   r   ) r  r*  get_encode_kwargsr   process_inputsr   r   append_outputneed_builtin_tool_call	call_toolappend_tool_outputr   r&   r(   render_for_completionrL   r   r   r   r'   r  r   parserresponse_messagesr  tool_parser_clsr  r  rF   max_output_tokensr  default_sampling_params)rp   r   r/  r  r  r  r   r'  r   r   orig_prioritysub_requestsub_request_idr  engine_requestr0  r2  tool_outputr   r   rs   rs   rt   _generate_with_builtin_tools9  s   







z*OpenAIServing._generate_with_builtin_toolsinputsc                 C   s:   | j d u rd S | |}| j j||j|j|j||d d S )Nr&  )r   r  
log_inputsr   r   embeds)rp   r   r  r   r   
componentsrs   rs   rt   r*    s   


zOpenAIServing._log_inputsr)  c                    s0   | j  I d H }|rt|S t|rt  d S rk   )r   is_tracing_enabledrc   rb   rd   )rp   r)  r  rs   rs   rt   r(    s   z OpenAIServing._get_trace_headersr   defaultc                 C   s2   | dur| j d }dur|S |du rt S |S )z6Pulls the request id to use from a header, if providedNzX-Request-Id)r)  r   re   )r   r  req_idrs   rs   rt   _base_request_id  s   zOpenAIServing._base_request_idc                 C   sD   | du rdS | j d}|du rdS zt|W S  ty!   Y dS w )z7Pulls the data parallel rank from a header, if providedNzX-data-parallel-rank)r)  r   r   r=  )r   rank_strrs   rs   rt   _get_data_parallel_rank  s   
z%OpenAIServing._get_data_parallel_rankr   enable_auto_toolsr  rp  c           
   
   C   s  t t  }| jr%t| jtr%|d usJ |t| jj|d d }||fS | jrFt| jtrF|d us4J |t| jjj|d d }||fS | jdkrj|d usQJ t	t t
 |}|dd |D  d }||fS |r|r| jdksx| jd u r|d u rtdz||}W n ty } ztd |d }~ww |j|d ur|nd| d	}	|	d ur|	jr|d
d |	jD  |	j}|r| dkrd }||fS d |fS ||fS )Nname	argumentsrequiredc                 S   s&   g | ]}t |jtj|jd ddqS )F)ensure_asciir  )r"   r  rJ  rK  
parametersr   	tool_callrs   rs   rt   r     s    z@OpenAIServing._parse_tool_calls_from_content.<locals>.<listcomp>autoz7Tokenizer not available when `skip_tokenizer_init=True`zError in tool parser creation.r   r  c                 s   s(    | ]}t |j|jj|jjd V  qdS ))idr  r  N)r"   r  functionr  r  r  rs   rs   rt   	<genexpr>  s    
z?OpenAIServing._parse_tool_calls_from_content.<locals>.<genexpr>)r   r"   r  r   r   r   r  r   r  r   r#   validate_jsonr   r=  r?  rO  	exceptionextract_tool_callstools_called
tool_callsrp  strip)
r   r   r  r  rp  function_callsr  r  r1  tool_call_infors   rs   rt   _parse_tool_calls_from_content  sr   
<3
	&

z,OpenAIServing._parse_tool_calls_from_contentr   r  return_as_token_idc                 C   s:   |rd| S | j d ur| j S |d u rtd||gS )Nz	token_id:z:Unable to get tokenizer because `skip_tokenizer_init=True`)decoded_tokenr=  r   )r   r  r   r  rs   rs   rt   _get_decoded_token"  s   

z OpenAIServing._get_decoded_tokenr   c                 C   s   |sdS | j |S )NT)r   is_base_model)rp   r   rs   rs   rt   r[  6  s   z!OpenAIServing._is_model_supported)NN)F)Nr   Nrk   )\ru   rv   rw   r   r	   ry   r   r   r$   r   boolrm   rY   r^   rP   r   r   rT   r  r   r!   r  r   r  r  r  r  rU   r$  r  r  r   r<  r,  r  rN  rP  rh   rT  rV  r   r  rk  rn  rd  rc  objectr   r   rL   r}  r   r   r  staticmethodr  r{   rq  r  r   r   r   r   r`   ra   r   r   r  r  r  r  r*   r)   r  r_   rW   r%   r  rJ   r*  r   r(  r   r  r  r   r"   r  rN   r  r[  rz   rs   rs   rq   rt   r      s  
 

 h




	


1
"
:
	






T

"
*	
9
	
^

Lr   r   r}   c                 C   sF   | d u r| S | D ]}|d u rq|  D ]}|jtdkrd|_qq| S )Nz-infg    )r   r   r   )r   logprob_dictlogprob_valuesrs   rs   rt   clamp_prompt_logprobs<  s   r  )r   rJ  rB  r   rD  collections.abcr   r   r   dataclassesr   r   httpr   typingr   r	   r
   r   r   r   numpyr   fastapir   openai.types.responsesr   pydanticr   r   starlette.datastructuresr   	vllm.envsr]  vllm.beam_searchr   r   vllm.configr   vllm.engine.protocolr   vllm.entrypoints.chat_utilsr   r   r   vllm.entrypoints.loggerr   0vllm.entrypoints.openai.chat_completion.protocolr   r   r   +vllm.entrypoints.openai.completion.protocolr   r   'vllm.entrypoints.openai.engine.protocolr    r!   r"   r#   &vllm.entrypoints.openai.models.servingr$   )vllm.entrypoints.openai.responses.contextr%   r&   r'   r(   *vllm.entrypoints.openai.responses.protocolr)   r*   'vllm.entrypoints.openai.responses.utilsr+   /vllm.entrypoints.openai.speech_to_text.protocolr,   r-   r.   *vllm.entrypoints.pooling.classify.protocolr/   r0   r1   'vllm.entrypoints.pooling.embed.protocolr2   r3   r4   r5   )vllm.entrypoints.pooling.pooling.protocolr6   r7   r8   r9   'vllm.entrypoints.pooling.score.protocolr:   r;   r<   r=   r>   r?   &vllm.entrypoints.serve.disagg.protocolr@   rA   (vllm.entrypoints.serve.tokenize.protocolrB   rC   rD   rE   vllm.entrypoints.utilsrF   rG   r;  rI   vllm.inputs.datarJ   rK   rL   vllm.loggerrM   vllm.logprobsrN   rO   vllm.lora.requestrP   vllm.multimodalrQ   vllm.outputsrR   rS   rT   vllm.pooling_paramsrU   vllm.renderersrV   rW   rX   vllm.renderers.inputsrY    vllm.renderers.inputs.preprocessrZ   r[   r\   r]   vllm.sampling_paramsr^   r_   vllm.tokenizersr`   vllm.tool_parsersra   vllm.tracingrb   rc   rd   
vllm.utilsre   vllm.utils.async_utilsrf   rg   r,  rh   ru   rO  r{   r   r   r   r   r   r   r   r   r   r   r  rs   rs   rs   rt   <module>   s  
  




	
        i