o
    -i)                     @   s  U d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	 d dl
mZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZmZ d d	lm Z  d dl!m"Z" d d
l#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@ d dlAmBZBmCZC d dlDmEZE d dlFmGZGmHZHmIZI d dlJmKZKmLZLmMZMmNZN d dlOmPZPmQZQmRZRmSZS d dlTmUZUmVZVmWZWmXZX d dlYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ d dl`maZambZbmcZc d dldmeZemfZf d dlgmhZhmiZimjZjmkZk d dllmmZmmnZn d dlompZp d dlqmrZrmsZs d d ltmuZumvZvmwZw d d!lxmyZy d d"lzm{Z{m|Z| d d#l}m~Z~ d d$lmZ d d%lmZmZmZ d d&lmZ d d'lmZmZ d d(lmZ d d)lmZmZ d d*lmZ d d+lmZmZ d d,lmZmZmZ d d-lmZ d d.lmZmZmZ d d/lmZ G d0d1 d1eZeyeZe3ejB ehB eQB eLB eZB e]B eWB Zeed2< e0eiB ePB eKB eVB Zeed3< eGeIB Zeed4< eeB eB eCB eUB eeB Zeed5< e4e1B eSB eHB ekB eXB eNB e^B efB Zeed6< ed7ed8Zed9d:G d;d< d<Zed9d:G d=d> d>Zed9d:G d?d@ d@eeee Zed9d:G dAdB dBeeM Zed9d:G dCdD dDeeR ZG dEdF dFZdGe|dB dHe|dB fdIdJZdS )K    N)AsyncGeneratorCallableIterableMapping)	dataclassfield)
HTTPStatus)AnyClassVarGeneric	TypeAliasTypeVarcast)Request)ToolChoiceFunction)
ConfigDictTypeAdapter)Headers)BeamSearchSequencecreate_sort_beams_key_function)EngineClient)ChatCompletionMessageParamChatTemplateContentFormatOptionConversationMessage)RequestLogger)"ChatCompletionNamedToolChoiceParamChatCompletionRequestChatCompletionResponse)CompletionRequestCompletionResponse)	ErrorInfoErrorResponseFunctionCallFunctionDefinition)OpenAIServingModels)ConversationContextHarmonyContextParsableContextStreamingHarmonyContext)ResponseInputOutputItemResponsesRequest)construct_input_messages)TranscriptionRequestTranscriptionResponseTranslationRequest)ClassificationChatRequestClassificationCompletionRequestClassificationRequestClassificationResponse)EmbeddingChatRequestEmbeddingCompletionRequestEmbeddingRequestEmbeddingResponse)IOProcessorRequestPoolingChatRequestPoolingCompletionRequestPoolingResponse)RerankRequestScoreDataRequestScoreQueriesDocumentsRequestScoreRequestScoreResponseScoreTextRequest)BaseRendererCompletionRendererRenderConfig)GenerateRequestGenerateResponse)DetokenizeRequestTokenizeChatRequestTokenizeCompletionRequestTokenizeResponse)_validate_truncation_sizesanitize_messageVLLMValidationError)
PromptTypeTokensPrompt)PromptComponentsget_prompt_components"is_explicit_encoder_decoder_prompt)init_logger)LogprobPromptLogprobs)LoRARequest)MultiModalDataDict)CompletionOutputPoolingRequestOutputRequestOutput)PoolingParams)ReasoningParserReasoningParserManager)RendererLike)BeamSearchParamsSamplingParams)TokenizerLike)
ToolParserToolParserManager)contains_trace_headersextract_trace_headerslog_tracing_disabled_warning)random_uuid)AsyncMicrobatchTokenizercollect_from_async_generatormerge_async_iterators)EngineCoreRequestc                       s(   e Zd ZdZddef fddZ  ZS )GenerationErrorz?raised when finish_reason indicates internal server error (500)Internal server errormessagec                    s   t  | tj| _d S N)super__init__r   INTERNAL_SERVER_ERRORstatus_code)selfrn   	__class__ c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/entrypoints/openai/engine/serving.pyrq      s   zGenerationError.__init__)rm   )__name__
__module____qualname____doc__strrq   __classcell__rw   rw   ru   rx   rl      s    rl   CompletionLikeRequestChatLikeRequestSpeechToTextRequest
AnyRequestAnyResponseRequestT)boundT)kw_onlyc                   @   s,   e Zd ZU dZeedZee dB ed< dS )RequestProcessingMixinzY
    Mixin for request processing,
    handling prompt preparation and engine input.
    default_factoryNengine_prompts)	ry   rz   r{   r|   r   listr   rO   __annotations__rw   rw   rw   rx   r      s   
 r   c                   @   sZ   e Zd ZU dZdZeeeee	B f df dB e
d< eedZeee	B  e
d< eddZdS )ResponseGenerationMixinz`
    Mixin for response generation,
    managing result generators and final batch results.
    Nresult_generatorr   final_res_batchT)arbitrary_types_allowed)ry   rz   r{   r|   r   r   tupleintrZ   rY   r   r   r   r   r   model_configrw   rw   rw   rx   r      s   
 r   c                   @   s\   e Zd ZU eed< dZedB ed< eed< eed< edd dZ	e
ed	< dZedB ed
< dS )ServeContextrequestNraw_request
model_name
request_idc                   C   s   t t S ro   )r   timerw   rw   rw   rx   <lambda>   s    zServeContext.<lambda>r   created_timelora_request)ry   rz   r{   r   r   r   r   r}   r   r   r   r   rV   rw   rw   rw   rx   r      s   
 r   c                   @   s   e Zd ZdS )ClassificationServeContextN)ry   rz   r{   rw   rw   rw   rx   r      s    r   c                   @   s&   e Zd ZU dZedB ed< eed< dS )EmbeddingServeContextNchat_templatechat_template_content_format)ry   rz   r{   r   r}   r   r   rw   rw   rw   rx   r      s   
 r   c                        s  e Zd ZU dZee ed< ddddedede	dB d	e
d
e
f
 fddZ	ddedB de
deegef dB fddZdedeegef dB fddZdddZ		ddededededB deeef dB deedf fddZdefddZd edefd!d"Zdefd#d$Z d%e!de"dB fd&d'Z#d%e!de$e"B fd(d)Z%d%e!de$e"B fd*d+Z&d%e!dee$e"B df fd,d-Z'd%e!de"dB fd.d/Z(d%e!de)e"B fd0d1Z*d%e!de"dB fd2d3Z+d%e!de"dB fd4d5Z,d6e-j.dfd7ee/B d8ed9e-d:edB de"f
d;d<Z0d6e-j.dfd7ee/B d8ed9e-d:edB def
d=d>Z1d?edB deddfd@dAZ2dBe3de"fdCdDZ4dBe3defdEdFZ5d e6de"dB fdGdHZ7d e6dedB fdIdJZ8	dd e6dKe
dedB fdLdMZ9d e6de:e fdNdOZ;d e6dedPedQe
de<f
dRdSZ=d e6dTe>e? dPedB de<fdUdVZ@d e6dWe>e? dXede<fdYdZZA	[dd e6dPed\ee>e? B dQe
de<f
d]d^ZB	[dd e6dPed_eCee>e? B  dQe
dee<df f
d`daZDdbedB dceEeef dB dde
de"dB fdedfZFeG		ddgeEeef dB dheEeef dB deEeef fdidjZH	[							dd eIeJB dkeKdle>eL dmedB dneMdoe
dpe
dqe>eEeef  dB dre>eEeef  dB dceEeef dB dheEeef dB dseegef dB dQe
deNe>eO e>e< f fdtduZPddvdedwedeQe)B dedB deeef dB dxe?dye?dB deNeReEeef f fdzd{ZSd eJdkeKdle>eT dqe>eEeef  dB dmedB dneMfd|d}ZU		~ddedwe<deQdeVdedB dxe?fddZWdedeXfddZYdededeQe)B eB dB dedB ddf
ddZZde[deeef dB fddZ\eG	dde]dB dedB dedB fddZ^eGde]dB de?dB fddZ_eG	dd eJe`B dPedB de
deegef dB dedB deNe>ea dB edB f fddZbeG	ddecde?dPedB de
def
ddZddedB de
fddZe  ZfS )OpenAIServingu   
    A short string prepended to every request’s ID (e.g. "embd", "classify")
    so you can easily tell “this ID came from Embedding vs Classification.”
    request_id_prefixF)return_tokens_as_token_idslog_error_stackengine_clientmodelsrequest_loggerNr   r   c                   sd   t    || _|| _|| _|| _i | _|| _| jj| _| jj	| _	| jj
| _
| jj| _| jj| _d S ro   )rp   rq   r   r   r   r   _async_tokenizer_poolr   input_processorio_processorrendererr   max_model_len)rt   r   r   r   r   r   ru   rw   rx   rq      s   
	



zOpenAIServing.__init__tool_parser_nameenable_auto_toolsreturnc              
   C   sz   d}|r|du r
|S t d z|dkr | jjdr t d t|}W |S  ty< } z	t	d| d|d}~ww )z&Get the tool parser based on the name.Nz$"auto" tool choice has been enabled.pythoniczmeta-llama/Llama-3.2z>Llama3.2 models may struggle to emit valid pythonic tool callsz7Error: --enable-auto-tool-choice requires tool_parser:'z' which has not been registered)
loggerinfor   model
startswithwarningrc   get_tool_parser	Exception	TypeError)rt   r   r   parsererw   rw   rx   _get_tool_parser
  s.   
zOpenAIServing._get_tool_parserreasoning_parser_namec              
   C   sT   d}|sdS zt |}|dusJ W |S  ty) } z	td|d|d}~ww )z+Get the reasoning parser based on the name.Nzreasoning_parser_name=z has not been registered)r]   get_reasoning_parserr   r   )rt   r   r   r   rw   rw   rx   _get_reasoning_parser#  s   
z#OpenAIServing._get_reasoning_parserc                    s    | j   | j I d H  d S ro   )r   clear_mm_cacher   reset_mm_cachert   rw   rw   rx   r   2  s   
zOpenAIServing.reset_mm_cachepromptr   paramsr   trace_headersc           1        s<  |j }|j}|j}|j}	|j}
|j}| j}|j}|d u r$tdddd|j	}t
|r-tt|tr9|}g }d }n|d}|dg }|d}d }t|t||
}d| }t|d	|	d
}t|dg |||dg}g }t|D ]^}tdd |D  \}}g }| dt  }tt||D ]"\}\}}| d| } tt| jj||| ||d}!||! qdd tj| I d H D }"g }#g }$g }%t|"D ]P\}}&||  |&jd jdkrt ||t!ddg d d ddgd|d dV    d S |&jd j"d ur|&jd j"d }'|$#t$|'%  |%# fdd|'& D  qt'(|$}$t'(|%}%|syt')|$|kd }(|(D ]@})||)|   |"|)|  }&|&jd j"d usKJ |&jd j"d }*|t|r_ j*|g n j* j"|*g t+|%|) d|d q2t'j, |%|(< t'-t'.|%|d | }+|+D ]D})||)|   |"|)|  }&t/|$|) },|&jd j"d usJ |&jd j"d }*|#t j*|,g  j"|*g  j0t+|%|)  j1 j2d q|#}qp|#| t3||dd}-|-d | }.|.D ]#}/|/j*d |kr|s|/j*d }0n|/j*d  }0|4|0|/_5qt ||fddt|.D d|d dV  d S )Nz:You cannot use beam search when `skip_tokenizer_init=True`skip_tokenizer_initT	parametervaluer   prompt_token_idsmulti_modal_data      )logprobs
max_tokenstemperaturer   )tokenscum_logprobr   r   mm_processor_kwargsr   c                 S   s&   g | ]}t |j|j|jd |jfqS ))r   r   r   )rO   r   r   r   r   ).0beamrw   rw   rx   
<listcomp>  s    	z-OpenAIServing.beam_search.<locals>.<listcomp>-z-beam-)r   r   c                 S   s   g | ]}|d  qS )r   rw   )r   xrw   rw   rx   r     s    error )indextext	token_idscumulative_logprobr   finish_reason)r   r   outputsfinishedr   prompt_logprobsc                    s   g | ]} j |j qS rw   )r   logprob)r   obj)current_beamrw   rx   r     s    
stop)r   r   r   r   stop_reason)r   r   r   r   r   r   )keyreversec                    sH   g | ] \}}t |j|j|j d  ||j|jd ur|jnd|jdqS )Nlength)r   r   r   r   r   r   r   )rX   r   r   r   r   r   r   )r   ir   )tokenized_lengthrw   rx   r     s    
)6
beam_widthr   
ignore_eosr   length_penaltyinclude_stop_str_in_outputr   	tokenizerrM   eos_token_idrR   NotImplementedError
isinstancer}   getlenr   r`   r   rangeziprg   	enumerateasynciocreate_taskri   r   generateappendgatherr   r   rZ   rX   r   extendr   keysvaluesnparraywherer   floatinfargpartitionnegativer   r   r   r   sorteddecoder   )1rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   prompt_textr   r   r   sort_beams_keylogprobs_numbeam_search_params	all_beams	completed_prompts_batchlora_req_batchtasksrequest_id_batchr   individual_promptlora_reqrequest_id_itemtaskoutput	new_beamsall_beams_token_idall_beams_logprobresultr   eos_idxidxlogprobs_entrytopn_idxtoken_idsorted_completed
best_beamsr   r   rw   )r   r   rx   beam_search6  s@  




	










zOpenAIServing.beam_searchc                 C   s   t | j| jj| jdS )z
        Get a Renderer instance with the provided tokenizer.
        Uses shared async tokenizer pool for efficiency.
        )r   r   async_tokenizer_pool)rB   r   r   r   r   r   rw   rw   rx   _get_completion_renderer%  s
   z&OpenAIServing._get_completion_rendererr   c                 C   s   t )a  
        Build and return a `RenderConfig` for an endpoint.

        Used by the renderer to control how prompts are prepared
        (e.g., tokenization and length handling). Endpoints should
        implement this with logic appropriate to their request type.
        )r   )rt   r   rw   rw   rx   _build_render_config0  s   z"OpenAIServing._build_render_configc                 C   s*   | j |}|du rt|}|| j |< |S )zh
        Return (and cache) an `AsyncMicrobatchTokenizer` bound to the
        given tokenizer.
        N)r   r   rh   )rt   r   async_tokenizerrw   rw   rx   _get_async_tokenizer=  s
   
z"OpenAIServing._get_async_tokenizerctxc                    s   dS )z
        Default preprocessing hook. Subclasses may override
        to prepare `ctx` (classification, embedding, etc.).
        Nrw   rt   r,  rw   rw   rx   _preprocessH  s   zOpenAIServing._preprocessc                 C   s
   |  dS )z
        Default response builder. Subclass may override this method
        to return the appropriate response object.
        zunimplemented endpoint)create_error_responser-  rw   rw   rx   _build_responseR  s   
zOpenAIServing._build_responsec                    s0   |  |}|2 z	3 d H W }|  S 6 | dS )Nz!No response yielded from pipeline)	_pipeliner/  )rt   r,  
generationresponserw   rw   rx   handle\  s   

zOpenAIServing.handlec                 C  s   |  |jI dH  }r|V  | | }r|V  | |I dH }t|tr)|V  | |I dH }t|tr9|V  | |I dH }t|trI|V  | |V  dS )z;Execute the request processing pipeline yielding responses.N)	_check_modelr   _validate_requestr.  r   r!   _prepare_generators_collect_batchr0  )rt   r,  r   preprocess_retgenerators_retcollect_retrw   rw   rx   r1  h  s   


zOpenAIServing._pipelinec                 C   s.   t |jdd }|d ur|| jkr| dS d S )Ntruncate_prompt_tokenszetruncate_prompt_tokens value is greater than max_model_len. Please, select a smaller truncation size.)getattrr   r   r/  )rt   r,  r<  rw   rw   rx   r6    s   
zOpenAIServing._validate_requestc                 C   s    t |jds| dS |j S )Nto_pooling_paramsz0Request type does not support pooling parameters)hasattrr   r/  r>  r-  rw   rw   rx   _create_pooling_params  s
   
z$OpenAIServing._create_pooling_paramsc           
         s   g }zd|j du rdn	| |j jI dH }| |}t|tr"|W S |jdu r-| dW S t|jD ]-\}}|j	 d| }| j
||||jd | jj||||j|t|jddd}|| q2t| |_W dS  ty~ }	 z| |	W  Y d}	~	S d}	~	ww )z2Schedule the request and get the result generator.NEngine prompts not availabler   r   r   priorityr   r   r   rC  )r   _get_trace_headersheadersr@  r   r!   r   r/  r   r   _log_inputsr   r   encoder=  r   r   rj   r   r   )
rt   r,  
generatorsr   pooling_paramsr   engine_promptr  	generatorr   rw   rw   rx   r7    sH   



	
z!OpenAIServing._prepare_generatorsc              
      s   zF|j du r| dW S t|j }dg| }|jdu r"| dW S |j2 z3 dH W \}}|||< q%6 d|v r=| dW S dd |D |_W dS  ty^ } z| |W  Y d}~S d}~ww )z0Collect batch results from the result generator.NrA  zResult generator not availablez*Failed to generate results for all promptsc                 S   s   g | ]}|d ur|qS ro   rw   )r   resrw   rw   rx   r     s    z0OpenAIServing._collect_batch.<locals>.<listcomp>)r   r/  r   r   r   r   )rt   r,  num_promptsr   r   rM  r   rw   rw   rx   r8    s*   




zOpenAIServing._collect_batchBadRequestErrorrn   err_typers   paramc           	      C   s   d }t |trT|}ddlm} t ||rd}tj}|j}n3t |ttt	t
fr.d}tj}d }n"t |tr;d}tj}d }n|jjdkrId}tj}d }nd}tj}d }t|}| jrkt \}}}|d urgt  nt  ttt|||j|ddS )	Nr   rL   rO  r   TemplateErrorInternalServerError)rn   typecoderQ  )r   )r   r   vllm.exceptionsrM   r   BAD_REQUESTr   
ValueErrorr   RuntimeErrorOverflowErrorr   NOT_IMPLEMENTEDrv   ry   rr   r}   r   sysexc_info	traceback	print_excprint_stackr!   r    rK   r   )	rt   rn   rP  rs   rQ  excrM   exc_typer  rw   rw   rx   r/    sJ   



z#OpenAIServing.create_error_responsec                 C   s    t | j||||d }|S )Nrn   rP  rs   rQ  )jsondumpsr/  
model_dump)rt   rn   rP  rs   rQ  json_strrw   rw   rx   create_streaming_error_response"  s   z-OpenAIServing.create_streaming_error_responser   c                 C   s    |dkrt d| tddS )z:Raise GenerationError if finish_reason indicates an error.r   z:Request %s failed with an internal error during generationrm   N)r   r   rl   )rt   r   r   rw   rw   rx   _raise_if_error3  s   zOpenAIServing._raise_if_errorr   c                 C      | j t|d|jdS )z)Convert GenerationError to ErrorResponse.rS  rP  rs   )r/  r}   rs   rt   r   rw   rw   rx   %_convert_generation_error_to_response<  
   z3OpenAIServing._convert_generation_error_to_responsec                 C   rj  )z4Convert GenerationError to streaming error response.rS  rk  )rh  r}   rs   rl  rw   rw   rx   /_convert_generation_error_to_streaming_responseF  rn  z=OpenAIServing._convert_generation_error_to_streaming_responsec                    s   d }|  |jrd S |j| jjv rd S tjr<|jr<| j|jI d H  }r<t|tr-d S t|t	r<|j
jtjjkr<|}|pL| jd|j ddtjddS )NThe model `` does not exist.NotFoundErrorr   rc  )_is_model_supportedr   r   lora_requestsenvs VLLM_ALLOW_RUNTIME_LORA_UPDATINGresolve_lorar   rV   r!   r   rU  r   rW  r   r/  	NOT_FOUND)rt   r   error_responseload_resultrw   rw   rx   r5  P  s0   
zOpenAIServing._check_modelc                 C   sN   |  |}t }| jj D ]}|j|v r|| qt|dkr%| S dS )z;Determine if there are any active default multimodal loras.r   N)	_get_message_typessetr   rt  r  	lora_nameaddr   pop)rt   r   message_typesdefault_mm_loraslorarw   rw   rx   _get_active_default_mm_lorasn  s   


z*OpenAIServing._get_active_default_mm_lorassupports_default_mm_lorasc                 C   sX   |j | jjv r| jj|j  S |r| |}|d ur|S | |j r#d S td|j  d)Nrp  rq  )r   r   rt  r  rs  rX  )rt   r   r  default_mm_lorarw   rw   rx   _maybe_get_adapters  s   
z!OpenAIServing._maybe_get_adaptersc                 C   s   t  }t|ds
|S |j}|du st|ttfr|S |D ])}t|trEd|v rEt|d trE|d D ]}d|v rD||d 	dd  q2q|S )zRetrieve the set of types from message content dicts up
        until `_`; we use this to match potential multimodal data
        with default per modality loras.
        messagesNcontentrT  r  r   )
r|  r?  r  r   r}   bytesdictr   r~  split)rt   r   r  r  rn   content_dictrw   rw   rx   r{    s"   
z OpenAIServing._get_message_typesr   add_special_tokensc           
         s   |  |}| jjd ur| jjddr| }t|dd }|d u r,|||dI d H }n|dk r=|||d| jdI d H }n|||d|dI d H }|j}|}	| |||	S )Ndo_lower_caseFr<  r  r   T)r  
truncation
max_length)	r+  r   encoder_configr   lowerr=  r   	input_ids_validate_input)
rt   r   r   r   r  r*  r<  encodedr  
input_textrw   rw   rx   _normalize_prompt_text_to_input  s6   
z-OpenAIServing._normalize_prompt_text_to_input
prompt_idsc                    sz   t |dd }|d u r|}n|dk r|| j d  }n|| d  }|d u r)d}n| |}||I d H }| |||S )Nr<  r   r   )r=  r   r+  r
  r  )rt   r   r  r   r<  r  r  r*  rw   rw   rx   !_normalize_prompt_tokens_to_input  s   
z/OpenAIServing._normalize_prompt_tokens_to_inputr  r  c                 C   sF  t |}t|tttttttt	frB|| j
kr<tdtdtdtdt	di}|t|d}td| j
 d| d| dd|d	t||d
S t|tttfrPt||d
S t|tr\|jpZ|j}nt|dd }|| j
krvtd| j
 d| dd|d	|d ur|| | j
krtd| d| j
 d| d| d| j
 d| dd|d	t||d
S )Nscoreclassificationzembedding generationz'This model's maximum context length is z  tokens. However, you requested z tokens in the input for z(. Please reduce the length of the input.input_tokensr   )r   r   r   z# tokens. However, your request has z> input tokens. Please reduce the length of the input messages.z6'max_tokens' or 'max_completion_tokens' is too large: z). This model's maximum context length is z tokens and your request has z input tokens (z > z - z).)r   r   r3   r4   r<   r@   r=   r;   r0   r/   r   r   rT  rM   rO   rH   rG   rF   r   max_completion_tokensr   r=  )rt   r   r  r  	token_num
operations	operationr   rw   rw   rx   r    s   


	
zOpenAIServing._validate_inputTprompt_inputc                    s4   | j |||g|d2 z	3 dH W }|  S 6 td)zP
        A simpler implementation that tokenizes a single prompt input.
        r  Nz$No results yielded from tokenization)_tokenize_prompt_inputs_asyncrX  )rt   r   r   r  r  r  rw   rw   rx   _tokenize_prompt_input_asyncH  s   
z*OpenAIServing._tokenize_prompt_input_asyncprompt_inputsc                 C  sN   |D ]!}t |tr| j||||dI dH V  q| j|||dI dH V  qdS )zQ
        A simpler implementation that tokenizes multiple prompt inputs.
        )r   r   r  N)r  r   )r   r}   r  r  )rt   r   r   r  r  r   rw   rw   rx   r  [  s   

z+OpenAIServing._tokenize_prompt_inputs_asyncrequest_chat_templatechat_template_kwargstrust_request_chat_templatec                 C   s,   |s|d us|r| dd ur| dS d S )Nr   zChat template is passed with request, but --trust-request-chat-template is not set. Refused request with untrusted chat template.)r   r/  )rt   r  r  r  rw   rw   rx   _validate_chat_templatet  s   z%OpenAIServing._validate_chat_templaterequest_chat_template_kwargsdefault_chat_template_kwargsc                 C   s   | pi } |du r
| S || B S )zIHelper to merge server-default and request-specific chat template kwargs.Nrw   )r  r  rw   rw   rx   #_prepare_extra_chat_template_kwargs  s   z1OpenAIServing._prepare_extra_chat_template_kwargsr   r  r   r   add_generation_promptcontinue_final_message
tool_dicts	documentstool_parserc                    sJ  |||||	d|
pi }
|  |
|}
ddlm} |j|f||
ddp)t|j|d|
I d H \}}d|vrP|}| j|| |d |d	I d H }|	| n
| j
||d d
d tt|}|jd uri|j|d< t|dd  }d urw||d< |d uot|do|jdk}|rt|ttB sd}t|| }||j|d}||gfS )N)r   r  r  toolsr  r   )MistralTokenizertokenizeF)r   r  r   r   r  r   )r   r  r  r   
cache_salttool_choicenonezPTool usage is only supported for Chat Completions API or Responses API requests.r   )r  vllm.tokenizers.mistralr  render_messages_asyncr  r   r   r  get_tokenizerupdater  r   rO   r   r=  r?  r  r   r*   r   adjust_request)rt   r   r   r  r   r   r  r  r  r  r  r  r  r  r  conversationrK  
extra_datar  should_parse_toolsmsgr   rw   rw   rx   _preprocess_chat  sj   	





zOpenAIServing._preprocess_chat)data_parallel_rankrK  rC  r  c          
   
      s:   i }t | j|j| | jj||||||||d}	|	|fS )z1Use the Processor to process inputs for AsyncLLM.)r   tokenization_kwargsr   rC  r  )rJ   r   r<  r   process_inputs)
rt   r   rK  r   r   r   rC  r  r  engine_requestrw   rw   rx   _process_inputs  s    

zOpenAIServing._process_inputsc              	      s2   t |d}| j|||||||dI d H \}	}
|
S )N)request_input)r  r  r   r   )r+   r  )rt   r   r   r  r  r  r   r   new_messagesr  r   rw   rw   rx   _render_next_turn
  s   
	zOpenAIServing._render_next_turnr   sampling_paramscontextc              	   K  sf  |  |\}}	}	|}
d}	 | d| }| j||||d |d}| j||||||dI d H \}}| jj|||f||||d|}|2 z3 d H W }|| |V  qF6 | s\d S | I d H }|	| t
|ttfry| }t|d}n't
|tr| |j|j|jj|j|j|j|jI d H }|d }|  |\}}	}	| jt|d	  |_|
d
 }|d
7 }q)Nr   Tr  rB  r   rD  )r   rC  r  r  )r   r   r   )_get_prompt_componentsrG  r   r  r   r   append_outputneed_builtin_tool_call	call_toolappend_tool_outputr   r&   r(   render_for_completionrO   r'   r  r   r   r   response_messagesr  tool_parser_clsr   r   r   r   r   )rt   r   rK  r  r  r   rC  kwargsr  r  orig_prioritysub_requestsub_request_idr   r  r  rL  rM  tool_outputr   r   rw   rw   rx   _generate_with_builtin_tools#  s|   

	



	z*OpenAIServing._generate_with_builtin_toolsc                 C   s   t |S ro   )rQ   )rt   r   rw   rw   rx   r  y  s   z$OpenAIServing._get_prompt_componentsinputsc                 C   s:   | j d u rd S | |\}}}| j j||||||d d S )NrB  )r   r  
log_inputs)rt   r   r  r   r   r   r   prompt_embedsrw   rw   rx   rG  |  s   

zOpenAIServing._log_inputsrF  c                    s0   | j  I d H }|rt|S t|rt  d S ro   )r   is_tracing_enabledre   rd   rf   )rt   rF  r  rw   rw   rx   rE    s   z OpenAIServing._get_trace_headersr   defaultc                 C   s2   | dur| j d }dur|S |du rt S |S )z6Pulls the request id to use from a header, if providedNzX-Request-Id)rF  r   rg   )r   r  req_idrw   rw   rx   _base_request_id  s   zOpenAIServing._base_request_idc                 C   sD   | du rdS | j d}|du rdS zt|W S  ty!   Y dS w )z7Pulls the data parallel rank from a header, if providedNzX-data-parallel-rank)rF  r   r   rX  )r   rank_strrw   rw   rx   _get_data_parallel_rank  s   
z%OpenAIServing._get_data_parallel_rankr  r  c           
   
   C   s  t t  }| jr%t| jtr%|d usJ |t| jj|d d }||fS | jrFt| jtrF|d us4J |t| jjj|d d }||fS | jdkrj|d usQJ t	t t
 |}|dd |D  d }||fS |r|r| jdksx| jd u r|d u rtdz||}W n ty } ztd |d }~ww |j|d ur|nd| d	}	|	d ur|	jr|d
d |	jD  |	j}|r| dkrd }||fS d |fS ||fS )Nname	argumentsrequiredc                 S   s&   g | ]}t |jtj|jd ddqS )F)ensure_asciir  )r"   r  rd  re  
parametersr   	tool_callrw   rw   rx   r     s    z@OpenAIServing._parse_tool_calls_from_content.<locals>.<listcomp>autoz7Tokenizer not available when `skip_tokenizer_init=True`zError in tool parser creation.r   r  c                 s   s$    | ]}t |jj|jjd V  qdS )r  N)r"   functionr  r  r  rw   rw   rx   	<genexpr>  s    
z?OpenAIServing._parse_tool_calls_from_content.<locals>.<genexpr>)r   r"   r  r   r   r   r  r   r  r   r#   validate_jsonr   rX  rY  r   	exceptionextract_tool_callstools_called
tool_callsr  strip)
r   r   r   r  r  function_callsr  r  r   tool_call_inforw   rw   rx   _parse_tool_calls_from_content  sr   
;2
	%

z,OpenAIServing._parse_tool_calls_from_contentr   r#  return_as_token_idc                 C   s8   |rd| S | j d ur| j S |d u rtd||S )Nz	token_id:z:Unable to get tokenizer because `skip_tokenizer_init=True`)decoded_tokenrX  r
  )r   r#  r   r  rw   rw   rx   _get_decoded_token  s   


z OpenAIServing._get_decoded_tokenr   c                 C   s   |sdS | j |S )NT)r   is_base_model)rt   r   rw   rw   rx   rs    s   z!OpenAIServing._is_model_supported)NF)r   N)NN)F)T)TFNNNNNF)Nr   ro   )gry   rz   r{   r   r
   r}   r   r   r$   r   boolrq   r   ra   rb   r   r\   r   r   rN   r_   rV   r   r   rZ   r&  rA   r(  r	   rC   r)  rh   r+  r   r!   r.  r   r0  r4  r1  r6  r[   r@  r7  r8  r   rW  r   r/  rh  ri  rl   rm  ro  r   r5  r  r  r|  r{  rO   r  r   r   r  r  r  r   r  r  r  staticmethodr  r   r*   r^   r   r   r   r   r  r`   rk   r  r)   r  r%   r  rP   r  rG  r   rE  r   r  r  r   r"   r  rT   r  rs  r~   rw   rw   ru   rx   r      s  
 


	

 p








3
"
:
	






*

Y





	

b	



V

Kr   r   r   c                 C   sF   | d u r| S | D ]}|d u rq|  D ]}|jtdkrd|_qq| S )Nz-infg    )r  r   r  )r   logprob_dictlogprob_valuesrw   rw   rx   clamp_prompt_logprobs   s   r   )r   rd  r\  r   r^  collections.abcr   r   r   r   dataclassesr   r   httpr   typingr	   r
   r   r   r   r   numpyr  fastapir   openai.types.responsesr   pydanticr   r   starlette.datastructuresr   	vllm.envsru  vllm.beam_searchr   r   vllm.engine.protocolr   vllm.entrypoints.chat_utilsr   r   r   vllm.entrypoints.loggerr   0vllm.entrypoints.openai.chat_completion.protocolr   r   r   +vllm.entrypoints.openai.completion.protocolr   r   'vllm.entrypoints.openai.engine.protocolr    r!   r"   r#   &vllm.entrypoints.openai.models.servingr$   )vllm.entrypoints.openai.responses.contextr%   r&   r'   r(   *vllm.entrypoints.openai.responses.protocolr)   r*   'vllm.entrypoints.openai.responses.utilsr+   -vllm.entrypoints.openai.translations.protocolr,   r-   r.   *vllm.entrypoints.pooling.classify.protocolr/   r0   r1   r2   'vllm.entrypoints.pooling.embed.protocolr3   r4   r5   r6   )vllm.entrypoints.pooling.pooling.protocolr7   r8   r9   r:   'vllm.entrypoints.pooling.score.protocolr;   r<   r=   r>   r?   r@   vllm.entrypoints.rendererrA   rB   rC   &vllm.entrypoints.serve.disagg.protocolrD   rE   (vllm.entrypoints.serve.tokenize.protocolrF   rG   rH   rI   vllm.entrypoints.utilsrJ   rK   rV  rM   vllm.inputs.datarN   rO   vllm.inputs.parserP   rQ   rR   vllm.loggerrS   vllm.logprobsrT   rU   vllm.lora.requestrV   vllm.multimodalrW   vllm.outputsrX   rY   rZ   vllm.pooling_paramsr[   vllm.reasoningr\   r]   vllm.renderersr^   vllm.sampling_paramsr_   r`   vllm.tokenizersra   vllm.tool_parsersrb   rc   vllm.tracingrd   re   rf   
vllm.utilsrg   vllm.utils.async_utilsrh   ri   rj   vllm.v1.enginerk   r   rl   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rw   rw   rw   rx   <module>   s  
  




		          A