o
    ۷i                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% d dl&Z&d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9 ddl1m:Z: e8 rd dl;Z;d dl&m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZB e, rd dlCZCe0 rd dlDmEZE e. oe+ oe/ oe- ZFeFrd dlGZGd dlHmIZImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[ d dl\m]Z] d dl^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZm d dlnmoZo d d lpmqZqmrZrmsZs G d!d" d"eod#d$ZtG d%d& d&e]d#d$ZuG d'd( d(eSd#d$ZveretZwereuZxerevZyh d)Zzh d*Z{h d+Z|e9}e~Zd,d-d.d/iZee Zd0ZG d1d2 d2ejZd3efd4d5Zd6ed7d8d9d8fd:d;ZG d<d= d=ZG d>d? d?ZeG d@dA dAZG dBdC dCe:Ze~dDkre Ze  dS dS )E    N)ArgumentParser	Namespace)AsyncGenerator	GeneratorIterable)asynccontextmanager)	dataclassfield)BytesIO)Thread)Optional	TypedDictUnion
model_infoHF_HUB_OFFLINE)DecodeStream)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManagerRequestStatus)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                   @      e Zd ZU dZeed< dS ))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__ rX   rX   S/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/commands/serving.pyrO   {      
 rO   F)totalc                   @   rN   )+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
        rP   NrQ   rX   rX   rX   rY   r\      rZ   r\   c                   @   s.   e Zd ZU dZeed< eed< dZeed< dS )%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerP   FstreamN)	rR   rS   rT   rU   bytesrW   rV   r_   boolrX   rX   rX   rY   r]      s
   
 r]   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstoprc   audiord   logprobsmetadata	functions
modalities
predictionrj   rk   rl   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   re   rf   languagery   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendzx-request-idc                   @   s   e Zd ZdZdZdZdZdS )ModalityLLMVLMSTTTTSN)rR   rS   rT   r   r   r   r   rX   rX   rX   rY   r      s
    r   argsc                 C   s   t | S )z~
    Factory function used to instantiate serving server from provided command line arguments.

    Returns: ServeCommand
    )ServeCommand)r   rX   rX   rY   serve_command_factory   s   r   reqmodel_generation_configr(   returnc                 K   sZ  |  ddurtdi t| d }nt|}|jdi |}| D ]\}}|dur3t||| q%|  ddurBt	| d |_
|  ddurPt	| d |_
|  ddur^t| d |_|  ddurj| d |_|  ddurv| d |_|  ddurt| d |_t| d d	krd
|_|  ddurt| d |_|  ddurt| d  |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rP   Nmax_output_tokens
max_tokensfrequency_penalty
logit_biasrp   temperatureg        Ftop_pseedrX   )getr(   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   torchmanual_seed)r   r   kwargsrP   non_standard_kwargskvrX   rX   rY   !create_generation_config_from_req   s6   


r   c                   @   s    e Zd ZdZdd Zdd ZdS )	ToolStatez7Lightweight class to keep track of the tool call state.c                 C   s   |    d S N)resetselfrX   rX   rY   __init__'  s   zToolState.__init__c                 C   s   d| _ d| _d| _d| _dS )z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   rX   rX   rY   r   *  s   
zToolState.resetN)rR   rS   rT   rU   r   r   rX   rX   rX   rY   r   $  s    r   c                	   @   sR   e Zd ZdZ	ddddedeed  fdd	Zd
d Zdd Z	dd Z
dd ZdS )
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr)   timeout_seconds	processor)r    r   c                 C   s>   || _ t|j| _|| _|| _t| j| j| _	| j	
  d S r   )r   rV   name_or_path_name_or_pathr   r   	threadingTimertimeout_reached_timerr   )r   r   r   r   rX   rX   rY   r   8  s   zTimedModel.__init__c                 C   s*   | j   t| j| j| _ | j   dS )z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   rX   rX   rY   reset_timerE  s   
zTimedModel.reset_timerc                 C   sZ   t | dr)| jdur+| `| `d| _d| _t  tj r"tj  | j	
  dS dS dS )z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gccollectr   cudais_availableempty_cacher   r   r   rX   rX   rY   delete_modelK  s   

zTimedModel.delete_modelc                 C   s&   |    t| j d| j d d S )Nz was removed from memory after z seconds of inactivity)r   loggerinfor   r   r   rX   rX   rY   r   [  s   zTimedModel.timeout_reachedc                 C   s   t | d p
| jdu S )z)Check if the instances have been deleted.r   N)r   r   r   rX   rX   rY   
is_deleted_  s   zTimedModel.is_deletedr   )rR   rS   rT   rU   r   r   r   r   r   r   r   r   rX   rX   rX   rY   r   2  s    	

r   c                   @   s  e Zd ZU dZedddidZeed< edddidZe	ed	< ed
dg dddZ
ee	 ed< eddg dddZee	 ed< edddidZeed< ed
ddidZee	 ed< edddidZeed< edddidZeed< eddddgddZe	ed< edddidZeed< eddd idZe	ed!< ed"dd#idZeed$< ed%dd&idZeed'< ed(dd)idZe	ed*< ed
dd+idZee ed,< eddd-idZeed.< eddd/idZeed0< ed
dd1idZee	 ed2< d3d4 Zd
S )5ServeArgumentsz
    Arguments for the serve CLI.

    See the metadata arg for each argument's description -- the metadata will be printed with
    `transformers serve --help`
    Fhelpz8Whether to use continuous batching for chat completions.)defaultrs   continuous_batchingautozfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)r   bfloat16float16float32)r   choicestorch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypez2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitnf4zQuantization type.fp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_modelc                 C   sN   | j dur#| jdu r| j | _dS | j | jkr%td| j  d| j ddS dS )z(Only used for BC `torch_dtype` argument.Nz`torch_dtype` z and `dtype` zn have different values. `torch_dtype` is deprecated and will be removed in 4.59.0, please set `dtype` instead.)r   r   
ValueErrorr   rX   rX   rY   __post_init__  s   

zServeArguments.__post_init__)rR   rS   rT   rU   r	   r   ra   rW   r   rV   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rX   rX   rX   rY   r   d  s   
 
r   c                   @   s$  e Zd ZedefddZdefddZdede	d	d
de
fddZdefddZdefddZdefddZ								dHdedee dee dee dee deed  dee dee defd d!Zd"d#defd$d%Zd&d' Zejdeeeef  fd(d)Zd*ededeedf fd+d,Zedd-defd.d/Zed0efd1d2Z d*ede!eddf fd3d4Z"d*ede!eddf fd5d6Z#d*ede!eddf fd7d8Z$d*ede%fd9d:Z&ededed; fd<d=Z'd>edefd?d@Z(dAefdBdCZ)dAede*d-ef fdDdEZ+dAede*d-e,f fdFdGZ-dS )Ir   parserc                 C   s$   t f}| jd|d}|jtd dS )z
        Register this command to argparse so it's available for the transformer-cli

        Args:
            parser: Root parser to register command-specific arguments
        serve)dataclass_types)funcN)r   
add_parserset_defaultsr   )r   r   serve_parserrX   rX   rY   register_subcommand  s   z ServeCommand.register_subcommandr   c                 C   s  t std|| _| jj| _| jrAt }| jjd u r'|| j_t	d|  t
 }| jj|vrAtd| d| jj d| d| jj| _| jjd urSt| jj td}|tj| jj   td}|tj| jj   i | _d | _d | _d | _d | _d S )	NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`z-No attn_implementation passed, defaulting to z"Continuous batching only supports z as attn_implementation, got z#Try setting `--attn_implementation=`transformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorr   r   use_continuous_batchingr*    default_attention_implementationr   r   r   #supported_attention_implementationsr   r   r   r   r   r#   
get_loggersetLevel
log_levelsr   lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   r   default_attn_implsupported_attn_impltransformers_logger	cb_loggerrX   rX   rY   r     s@   




zServeCommand.__init__requestschema	validatorrL   unused_fieldsc           
   
   C   s   t d|  t| }|j}|| }|r(t d|  tdd| d| jjriz|	| W n t
yQ } zt d|   td| dd}~ww ||@ }	|	rkt d|	  tdd|	 ddS dS )a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`TypedDict`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeys__mutable_keys__errorr.   r   r   validate_pythonrM   errors)
r   r	  r
  r  r  
input_keyspossible_keysunexpected_keyseunused_fields_in_requestrX   rX   rY   _validate_request  s.   

zServeCommand._validate_requestc                 C      | j |tttd d S N)r	  r
  r  r  )r  rO   response_validatorUNUSED_RESPONSE_FIELDSr   r	  rX   rX   rY   validate_response_requestA     
z&ServeCommand.validate_response_requestc                 C   r  r  )r  r\   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr   rX   rX   rY    validate_chat_completion_requestI  r"  z-ServeCommand.validate_chat_completion_requestc                 C   r  r  )r  r]   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr   rX   rX   rY   validate_transcription_requestQ  r"  z+ServeCommand.validate_transcription_requestr   N
request_idcontentr   rolefinish_reason
tool_callsr8   decode_stream	tokenizerr   c	           
   
   C   sl   |dur|dur|dur| |j|}t|tt |tt|||dd|dgddd}	d|	jd	d
 dS )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        N)r*  r+  r-  r   )deltaindexr,  r   zchat.completion.chunk)idcreatedr   r   system_fingerprintobjectdata: Texclude_none

)step
_tokenizerr5   r   timer6   r7   model_dump_json)
r   r)  r*  r   r+  r,  r-  r.  r/  chunkrX   rX   rY   build_chat_completion_chunkY  s(   "
z(ServeCommand.build_chat_completion_chunkresponserK   c                 C   s   d|j dd dS )a  
        Builds a event of a streaming OpenAI Response response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            response (`BaseModel`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        r6  Tr7  r9  )r=  )r   r@  rX   rX   rY   build_response_event  s   z!ServeCommand.build_response_eventc           
         s  t dtf fdd}t|d} jr%|jtdgddgdgd td d	d
lm} |	dd|dt
f fdd}|	ddt
f fdd}|	dd|f fdd}|d|d fdd}|ddd }|dd|fdd}	tj| jj jj jjd d S )!a  
        Setup and run the FastAPI server for transformers serve.

        Models will be loaded and unloaded automatically based on usage and a timeout.

        The server will expose the following endpoints:
        - POST /v1/chat/completions: Generates chat completions.
        - POST /v1/responses: Generates responses.
        - POST /v1/audio/transcriptions: Generates transcriptions from audio.
        - GET /v1/models: Lists available models for 3rd party tools.

        Requires FastAPI and Uvicorn to be installed.
        appc                   sB   d V   j  D ]}|  q	 jd ur jjddd d S d S )NT   blocktimeout)r   valuesr   r  rp   )rB  r   r   rX   rY   lifespan  s   

z"ServeCommand.run.<locals>.lifespan)rH  *T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsr	  bodyc                    s:    j |d  jr || jj}n |}t|ddS Nr	  text/event-stream
media_type)r%  r   #continuous_batching_chat_completionstater)  generate_chat_completionr1   )r	  rO  outputr   rX   rY   chat_completion  s
   
z)ServeCommand.run.<locals>.chat_completionz/v1/responsesc                    s"    j | d  | }t|ddS rP  )r!  generate_responser1   )r	  rX  r   rX   rY   	responses  s   
z#ServeCommand.run.<locals>.responsesz/v1/audio/transcriptionsc              
      s   |   4 I d H 5}t|d  I d H |d d}td|d j d|d j d|d jd dd	 W d   I d H  n1 I d H sDw   Y   j|d
  	|}t
|ddS )Nr^   r   )r^   r   zReceived file: z; MIME type: z; size:    z.2fz KiBrQ  rR  rS  )formr]   readr   r  filenamecontent_typesizer(  generate_transcriptionr1   )r	  r]  parsed_requestrX  r   rX   rY   audio_transcriptions  s   (

z.ServeCommand.run.<locals>.audio_transcriptionsz
/v1/modelsc                      s   t d  dS )Nlist)r5  data)r0   get_gen_modelsrX   r   rX   rY   get_all_models  s   z(ServeCommand.run.<locals>.get_all_modelsz/healthc                   S   s   t ddiS )Nstatusok)r0   rX   rX   rX   rY   healthcheck  s   z%ServeCommand.run.<locals>.healthcheckhttpc                    s>   | j tptt }|| j_|| I d H }||j t< |S r   )headersr   X_REQUEST_IDrV   uuiduuid4rV  r)  )r	  	call_nextr)  r@  rX   rX   rY   get_or_set_request_id  s   
z/ServeCommand.run.<locals>.get_or_set_request_id)r   r   r   N)r   r-   r   add_middlewarer/   r   warning_oncefastapirN  postdictoptionsr   
middlewareuvicornrunr   r   r   r   )
r   rH  rB  rN  rY  r[  rd  rh  rk  rr  rX   r   rY   r{    s:   
	
"zServeCommand.runc                 C   s6   g d}t rdd |D S dd |D }dd |D S )a.  
        This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
        model working with generate can work.

        This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
        integrations.
        )zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructc                 S   s.   g | ]}|d t j   |dd dqS )r   /r   r2  r5  r3  owned_by)datetimenow	timestampsplit.0r   rX   rX   rY   
<listcomp>  s    z/ServeCommand.get_gen_models.<locals>.<listcomp>c                 S   s   g | ]}t |qS rX   r   r  rX   rX   rY   r  #  s    c                 S   s$   g | ]}|j d |j |jdqS )r   r}  )r2  
created_atr  authorr  rX   rX   rY   r  $  s    r   )r   modelsmodel_infosrX   rX   rY   rg     s   	
zServeCommand.get_gen_modelsr   c              	      s    |d jk}_|r!jdur!jjddd d_\}}t|dr0|jn|t||jj	j
dddd	 jdu rW|j dd
_t j_j  |j|d ddd|j}fdd fdd}||d |S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   rD  r/  Ffifo)r   eos_token_idpad_token_id	use_cacher   	scheduler)rP   	streamingmessagespt)return_tensorsadd_generation_promptc              
   3   s    z6j | d dV  j| D ]#}|jtjkr&j | d dV   W d S j | |jd  |dV  qW d S  ty` } zt	t
| j|  dt
| dV  W Y d }~d S d }~ww )	N	assistantr+  r   rp   r,  r   )r)  r*  r   r.  r/  data: {"error": ""})r?  r  request_id_iterri  r+   FINISHEDgenerated_tokens	Exceptionr   r  rV   cancel_request)r)  r.  resultr  )model_id_and_revisionr   r/  rX   rY   stream_chat_completion^  s2   
 zPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completionc                   s   z't |  d}jj| | jd}||D ]}|V  tdI d H  qW d S  tjyB   j| t	
d| d Y d S w )NF)r)  r   r   zRequest z was cancelled.)r   tolistr  add_requestr   asynciosleepCancelledErrorr  r   warning)_inputsr)  r.  r>  )rP   r   r  rX   rY   cancellation_wrapperz  s   zNServeCommand.continuous_batching_chat_completion.<locals>.cancellation_wrapperr   )process_model_namer  r  rp   load_model_and_processorr   r/  r   rP   r  r  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   )r   r   r)  must_discard_cacher   r   inputsr  rX   )rP   r  r   r  r/  rY   rU  .  s<   





z0ServeCommand.continuous_batching_chat_completionr)   c                 C   sB   | j j}|t v rtj}|S |t v rtj}|S td| )NzUnknown modality: )		__class__rR   r   rG  r   r   r   r   r   )r   model_classnamemodalityrX   rX   rY   get_model_modality  s   zServeCommand.get_model_modalityr  c                 C   s~  g }| D ]}|d g d}|t jkrEt|d tr|d }n"t|d tr@g }|d D ]}|d dkr:||d  q+d|}||d< nr|t jkrt|d tr^|d d|d d nY|d D ]T}|d dkrr|d | qb|d dkrd	|d d
 v rt	dd|d d
 }t
tt|}tjddd}	|	j}
||	j n|d d
 }
|d d|
d qb|| q|S )Nr+  r+  r*  r*  typerb    )r  rb   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r  r  )r   r   
isinstancerV   re  appendjoinr   resubr,   openr
   r  	b64decodetempfileNamedTemporaryFilenamesave)r  r  processor_inputsmessageparsed_messageparsed_contentr*  
image_datar  r^   r  rX   rX   rY   *get_processor_inputs_from_inbound_messages  s@   




z7ServeCommand.get_processor_inputs_from_inbound_messagesc                    sv  j jdurj j|d< |d }|d d dkrdS |d jk}_\}}||}dtD ]}|jj	d 
 v rO| nq?|j|d|d	d
ddd}|j}|ddd}	djj	d 
 v rxd}	t||	dd}
t|jd}d}|r|sj }|d jd |krj}i ||
|d|d  fdd}||
S )a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r  r+  r  r   Ttoolsr  )r  r  r  return_dicttokenizer)  req_0gptossFskip_special_tokensskip_promptr   	input_ids)streamerrP   return_dict_in_generatepast_key_valuesc              
   3   s   d}d }dj jd  v rd}d}fdd}t| d}d	}z#z|  t }jd
dV  | D ]}dj jd  v rH|d}||7 }|rV||v rUd}q7q7d ur| t	 d krhd|_
q7| t	 d kr|  j|d ddV  q7|j
r| j|7  _|jstd|j}	|	d u rq7|	d}	d|_tt|	ddd|d d}
n<|d	krq7d|jvrq7| j|d7  _| j|d8  _|jdk rd	|dd d d }tt|dddd}
j|d |
gdV  q7|d	krj||dV  q7j|dd V  |  W n# ty8 } ztt| d!t| d"V  W Y d }~nd }~ww W |  d S W |  d S |  w )#NFr  r   T<|channel|>final<|message|>c                         j di | }|j_d S NrX   generater  r  r   generate_outputr   r   rX   rY   generate_with_cache     zbServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cachetargetr   r   r  r  
<|return|>r   r   r-  )r)  r+  r,  r   z\"name\": \"(.*?)\"r$   )r  function
_tool_call)r  r1  r  r2  z"arguments": {{})	arguments)r  r1  r  )r)  r+  r-  r   )r*  r   rp   r  r  r  )configarchitecturesr   r   r   r   r?  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr8   r9   r   countr  r  r  r   r  rV   )r  _request_id
filter_cotcot_trace_endr  threadresults
tool_stater  	tool_nametoolr  generation_kwargsr   r  r)  r   tool_model_familyrX   rY   r    s   





zEServeCommand.generate_chat_completion.<locals>.stream_chat_completion)r   r   r  r  r  r  r  _MODELS_WITH_TOOL_SUPPORTr  r  r   r  r   r  r   r!   r   rP   is_continuationr  get_seq_lengthshape)r   r   r  r  r   r  r  supported_model_familiesr  r  generation_streamerrP   r  seq_lenr  rX   r   rY   rW    sf   




zz%ServeCommand.generate_chat_completionc                    s   d jk}_\}td tr6dv r)dd dgng }|dd d nUtd trjdv red d d dkrXdd dgd }n3d }d |d d	< n&d }n!td trdv r}dd dgng }|d  ntd
|j	|ddd}|
j}ddd}djjd  v rd}t||dd}tjd}d}r|sՈj }	|d jd |	krՈj}|t|||d|d  fdd}
|
|S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  rc   r   r+  r*  z%inputs should be a list, dict, or strTr  )r  r  rn   r  r  Fr  r  Nr  r  )r  attention_maskr  rP   r  r  c                 3   s4   d}d }dj jd  v rd}d}fdd}t| d}d}d}d}zzz|  t }	td	|td
 |	dddddiidg g dddddd}
|d7 }	|
V  t
d|td
 |	dddddiidg g dddddd}|d7 }	|V  td||td dddg dd}|d7 }	|V  tdd |||td d!g d"d#}|d7 }	|V  d!}| D ]=}dj jd  v r|d$}||7 }|r||v rd}d!}qqtd%d ||||d!d&d'gd(}|d7 }	|V  qtd)d ||d|d!d&d'gd*}|d7 }	|V  td+d |||td |jg d"d#}|d7 }|d7 }	|V  td,||td dd-d|jgg d.d}|d7 }|d7 }	|V  td/|td
 |	d-ddddii|jgdg ddddd0d}|d7 }	|V  |  W nc ty } zVtd1t|  td2|t|d3}|d7 }	|V  td4|td
 |	d5ddddiig dg dddtd6t|d7d8d}|d7 }	|V  W Y d }~nd }~ww W |  d S W |  d S |  w )9NFr  r   Tr  c                     r  r  r  r  r  rX   rY   r    r  zTServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cacher  zresponse.createdresp_queuedr  formatr  rb   r@  r}   r   rs   )r2  r  ri  r   r  rb   r5  r  rX  r}   rj   rs   )r  sequence_numberr@  r$   zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )r2  r  ri  r+  r*  )r  r  output_indexitemzresponse.content_part.addedoutput_textr   )r  rb   annotations)r  item_idr  r  content_indexpartr  zresponse.output_text.deltagX@)tokenlogprob)r  r  r  r  r  r0  rr   zresponse.output_text.done)r  r  r  r  r  rb   rr   zresponse.content_part.donezresponse.output_item.done	completed)r2  r  ri  r+  r*  r  zresponse.completed)r2  r  ri  r   r  rb   rX  r5  r  r}   rj   rs   z"Exception in response generation: r  )r  r  r  zresponse.failedfailedserver_error)coder  )r2  r  ri  r   r  rb   rX  r5  r  r}   rj   rs   r  ) r  r  r   r   r   r<  r?   r;   r   rA  rC   rD   rF   r=   rG   r  rH   rI   r>   rb   rE   r  r<   r  r  r  r   r  rV   rA   rB   r@   )r  r  r  r  r  r  r  r  r  r  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedr  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedr  error_eventresponse_failedr  r   r  r   r)  r   rX   rY   stream_response  s  





	
	


%z7ServeCommand.generate_response.<locals>.stream_response)r  r  r  r  rV   r  re  rw  r   r  r  r   r   r  r  r   r!   r   rP   r  r  r  r  r   	ones_like)r   r   r  r   r  r  r  rP   r  r	  r-  rX   r,  rY   rZ    sZ   


	 
czServeCommand.generate_responsec           
         s   t  std| |d }| |\tjddd}t|jd}jj	}t
|d }tj||dd\}}||dd	j  d
 j d
< ||dd fdd}	|	 S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr  r  r^   )srmonor  )sampling_rater  input_features)r  rP   r  c                  3   sH    j di  } j| jddd }t|d}|jdd V  d S )NT)r  r   )rb   r7  rX   )r  batch_decode	sequencesr2   r=  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorr  rX   rY   _generate_transcription  s
   
zDServeCommand.generate_transcription.<locals>._generate_transcription)r   r   r  load_audio_model_and_processorr!   r/  r   rP   feature_extractorr1  ior
   librosaloadr  r   r   )
r   r   r  r  rP   model_sampling_rateaudio_bytesaudio_array_r<  rX   r8  rY   rb    s2   z#ServeCommand.generate_transcriptionc                 C   sx   | dp	| d}d}| jdu rd}n#t| jt|kr d}ntt| jD ]}| j| || kr6d} nq'|| _|S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  r
  TNF)r   r  lenrange)r   r   r  req_continues_last_messagesirX   rX   rY   r    s   
zServeCommand.is_continuationr'   c                 C   s@   | j rtd| j| j| j| jd}|S | jrtdd}|S d}|S )a  
        Returns the quantization config for the given CLI arguments.

        Args:
            args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        T)r   bnb_4bit_compute_dtyper   bnb_4bit_use_double_quantbnb_4bit_quant_storage)r   N)r   r'   r   r   r   r   )r   quantization_configrX   rX   rY   get_quantization_config   s    z$ServeCommand.get_quantization_configmodel_idc                 C   s*   | j jdur
| j j}d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        N@z@main)r   r   )r   rO  rX   rX   rY   r    s
   
zServeCommand.process_model_namer  c                 C   s>  | j }td|  d|v r|dd\}}n|d}}tj|||jd}|jdv r.|jntt	|j}| 
|}||j|d|jd}|d	urK||d
< tj|fi |}	tt|	jd }
|
j|fi |}t|dd	d	u rs||j}|jjd	u o~|jjdk}|jjd	uo|jjdk }|s|rd|j_td|  ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        zLoading rP  r$   main)revisionr   )r   Nr   )rR  r   r   
device_mapr   NrM  r   hf_device_map   r\  zLoaded model )r   r   r   r  r&   from_pretrainedr   r   getattrr   rN  r   r   r   r  r  r   rP   r   
max_length)r   r  r   rO  rR  data_processorr   rM  model_kwargsr  architecturer   has_default_max_lengthhas_short_max_new_tokensrX   rX   rY   _load_model_and_data_processor.  sB   

z+ServeCommand._load_model_and_data_processorc                 C   t   || j vs| j |  r#| |\}}t|| jj|d| j |< ||fS | j |   | j | j}| j | j}||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   	r   r   r^  r   r   r   r   r   r   )r   r  r   r   rX   rX   rY   r  k  s   
z%ServeCommand.load_model_and_processorc                 C   r_  )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r`  ra  )r   r  r:  r;  rX   rX   rY   r=    s   
z+ServeCommand.load_audio_model_and_processor)r   NNNNNNN).rR   rS   rT   staticmethodr   r   r   r   rw  r   r  r  r!  r%  r(  rV   r   r   re  r   r   r?  rA  r{  	functoolscacheanyrg  r   rU  r   r  r  r   rW  rZ  rb  ra   r  rN  r  r^  tupler  r    r=  rX   rX   rX   rY   r     s    -
1

	

8_-\- J  '0=

r   __main__)r  r  r   r  enumrc  r   r?  r   r  r  r   r<  ro  argparser   r   collections.abcr   r   r   
contextlibr   dataclassesr   r	   r
   r   typingr   r   r   huggingface_hubr   huggingface_hub.constantsr   tokenizers.decodersr   r   &transformers.models.auto.modeling_autor   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r   r    r!   utilsr"   r#   r%   r   r&   r'   r(   r)   generation.continuous_batchingr*   r+   r@  PILr,   r   rz  ru  r-   r.   fastapi.middleware.corsr/   fastapi.responsesr0   r1    openai.types.audio.transcriptionr2   .openai.types.audio.transcription_create_paramsr3   openai.types.chatr4   'openai.types.chat.chat_completion_chunkr5   r6   r7   r8   r9   *openai.types.chat.completion_create_paramsr:   openai.types.responsesr;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   -openai.types.responses.response_create_paramsrJ   pydanticrK   rL   rM   rO   r\   r]   r  r#  r&  r  r$  r'  r   rR   r   r  re  r  r  rn  Enumr   r   rw  r   r   r   r   r   r   r{  rX   rX   rX   rY   <module>   s    	D


	
;2q         
S