o
    پiW                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlmZmZmZmZmZmZmZmZ ee
ddd  ddlZddlZddlZddlmZ ddlmZ dd	lmZ dd
lm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z>m?Z? ddl@mAZAmBZB ddlCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZO ddlPmQZQ ddlRmSZS eTeUZVeWeX  eI ZY	d*de?de>dee: dee:e8f fddZZG dd deZ[de?fddZ\ded edee fd!d"Z]de?de>d#efd$d%Z^	d*de?d&ed#ed'edee> dee:e8ee e>f fd(d)Z_dS )+zy
The entry point of inference server. (SRT = SGLang Runtime)

This file implements python APIs for the inference engine.
    N)AsyncIteratorCallableDictIteratorListOptionalTupleUnion_register_atexitc                  O   s   d S N )argskwargsr   r   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/entrypoints/engine.py<lambda>!   s    r   )
EngineBase)$run_data_parallel_controller_process)run_detokenizer_process)CloseSessionReqInput!DestroyWeightsUpdateGroupReqInputEmbeddingReqInputGenerateReqInputGetWeightsByNameReqInputInitWeightsUpdateGroupReqInput"LoadLoRAAdapterFromTensorsReqInputLoadLoRAAdapterReqInputMultimodalDataInputFormatOpenSessionReqInputReleaseMemoryOccupationReqInputResumeMemoryOccupationReqInputRpcReqInputRpcReqOutputUnloadLoRAAdapterReqInputUpdateWeightFromDiskReqInput$UpdateWeightsFromDistributedReqInputUpdateWeightsFromIPCReqInputUpdateWeightsFromTensorReqInput)MultiTokenizerRouter)run_scheduler_process)TemplateManager)TokenizerManager)?parse_remote_instance_transfer_engine_info_from_scheduler_infos)PortArgs
ServerArgs)process_tracing_inittrace_set_thread_info)MultiprocessingSerializerassert_pkg_versionconfigure_loggerget_bool_env_varget_zmq_socketis_cudakill_process_tree launch_dummy_health_check_servermaybe_reindex_device_id
numa_utilsset_prometheus_multiproc_dir
set_ulimit)TorchMemorySaverAdapter)__version__server_args	port_argsTokenizerManagerClassreturnc                 C   s8   |pt }|| |}t }|j|| j| j| jd ||fS )N)tokenizer_manager
model_pathchat_templatecompletion_template)r*   r)   initialize_templatesrC   rD   rE   )r>   r?   r@   rB   template_managerr   r   r   init_tokenizer_managera   s   
rH   c                4   @   s  e Zd ZU dZeZeed< eeZ	e
ed< eeZe
ed< eeZe
ed< dd Z																											dd
eeee ef  deeee ef  deeeee  ee f  dee dee dee deeee ef  deeee ef  deeee ef  deeeee  ee f  deeee   deeee ef  dedededeeee ef  deeee ef  deeee ef  dee dee deeee ef  dee d ee d!eeee f f0d"d#Z																											dd
eeee ef  deeee ef  deeeee  ee f  dee dee dee deeee ef  deeee ef  deeee ef  deeeee  ee f  deeee   deeee ef  dedededeeee ef  deeee ef  deeee ef  dee dee deeee ef  dee d ee d!eeee f f0d$d%Z							dd
eeee ee eee  f dee dee dee d&ee deeeee  ee f  dee deeee ef  d!efd'd(Z							dd
eeee ee eee  f dee dee dee d&ee deeeee  ee f  dee deeee ef  d!efd)d*Zd
eeee   d!efd+d,Zd-d. Zd/d0 Z d1d2 Z!d3d4 Z"	dd5ed6ee d!efd7d8Z#d6ed!dfd9d:Z$d;d< Z%d=d> Z&d?d@ Z'dAdB Z(dCdD Z)dEdF Z*	GddHedIedJedKedLedMefdNdOZ+dLefdPdQZ,	R	S	ddTe-e dUe-e dVe-e-e  dLedWedXee fdYdZZ.		Sdd[ee/ee0j1f  dXee dWefd\d]Z2	dd^edXee fd_d`Z3	Sddaeeef dWefdbdcZ4ddeedfefdgdhZ5diedjee/ee0j1f  dkefdldmZ6ddiedednefdodpZ7diefdqdrZ8ddseee  fdtduZ9ddseee  fdvdwZ:dxdy Z;	 dzefd{d|Z<d}d~ Z=dd Z>							ddeeeee f  deeeee eee  f  deee  deded!eee?  fddZ@							ddeeeee f  deeeee eee  f  deee  deded!eee?  fddZAdS )Enginea  
    The entry point to the inference engine.

    - The engine consists of three components:
        1. TokenizerManager: Tokenizes the requests and sends them to the scheduler.
        2. Scheduler (subprocess): Receives requests from the Tokenizer Manager, schedules batches, forwards them, and sends the output tokens to the Detokenizer Manager.
        3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.

    Note:
    1. The HTTP server, Engine, and TokenizerManager all run in the main process.
    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
    server_args_classinit_tokenizer_manager_funcrun_scheduler_process_funcrun_detokenizer_process_funcc           	      K   sD  d|v r	|d }nd|vrd|d< | j di |}|| _td| t| j t|| j| j	| j
d\}}}}|| _|| _|d | _|| _t|| _td}| jjdkrct|tj| jjd| _nd	| _|jrt|jd
 d}|jdkryd}n|jdkrd}t| zt | _W d	S  t y   t! | _t"| j Y d	S w )z
        The arguments of this function is the same as `sglang/srt/server_args.py::ServerArgs`.
        Please refer to `ServerArgs` for the documentation.
        r>   	log_levelerrorserver_args=)r>   rK   rL   rM   r      TNsglang	TokenizerprefillzPrefill TokenizerdecodezDecode Tokenizerr   )#rJ   r>   loggerinfoatexitregistershutdown_launch_subprocessesrK   rL   rM   rB   rG   scheduler_infor?   r+   $remote_instance_transfer_engine_infozmqContext	node_rankr4   DEALERrpc_ipc_namesend_to_rpcenable_tracer.   otlp_traces_endpointdisaggregation_moder/   asyncioget_running_looploopRuntimeErrornew_event_loopset_event_loop)	selfr   r>   rB   rG   scheduler_infosr?   contextthread_labelr   r   r   __init__   sV   






zEngine.__init__NFpromptsampling_params	input_ids
image_data
audio_data
video_datareturn_logproblogprob_start_lentop_logprobs_numtoken_ids_logprob	lora_pathcustom_logit_processorreturn_hidden_statesreturn_routed_expertsstreambootstrap_hostbootstrap_portbootstrap_roomdata_parallel_rankexternal_trace_headerridsession_paramspriorityrA   c                    s  j jr%|du rtd n|dk rtd|j jkr%tdj j tdi d|d|d|d	|d
|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d|d|}j|d |r fdd}| S j	
  }|S ) 
        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
        Please refer to `GenerateReqInput` for the documentation.
        N7data_parallel_rank not provided, using default dispatchr   'data_parallel_rank must be non-negativez.data_parallel_rank must be less than dp_size: textrt   rs   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   c                  3   s6    	 zj   } | V  W n
 ty   Y d S w qr   )ri   run_until_complete	__anext__StopAsyncIteration)chunk	generatorrm   r   r   generator_wrapper  s   
z*Engine.generate.<locals>.generator_wrapperr   )r>   enable_dp_attentionrV   debug
ValueErrordp_sizer   rB   generate_requestri   r   r   )rm   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   objr   retr   r   r   generate   s|   (	
zEngine.generatec                    s  | j jr)|du rtd n|dk rtd|| j jkr)td| j jd  dtd|  td!i d	|d
|d|d|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d|d|}| j|d}|d u r|S |	 I dH S )"r   Nr   r   r   z(data_parallel_rank must be in range [0,    ]zdata_parallel_rank: r   rt   rs   ru   rv   rw   rx   ry   rz   r{   r|   r~   r   r   r}   r   r   r   r   r   r   r   r   Tr   )
r>   r   rV   r   r   r   r   rB   r   r   )rm   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   async_generate)  s|   )	
zEngine.async_generate
dimensionsc	              
   C   s:   t ||||||||d}	| j|	d}
| j|
 }|S )
        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
        Please refer to `EmbeddingReqInput` for the documentation.
        r   ru   rv   rw   r   r|   r   r   Nr   rB   r   ri   r   r   )rm   rr   ru   rv   rw   r   r|   r   r   r   r   r   r   r   r   encode}  s   
zEngine.encodec	              
      s6   t ||||||||d}	| j|	d}
|
 I dH S )z
        Asynchronous version of encode method.

        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
        Please refer to `EmbeddingReqInput` for the documentation.
        r   N)r   rB   r   r   )rm   rr   ru   rv   rw   r   r|   r   r   r   r   r   r   r   async_encode  s   
zEngine.async_encodec                 C   s.   t |dd}| j|d}| j| }|S )r   T)r   is_cross_encoder_requestNr   )rm   rr   r   r   r   r   r   r   rerank  s   zEngine.rerankc                 C   s   t t dd dS )zShutdown the engineF)include_parentN)r6   osgetpidrm   r   r   r   rZ     s   zEngine.shutdownc                 C   s   | S r   r   r   r   r   r   	__enter__  s   zEngine.__enter__c                 C   s   |    dS )NF)rZ   )rm   exc_type	exc_value	tracebackr   r   r   __exit__  s   zEngine.__exit__c                 C   s   | j | j S r   )ri   r   rB   flush_cacher   r   r   r   r     s   zEngine.flush_cachecapacity_of_str_len
session_idc                 C   "   t ||d}| j| j|dS )ab  Open a session for multi-turn conversation with shared context.

        Args:
            capacity_of_str_len: Maximum string length capacity for the session.
            session_id: Optional session ID. If not provided, a UUID will be generated.

        Returns:
            The session ID (either the provided one or a newly generated UUID).
        )r   r   N)r   ri   r   rB   open_session)rm   r   r   r   r   r   r   r     s   zEngine.open_sessionc                 C   s$   t |d}| j| j|d dS )zsClose a session and release its resources.

        Args:
            session_id: The session ID to close.
        )r   N)r   ri   r   rB   close_session)rm   r   r   r   r   r   r     s   
zEngine.close_sessionc                 K   s   | j | jjdi | d S )Nr   )ri   r   rB   start_profilerm   r   r   r   r   r     s   zEngine.start_profilec                 C      | j | j  d S r   )ri   r   rB   stop_profiler   r   r   r   r     s   zEngine.stop_profilec                 C   r   r   )ri   r   rB    start_expert_distribution_recordr   r   r   r   r        z'Engine.start_expert_distribution_recordc                 C   r   r   )ri   r   rB   stop_expert_distribution_recordr   r   r   r   r     r   z&Engine.stop_expert_distribution_recordc                 C   r   r   )ri   r   rB   dump_expert_distribution_recordr   r   r   r   r     r   z&Engine.dump_expert_distribution_recordc                 C   s4   | j | j }i t| jj| j|tdS )N)internal_statesversion)	ri   r   rB   get_internal_statedataclassesasdictr>   r\   r=   )rm   r   r   r   r   get_server_info  s   zEngine.get_server_infoncclmaster_addressmaster_portrank_offset
world_size
group_namebackendc                 C   *   t ||||||d}| j| j|dS )z"Initialize parameter update group.)r   r   r   r   r   r   N)r   ri   r   rB   init_weights_update_group)rm   r   r   r   r   r   r   r   r   r   r   r        
z Engine.init_weights_update_groupc                 C       t |d}| j| j|dS )zDestroy parameter update group.)r   N)r   ri   r   rB   destroy_weights_update_group)rm   r   r   r   r   r   r   (  s   z#Engine.destroy_weights_update_groupweight_update_groupTnamesdtypesshapesr   load_formatc                 C   r   )z'Update weights from distributed source.)r   r   r   r   r   r   N)r$   ri   r   rB   update_weights_from_distributed)rm   r   r   r   r   r   r   r   r   r   r   r   4  r   z&Engine.update_weights_from_distributednamed_tensorsc                    sL   |dkr }n fddt | jjD }t|||d}| j| j|dS )zUpdate weights from distributed source. If there are going to be more updates, set `flush_cache` to be false
        to avoid duplicated cache cleaning operation.flattened_bucketc                    s   g | ]}t  qS r   )r0   	serialize).0_r   r   r   
<listcomp>U  s    z5Engine.update_weights_from_tensor.<locals>.<listcomp>)serialized_named_tensorsr   r   N)ranger>   tp_sizer&   ri   r   rB   update_weights_from_tensor)rm   r   r   r   r   r   r   r   r   r   J  s   

z!Engine.update_weights_from_tensorrC   c                 C   r   )a  Update the weights from disk inplace without re-launching the engine.

        This method allows updating the model weights from disk without restarting
        the engine. It can be used to load a different model or update weights with
        new training.
        )rC   r   N)r#   ri   r   rB   update_weights_from_disk)rm   rC   r   r   r   r   r   r   b  s   zEngine.update_weights_from_diskzmq_handlesc                 C   r   )z:Update weights from IPC for checkpoint-engine integration.)r   r   N)r%   ri   r   rB   update_weights_from_ipc)rm   r   r   r   r   r   r   r   v  s   zEngine.update_weights_from_ipcd   nametruncate_sizec                 C   r   )zGet weights by parameter name.)r   r   N)r   ri   r   rB   get_weights_by_name)rm   r   r   r   r   r   r   r     s   zEngine.get_weights_by_name	lora_nametensorsconfig_dictc                 C   s2   t j|dd}t|||d}| j| j|d S )NT)
output_str)r   r   serialized_tensors)r0   r   r   ri   r   rB   load_lora_adapter_from_tensors)rm   r   r   r   r   lora_reqr   r   r   r     s   z%Engine.load_lora_adapter_from_tensorspinnedc                 C   s$   t |||d}| j| j|dS )z8Load a new LoRA adapter without re-launching the engine.)r   r|   r   N)r   ri   r   rB   load_lora_adapter)rm   r   r|   r   r   r   r   r   r     s   zEngine.load_lora_adapterc                 C   r   )z6Unload a LoRA adapter without re-launching the engine.)r   N)r"   ri   r   rB   unload_lora_adapter)rm   r   r   r   r   r   r     s   
zEngine.unload_lora_adaptertagsc                 C       t |d}| j| j|d S N)r   )r   ri   r   rB   release_memory_occupationrm   r   r   r   r   r   r        
z Engine.release_memory_occupationc                 C   r   r   )r   ri   r   rB   resume_memory_occupationr   r   r   r   r     r   zEngine.resume_memory_occupationc                 C   s   | j | j  dS )a<  
        To maintain a high performance server with low latency, we want to reduce the
        stalls caused by the garbage collector scanning through a large number of objects.

        It is usually helpful to start the server and warm it up with real requests to
        initialize many of the long-lived objects that do not need to be garbage collected.

        After sufficient warmup, we can call this function to freeze the garbage collector
        so that all objects created before this point are considered out of scope for garbage
        collection.
        N)ri   r   rB   	freeze_gcr   r   r   r   r     s   zEngine.freeze_gcmethodc                 K   sH   t ||d}| j| | jtj}t|tsJ |js"J |j	d S )N)r   
parameters)
r    rc   
send_pyobj
recv_pyobjr^   BLOCKY
isinstancer!   successmessage)rm   r   r   r   recv_reqr   r   r   collective_rpc  s
   zEngine.collective_rpcc                 K      | j di | d S )Nsave_remote_model)r  r  r   r   r   r   r       zEngine.save_remote_modelc                 K   r  )Nsave_sharded_model)r  r  r   r   r   r   r    r  zEngine.save_sharded_modelqueryitemslabel_token_idsapply_softmax
item_firstc              
   C   s    | j | jj|||||ddS )a  
        Score the probability of specified token IDs appearing after the given (query + item) pair. For example:
        query = "<|user|>Is the following city the capital of France? "
        items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"]
        label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No"
        item_first = False

        This would pass the following prompts to the model:
        "<|user|>Is the following city the capital of France? Paris <|assistant|>"
        "<|user|>Is the following city the capital of France? London <|assistant|>"
        "<|user|>Is the following city the capital of France? Berlin <|assistant|>"
        The api would then return the probabilities of the model producing "Yes" and "No" as the next token.
        The output would look like:
        [[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]


        Args:
            query: The query text or pre-tokenized query token IDs. Must be provided.
            items: The item text(s) or pre-tokenized item token IDs. Must be provided.
            label_token_ids: List of token IDs to compute probabilities for. If None, no token probabilities will be computed.
            apply_softmax: Whether to normalize probabilities using softmax.
            item_first: If True, prepend items to query. Otherwise append items to query.

        Returns:
            List of dictionaries mapping token IDs to their probabilities for each item.
            Each dictionary in the list corresponds to one item input.

        Raises:
            ValueError: If query is not provided, or if items is not provided,
                      or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
        Nr	  r
  r  r  r  request)ri   r   rB   score_requestrm   r	  r
  r  r  r  r   r   r   score  s   'zEngine.scorec                    s    | j j|||||ddI dH S )zh
        Asynchronous version of score method.

        See score() for detailed documentation.
        Nr  )rB   r  r  r   r   r   async_score  s   zEngine.async_score)NNNNNNFNNNNNFFFNNNNNNNN)NNNNNNNr   )r   )r   TN)NT)T)r   )F)NNNFF)B__name__
__module____qualname____doc__r-   rJ   __annotations__staticmethodrH   rK   r   r(   rL   r   rM   rq   r   r	   r   strr   intr   boolr   r   r   r   r   r   r   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   listr   r   torchTensorr   r   r   r   r   r   r   r   r   r   r  r  r  floatr  r  r   r   r   r   rI   v   s  
 E	 !"#
_	 !"#
W	

 	



	








	


4

rI   c                 C   sp  dt jvs| jrtt| jt jd< dt jvs| js| jr*tt| jp$| jt jd< dt jd< dt jd< t jddd	krBdt jd< t jd
d u rOdt jd
< t jdd u r\dt jd< dt  dt	dd t jd< | j
rst  t  tds| jdkrtddd trtddd | jd u rdd }ttj| ntd| j  ttj| j tjddd d S ) NNCCL_CUMEM_ENABLENCCL_NVLS_ENABLE8CUDA_DEVICE_MAX_CONNECTIONSAUTOCUDA_MODULE_LOADINGTRTLLM_ENABLE_PDL10CUTE_DSL_LOG_LEVEL30CUTE_DSL_LOG_TO_CONSOLEzsglang-run--r   i SGLANG_RUN_ID$SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK
flashinferflashinfer_pythonz0.6.3zPlease uninstall the old version and reinstall the latest version by following the instructions at https://docs.flashinfer.ai/installation.html.z
sgl-kernelz0.3.21zSPlease reinstall the latest version with `pip install sgl-kernel --force-reinstall`c                 S   s   t d tt  d S )NzIReceived sigquit from a child process. It usually means the child failed.)rV   rO   r6   r   r   )signumframer   r   r   launch_phase_sigquit_handlerd  s   z:_set_envs_and_config.<locals>.launch_phase_sigquit_handlerzUsing custom SIGQUIT handler: spawnT)force)r   environenable_symm_memr  r  enable_nccl_nvlsgettimerandomrandintenable_metricsr:   r;   r3   attention_backendr1   _is_cudacustom_sigquit_handlersignalSIGQUITrV   rO   mpset_start_method)r>   r4  r   r   r   _set_envs_and_config&  sX   









rF  scheduler_pipe_readersscheduler_procsc              	   C   s   g }t t| D ]<}z| |  }W n" ty4   td| d ||   td|| j   w |d dkr?td|	| q|S )z@Wait for the model to finish loading and return scheduler infos.zRank z< scheduler is dead. Please check if there are relevant logs.zExit code: statusreadyz;Initialization failed. Please see the error messages above.)
r   lenrecvEOFErrorrV   rO   joinexitcoderj   append)rG  rH  rn   idatar   r   r   _wait_for_scheduler_readyv  s$   
rS  rL   c                 C   sF  g }| j dkrtj| jd}g }t| j| j d}t| j| j d}t|| j|  || j| d  }|}	| j	|	 }
t|
| j|	  |
| j|	 d  }|D ]}|D ]}t
jdd\}}| j|| |
  ||
 | j  }| jrq| j nd}| j	| | j }|| | j }|| j	| j  }|| j	| j  | j	| j | j  }t|G}t
j|| |||||||d |f
d}| " t| | |  W d    n1 sw   Y  W d    n1 sw   Y  W d    n1 sw   Y  || || qQqM||fS t
jdd\}}|g}t
jtt| |||dd}|  || ||fS )Nr   )enableF)duplextargetr   )r>   r?   pipe_writerrL   )rW  r   )r   r<   createenable_memory_savermaxpp_sizennodesr   r`   r   rD  Pipebase_gpu_idgpu_id_stepr   attn_cp_sizemoe_dp_sizeep_sizer8   Processconfigure_subprocessr9   startrP  r   dict)r>   r?   rL   rH  memory_saver_adapterrG  pp_size_per_nodennodes_per_pp_rankpp_rank_rangennodes_per_tp_grouptp_size_per_nodetp_rank_rangepp_ranktp_rankreaderwritergpu_idattn_dp_sizeattn_tp_sizeattn_cp_rankmoe_dp_rankmoe_ep_rankprocr   r   r   _launch_scheduler_processes  s   







 
F	
rz  rK   rM   c                 C   s*  t |  t|  |   |du rt| }td|  t| ||d\}}| jdkrbt	||}t
ddkr=dd||fS t| j| j| j |D ]}|  td|j d|j  qHdd||fS tj|| |fd	}	|	  | jdkr||| |\}
}nt| |}
d}t	||}|d
 d |
_|
|||fS )z
    Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
    NrP   )r>   r?   rL   r   "SGLANG_BLOCK_NONZERO_RANK_CHILDRENr)  z$Scheduler or DataParallelController z terminated with rV  r   max_req_input_len)r2   rF  check_server_argsr,   init_newrV   rW   rz  r`   rS  r   getenvr7   hostportr>  rN  rO   pidrO  rD  rd  rf  tokenizer_worker_numr'   r|  )r>   rK   rL   rM   r?   rH  rG  rn   ry  detoken_procrB   rG   r   r   r   r[     sT   






r[   r   )`r  rg   rX   r   loggingmultiprocessingrD  r   r<  rB  	threadingr;  typingr   r   r   r   r   r   r   r	   setattrr  uvloopr^   !sglang.srt.entrypoints.EngineBaser   ,sglang.srt.managers.data_parallel_controllerr   'sglang.srt.managers.detokenizer_managerr   sglang.srt.managers.io_structr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   )sglang.srt.managers.multi_tokenizer_mixinr'   sglang.srt.managers.schedulerr(   $sglang.srt.managers.template_managerr)   %sglang.srt.managers.tokenizer_managerr*   ;sglang.srt.model_loader.remote_instance_weight_loader_utilsr+   sglang.srt.server_argsr,   r-   sglang.srt.tracing.tracer.   r/   sglang.srt.utilsr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   +sglang.srt.utils.torch_memory_saver_adapterr<   sglang.versionr=   	getLoggerr  rV   set_event_loop_policyEventLoopPolicyr@  rH   rI   rF  rS  rz  r[   r   r   r   r   <module>   s   (T8


     5P

k