o
    پi                    @   s  U d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZmZmZ eeddd  ddlZddlZddlZddlZddlZdd	lmZmZmZm Z  dd
l!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJ ddlKmLZL ddlMmNZN ddlOmPZP ddlQmRZR ddlSmTZT ddlUmVZVmWZW ddlXmYZY ddlZm[Z[ ddl\m]Z] ddl^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~ ddlmZmZmZmZmZmZ dd lmZ dd!lmZmZ dd"lmZ dd#lmZ dd$lmZ dd%lmZmZ dd&lmZmZ dd'lmZmZmZmZmZmZ dd(lmZmZmZ dd)lmZ dd*lmZ eeZee  eed+d,Zeed-d.ZejG d/d0 d0Zdaee ed1< d2efd3d4Zd5efd6d7Zd5efd8d9Ze	d:efd;d<Zeeed=rdnd>d?Zeje$d@gdAd@gd@gdB ddClmZ ee eedDe dEefdFdGZee"dDe dEe"fdHdGZdIe fdJdKZedLedMdDe d5e'fdNdOZedPdQdR ZedSdTdU ZedVedWdXdY ZedZd[d\ Zed]d^d_ Zed`dadb ZejdcdddegdfeejɃdgevdDe fdhdiZejˠdjdkkrejdlddgdfeejɃdmedDe fdndoZejdpdddegdfdgehdDe fdqdrZejdsdddegdfdgegdDe fdtduZejdvdddegdfdgegdDe fdwdxZejdydzddgdfeejɃd{d| Zejd}dzddgdfeejɃd~d ZejdddgdfeejɃdd ZejddegdfeejɃdge`fddZejddgdfeejɃdd ZedeejɃdd ZejddzddgdfeejɃdVdgeeq fddZejddzddgdfeejɃdd ZejddzddgdfeejɃdd ZejddzddgdfeejɃdd ZejddzddgdfeejɃdd ZejddzddgdfeejɃdd ZedeejɃdgeydDe fddZedeejɃdgejdDe fddZedeejɃdgetdDe fddZedeejɃdVdefddZedeejɃdgekdDe fddZedeejɃdgeedDe fddZedeejɃdge|dDe fddZedeejɃdgezdDe fddZedeejɃdge{dDe fddZedeejɃdge}dDe fddZejddzddgdfeejɃdgeidDe fddZejddzddgdfeejɃdgerdDe fddZejddzddgdfeejɃdgesdDe fddĄZedšeejɃdgeadDe fddǄZejddzddgdfeejɃdgewdDe fddʄZejdddgdfeejɃdgemdDe fdd̈́ZejdddgdfdgeldDe fddЄZejdddgdfeejɃdgexdDe fddӄZejddzddgdfdgendDe fddքZejddzddgdfdgebdDe fddلZejddzddgdfeejɃdgecdDe fdd܄ZedݡeejɃdge_dDe fdd߄ZeddgeodDe fddZeddgeudDe fddZedeejɃdgepdDe fddZedeejɃdgeddDe fddZejdeegddDe?dIe fddZejdeegddDe=dIe fddZejde&eegddDeAdIe fddZejde&eegddDe>dIe fddZejde&eegdejde&eegdddDeGdIe fddZejݐd e&eegdejݐde&eegdddDe@dIe fddZejde&ddd Zejde&dd	efd
dZejݐdeegddDeFdIe fddZ ejݐdeegddDedIe fddZeddedIe fddZeݐddedIe fddZejȐddddegeegddDeHdIe fddZeejˠddeejˠdddd  Zeejˠd!d"dDe7dIe fd#d$Zeejˠd%d&dDe8dIe fd'd(Z	eejˠd)d*dIe fd+d,Z
eejˠd-d.dDe9dIe fd/d0Zejݐd1eegddDe.dIe fd2d3Zejݐd4eegddDe-dIe fd5d6Zed7d5e'fd8d9Zeݐd:dDe=dIe fd;d<Zeejˠd=d>d?e~dIe fd@dAZdBdC ZejfdDed5e&fdEdFZdGZdHefdIdJZdefdHedKeeg df  dLefdMdNZdOdP Ze3e5e4edfdHedQedRedSedLedKeeg df  fdTdUZdS (W  z
The entry point of inference server. (SRT = SGLang Runtime)

This file implements HTTP APIs for the inference engine via fastapi.
    N)asynccontextmanager)
HTTPStatus)AnyAsyncGeneratorAsyncIteratorCallableDictListOptionalUnion_register_atexitc                  O   s   d S N )argskwargsr   r   V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/entrypoints/http_server.py<lambda>)   s    r   )DependsFastAPIHTTPExceptionRequest)RequestValidationError)CORSMiddleware)ORJSONResponseResponseStreamingResponse)FAKE_BOOTSTRAP_HOSTDisaggregationMode)AnthropicCountTokensRequestAnthropicMessagesRequest)AnthropicServing)_launch_subprocessesinit_tokenizer_managerrun_detokenizer_processrun_scheduler_process)OllamaChatRequestOllamaGenerateRequestOllamaShowRequest)OllamaServing)ChatCompletionRequestClassifyRequestCompletionRequestDetokenizeRequestEmbeddingRequestErrorResponse	ModelCard	ModelListResponsesRequestScoringRequestTokenizeRequestV1RerankReqInput)OpenAIServingChat)OpenAIServingClassify)OpenAIServingCompletion)OpenAIServingEmbedding)OpenAIServingRerank)OpenAIServingScore)OpenAIServingDetokenizeOpenAIServingTokenize)execute_warmups)envs)FunctionCallParser) AbortReqAttachHiCacheStorageReqInputCheckWeightsReqInputCloseSessionReqInputConfigureLoggingReqContinueGenerationReqInput!DestroyWeightsUpdateGroupReqInputDumperControlReqInputEmbeddingReqInputGenerateReqInputGetWeightsByNameReqInput-InitWeightsSendGroupForRemoteInstanceReqInputInitWeightsUpdateGroupReqInput"LoadLoRAAdapterFromTensorsReqInputLoadLoRAAdapterReqInputOpenSessionReqInputParseFunctionCallReqPauseGenerationReqInputProfileReqInputReleaseMemoryOccupationReqInputResumeMemoryOccupationReqInput#SendWeightsToRemoteInstanceReqInputSeparateReasoningReqInputSetInternalStateReqSlowDownReqInputUnloadLoRAAdapterReqInputUpdateWeightFromDiskReqInput$UpdateWeightsFromDistributedReqInputUpdateWeightsFromIPCReqInputUpdateWeightsFromTensorReqInputUpdateWeightVersionReqInputVertexGenerateReqInput)MultiTokenizerRouterTokenizerWorkerget_main_process_id$monkey_patch_uvicorn_multiprocessingread_from_shared_memorywrite_data_for_multi_tokenizer)TemplateManager)ServerStatusTokenizerManager)enable_func_timer)?parse_remote_instance_transfer_engine_info_from_scheduler_infos)ReasoningParser)PortArgs
ServerArgs)process_tracing_inittrace_set_thread_info)add_prometheus_middleware(add_prometheus_track_response_middlewaredelete_directoryget_bool_env_varkill_process_treeset_uvicorn_logging_configs)	AuthLevelapp_has_admin_force_endpoints
auth_level)get_exception_traceback)__version__SGLANG_HEALTH_CHECK_TIMEOUT   !SGLANG_WAIT_WEIGHTS_READY_TIMEOUTx   c                   @   s@   e Zd ZU eeeef ed< eed< e	ed< dZ
ee	 ed< dS )_GlobalStatetokenizer_managertemplate_managerscheduler_infoN$remote_instance_transfer_engine_info)__name__
__module____qualname__r   rh   r`   ra   __annotations__rf   r   r   r
   r   r   r   r   r      s
   
 	r   _global_stateglobal_statec                 C   s   | a d S r   r   )r   r   r   r   set_global_state   s   r   returnc                   C   s   t S r   r   r   r   r   r   get_global_state   s   r   c                     s   t  } td|  \}}}|jdu sJ ddtjddj |_tdt	
  d|j  t||}t }|j||j|j|jd	 |d
 |_tt|||d |S )z
    Initialization function for multi-process tokenizer mode.
    It read args information from shm and inits tokenizer manager for current process.
    multi_tokenizer_args_Nz0API key is not supported in multi-tokenizer modezipc://F)deletez%Start multi-tokenizer worker process z, ipc_name=)r   
model_pathchat_templatecompletion_templatemax_req_input_len)r   r   r   )rb   rd   api_keytempfileNamedTemporaryFilenametokenizer_ipc_nameloggerinfoosgetpidra   rf   initialize_templatesr   r   r   r   r   r   )main_pid	port_argsserver_argsr   r   r   r   r   r   init_multi_tokenizer   s@   


r   fast_api_appc           
      C  sD  t | ddr| j}| j}d}nt I d H }t|d}dtjj }|jr,t	t
 t  |jrLt|jd |jdkr?d| }n	|jd	krHd
| }t| ttjtj| j_ttjtj| j_ttjtj| j_ttjtj| j_ttj| j_ttjtj| j_ttj| j_ t!tj| j_"t#tj| j_$t%| jj| j_&d }|j'dkrddl(m)} | }n|j'rddl(m*} | }|+|j'I d H  zddl,m-} |tjtjdd|d| j_.W n t/y   t0 }t12d|  Y nw |j3d urt4|j|j35dtjI d H  t16d t7j8t9|d}	|	:  z
d V  W |	;  d S |	;  w )Nis_single_tokenizer_modeF	Tokenizerr   zMultiTokenizer-sglangprefillPrefilldecodeDecodedemor   )DemoToolServer)MCPToolServer)OpenAIServingResponsesT)enable_prompt_tokens_detailsenable_force_include_usagetool_serverz2Can not initialize OpenAIServingResponses, error: ,zWarmup ended)targetr   )<getattrr   warmup_thread_kwargsr   dictr   r   	worker_idenable_metricsrp   appri   enable_tracern   otlp_traces_endpointdisaggregation_modero   r7   r   stateopenai_serving_completionr5   openai_serving_chatr8   openai_serving_embeddingr6   openai_serving_classifyr:   openai_serving_scorer9   openai_serving_rerankr<   openai_serving_tokenizer;   openai_serving_detokenizer(   ollama_servingr    anthropic_servingr   )sglang.srt.entrypoints.openai.tool_serverr   r   add_tool_server/sglang.srt.entrypoints.openai.serving_responsesr   openai_serving_responses	Exceptionry   r   warningwarmupsr=   splitr   	threadingThread_wait_and_warmupstartjoin)
r   r   r   thread_labelr   r   r   r   	tracebackwarmup_threadr   r   r   lifespan   s   







r   DISABLE_OPENAPI_DOCz/openapi.json)r   openapi_url*T)allow_originsallow_credentialsallow_methodsallow_headers)routerrequestexcc                    sh   | j jdr|jt|jjd|jd}td|i|jdS td|jt	|j|jd}t|
 |jdS )zEnrich HTTP exception with status code and other details.

    For /v1/responses, emit OpenAI-style nested error envelope:
    {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
    /v1/responsesNmessagetypeparamcodeerrorcontentstatus_code)objectr   r   r   )urlpath
startswithdetailr   r   phraser   r.   str
model_dump)r   r   nested_errorr   r   r   r   validation_exception_handler|  s"   

r   c                    s   t |}t | }|r||kr| d| }n|}| jjdr5|tjjdtjjd}t	dd|idS t
|tjjtjjd}t	d| dS )	zOverride FastAPI's default 422 validation error with 400.

    For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
     r   Nr     r   r   r   )r   r   r   )r   errorsr   r   r   r   BAD_REQUESTr   valuer   r.   r   )r   r   exc_str
errors_strr   r   errr   r   r   r     s,   raw_requestc                    sJ   | j dd }|jdddd }|dkr#tddgd	d
dgddS )z;Validate that the request content-type is application/json.zcontent-type ;   )maxsplitr   zapplication/jsonheaderz:Unsupported Media Type: Only 'application/json' is allowedvalue_error)locmsgr   )r   N)headersgetlowerr   r   )r   content_type
media_typer   r   r   validate_json_request  s   r  /healthz/health_generatec                    s  t jjrtd tddS t jjtjkrtddS t	j
 s+jjdkr+tddS ddd}d	t  }t jjrCt j|| n&t jjr`t|d
g|dd t jjjtjjkr_t _d
 _n	t|d
g|dd  fdd}t| }t }t |t k rtdI dH  t jj |kr|!  t jj"#|d tj$t j_tddS t |t k s|!  t%dt&|}t%dt&t jj }t'dt d| d|  t jj"#|d tj(t j_tddS )a9  
    Check the health of the inference server by sending a special request to generate one token.

    If the server is running something, this request will be ignored, so it creates zero overhead.
    If the server is not running anything, this request will be run, so we know whether the server is healthy.
    z=Health check request received during shutdown. Returning 503.i  r   r     r          )max_new_tokenstemperatureHEALTH_CHECK_r   F)rid	input_idssampling_paramslog_metricsc                     s(   t j 2 z3 d H W }  d S 6 d S r   )r   r   generate_request)_grir   r   r   gen   s   zhealth_generate.<locals>.genNz%H:%M:%SzNHealth check failed. Server couldn't get a response from detokenizer for last z seconds. tic start time: z. last_heartbeat time: ))r   r   gracefully_exitr   r   r   server_statusrg   Startingr>   (SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATIONr
  r   r   timeis_image_gen"get_image_gen_health_check_requestis_generationrI   r   r   r   NULLr   r   bootstrap_hostbootstrap_roomrH   asynciocreate_taskHEALTH_CHECK_TIMEOUTsleeplast_receive_tstampcancelrid_to_statepopUpstrftime	localtimer   	UnHealthy)r   r  r  r  tasktictic_timelast_receive_timer   r  r   health_generate  sx   










r:  z/get_model_infoc                         t d t I dH S )zAGet the model information (deprecated - use /model_info instead).zsEndpoint '/get_model_info' is deprecated and will be removed in a future version. Please use '/model_info' instead.N)r   r   
model_infor   r   r   r   get_model_info  
   r=  /model_infoc                     s`   t jj} t jjt jjjt jjt jjjt jjj| j	| j
t| jddt| jddt jjjd
}|S )zGet the model information.
model_typeNarchitectures)
r   tokenizer_pathr&  preferred_sampling_paramsweight_versionhas_image_understandinghas_audio_understandingr@  rA  rD  )r   r   model_configr   r   rB  r&  rC  rD  is_image_understandable_modelis_audio_understandable_modelr   	hf_config)rG  resultr   r   r   r<  )  s   r<  z/get_weight_versionz/weight_versionc                      s   t ddd)zGet the current weight version.  zdEndpoint '/get_weight_version' or '/weight_version' is deprecated. Please use '/model_info' instead.)r   r   )r   r   r   r   r   rD  =  s
   rD  z/get_server_infoc                      r;  )zCGet the server information (deprecated - use /server_info instead).zuEndpoint '/get_server_info' is deprecated and will be removed in a future version. Please use '/server_info' instead.N)r   r   server_infor   r   r   r   get_server_infoG  r>  rN  z/server_infoc                     sJ   t j I dH } tt jjdrt jj`i tt jjt j| t	dS )zGet the server information.NrG  )internal_statesversion)
r   r   get_internal_statehasattrr   rG  dataclassesasdictr   rz   )rO  r   r   r   rM  Q  s   rM  z	/get_loadc                      s   t d tj I dH S )z6Get load metrics (deprecated - use /v1/loads instead).zkEndpoint '/get_load' is deprecated and will be removed in a future version. Please use '/v1/loads' instead.N)r   r   r   r   get_loadr   r   r   r   rU  e  s
   rU  z/set_internal_statePOSTPUT)methodsobjc                    s   t j| I d H }|S r   )r   r   set_internal_state)rY  r   resr   r   r   rZ  q  s   rZ  SGLANG_DUMPER_SERVER_PORTreusez/dumper/{method}methodc                    s   |  I d H }|r| I d H ni }t| |d}tj|I d H }tdd |D r:dd |D }tdd|idS d	d |D S )
N)r^  bodyc                 s   s    | ]}|j  V  qd S r   )success.0rr   r   r   	<genexpr>  s    z*_dumper_control_handler.<locals>.<genexpr>c                 S   s   g | ]}|j s|jqS r   )r`  r   ra  r   r   r   
<listcomp>  s    z+_dumper_control_handler.<locals>.<listcomp>r   r   r   c                 S   s   g | ]
}|j D ]}|qqS r   )response)rb  rK  xr   r   r   re    s    )r_  jsonrG   r   r   dumper_controlanyr   )r^  r   
body_bytesr_  rY  resultsr   r   r   r   _dumper_control_handler{  s   rm  	/generatec              
      s    j rdtt f fdd}t| dtj dS ztj  I dH }|W S  t	yI } zt
d|  t|W  Y d}~S d}~ww )zHandle a generate request.r   c               
     s   zt j 2 z3 d H W } dtj| tjd d V  q	6 W n/ tyN } z#ddt|ii} t	d|  dtj| tjd d V  W Y d }~nd }~ww dV  d S )Ns   data: )options   

r   r   [http_server] Error: s   data: [DONE]

)
r   r   r  orjsondumpsOPT_NON_STR_KEYS
ValueErrorr   r   r   )outerY  r   r   r   stream_results  s,   
z(generate_request.<locals>.stream_resultstext/event-stream)r  
backgroundNrp  )streamr   bytesr   r   r   create_abort_taskr  	__anext__rt  r   r   _create_error_response)rY  r   rx  retrv  r   rw  r   r    s(   
r  /encodec              
      N   zt j| | I dH }|W S  ty& } z
t|W  Y d}~S d}~ww )zHandle an embedding request.Nr   r   r  r~  rt  r  rY  r   r  rv  r   r   r   encode_request     r  z	/classifyc              
      r  )zdHandle a reward model request. Now the arguments and return values are the same as embedding models.Nr  r  r   r   r   classify_request  r  r  z/flush_cacheGETc                     .   t j I dH } td| jrddS tjdS )zFlush the radix cache.NzCache flushed.
Please check backend logs for more details. (When there are running or waiting requests, the operation will not be performed.)
r  r   )r   r   flush_cacher   r`  r   r   r  r   r   r   r    s   r  z/clear_hicache_storage_backendc                     r  )z4Deprecated: use POST /hicache/storage-backend/clear.NzjDeprecated endpoint. Use POST /hicache/storage-backend/clear.
Hierarchical cache storage backend cleared.
r  r   r   r   clear_hicache_storager   r`  r   r   r  r   r   r   (clear_hicache_storage_backend_deprecated  s   r  z/hicache/storage-backend/clearc                     r  )z-Clear the hierarchical cache storage backend.Nz,Hierarchical cache storage backend cleared.
r  r   r  r  r   r   r   clear_hicache_storage_backend  s   r  z/hicache/storage-backendc                    sv   t jjjs	t S t jj| j| j| j| j	dI dH }t
|dd}t|jr&dnd|r-|d nd |jr6dd	S tjd	S )
zwAttach (enable) HiCache storage backend at runtime.

    Only allowed when there are NO running / queued requests.
    )hicache_storage_backend)hicache_storage_backend_extra_config_jsonhicache_storage_prefetch_policyhicache_write_policyNr   r  z"HiCache storage backend attached.
z*Failed to attach HiCache storage backend.

r  r   )r   r   r   admin_api_key_admin_api_key_missing_responseattach_hicache_storager  r  r  r  r   r   r`  r   r   )rY  r  r  r   r   r   attach_hicache_storage_backend  s(   
	r  DELETEc                     sd   t jjjs	t S t j I dH } t| dd}t| jrdnd|r$|d nd | jr-ddS t	j
dS )	zxDetach (disable) HiCache storage backend at runtime.

    Only allowed when there are NO running / queued requests.
    Nr   r  z"HiCache storage backend detached.
z*Failed to detach HiCache storage backend.
r  r  r   )r   r   r   r  r  detach_hicache_storager   r   r`  r   r   )r  r  r   r   r   detach_hicache_storage_backend  s   
	r  c                      s8   t jjjs	t S t jjjt jjjt jjjt jjjdS )zAGet current HiCache storage backend status (tokenizer-side view).)r  $hicache_storage_backend_extra_configr  r  )	r   r   r   r  r  r  r  r  r  r   r   r   r   hicache_storage_backend_status1  s   
r  z/start_profilec                    sV   | du rt  } tjj| j| j| j| j| j| j	| j
| j| j| jd
I dH  tdddS )zStart profiling.N)

output_dir
start_step	num_steps
activities
with_stackrecord_shapesprofile_by_stagemerge_profilesprofile_prefixprofile_stageszStart profiling.
r  r   )rR   r   r   start_profiler  r  r  r  r  r  r  r  r  r  r   )rY  r   r   r   start_profile_async@  s&   r  z/stop_profilec                         t j I dH  tdddS )zStop profiling.Nz*Stop profiling. This will take some time.
r  r   )r   r   stop_profiler   r   r   r   r   stop_profile_asyncY     r  z
/freeze_gcc                      r  )z0
    See engine.freeze_gc for more details.
    NzGarbage collection frozen.
r  r   )r   r   	freeze_gcr   r   r   r   r   freeze_gc_asyncd  s   r  z!/start_expert_distribution_recordc                      r  )zJStart recording the expert distribution. Clear the previous record if any.Nz)Start recording the expert distribution.
r  r   )r   r    start_expert_distribution_recordr   r   r   r   r   &start_expert_distribution_record_asyncq  r  r  z /stop_expert_distribution_recordc                      r  )z'Stop recording the expert distribution.Nz(Stop recording the expert distribution.
r  r   )r   r   stop_expert_distribution_recordr   r   r   r   r   %stop_expert_distribution_record_async|  r  r  z /dump_expert_distribution_recordc                      r  )z Dump expert distribution record.Nz!Dump expert distribution record.
r  r   )r   r   dump_expert_distribution_recordr   r   r   r   r   %dump_expert_distribution_record_async  r  r  z/update_weights_from_diskc                    sH   t j| |I dH \}}}|||d}|rt|tjdS t|tjdS )zEUpdate the weights from disk inplace without re-launching the server.N)r`  r   num_paused_requestsr  )r   r   update_weights_from_diskr   r   OKr   )rY  r   r`  r   r  r   r   r   r   r    s    r  z,/init_weights_send_group_for_remote_instancec                    B   t j| |I d H \}}||d}|rt|ddS t|tjdS Nr`  r   r  r  )r   r   +init_weights_send_group_for_remote_instancer   r   r   rY  r   r`  r   r   r   r   r   r       
r  z /send_weights_to_remote_instancec                    r  r  )r   r   send_weights_to_remote_instancer   r   r   r  r   r   r   r    r  r  z)/get_remote_instance_transfer_engine_inforankc              
      s   | d u s	| dk rt tjdS tjd u sttjdkr!t tjdS z| tj|  d}|W S  tyL } ztd|  t tjdW  Y d }~S d }~ww )Nr   r  )r  r   zException: )	r   r   r   r   r   lenr   r   r   )r  rK  rv  r   r   r   (get_remote_instance_transfer_engine_info  s$   
r  z/init_weights_update_groupc                    B   t j| |I dH \}}||d}|rt|ddS t|tjdS )z&Initialize the parameter update group.Nr  r  r  )r   r   init_weights_update_groupr   r   r   r  r   r   r   r    s   
r  z/destroy_weights_update_groupc                    >   t j| |I dH \}}||d}t||rddS tjdS )z#Destroy the parameter update group.Nr  r  r  )r   r   destroy_weights_update_groupr   r   r   r  r   r   r   r    s   
r  z/update_weights_from_tensorc                    r  )a  Update the weights from tensor inplace without re-launching the server.
    Notes:
    1. Ensure that the model is on the correct device (e.g., GPU) before calling this endpoint. If the model is moved to the CPU unexpectedly, it may cause performance issues or runtime errors.
    2. HTTP will transmit only the metadata of the tensor, while the tensor itself will be directly copied to the model.
    3. Any binary data in the named tensors should be base64 encoded.
    Nr  r  r  )r   r   update_weights_from_tensorr   r   r   r  r   r   r   r    s   
r  z /update_weights_from_distributedc                    r  )z/Update model parameter from distributed online.Nr  r  r  )r   r   update_weights_from_distributedr   r   r   r  r   r   r   r    s   
r  z/update_weights_from_ipcc                    sR   t j| |I dH \}}||d}|r"t jjdu rdt j_t|S t|tjdS )z\Update the weights from IPC (Inter-Process Communication) for checkpoint-engine integration.Nr  FTr  )r   r   update_weights_from_ipcinitial_weights_loadedr   r   r   r  r   r   r   r  .  s   
r  z/update_weight_versionc              
      s   | j rtjjdd z| jtjj_tdd| j | jdtj	dW S  t
yC } ztddt| dtjdW  Y d	}~S d	}~ww )
zFUpdate the weight version. This operation requires no active requests.T)	abort_allzWeight version updated to )r`  r   new_versionr  Fz!Failed to update weight version: r  N)abort_all_requestsr   r   abort_requestr  r   rD  r   r   r  r   r   r   rY  r   rv  r   r   r   update_weight_version?  s*   
r  z/get_weights_by_namec              
      sd   zt j| |I dH }|du rtdW S t|ddW S  ty1 } z
t|W  Y d}~S d}~ww )zGet model parameter by name.NzGet parameter by name failedr  r  )r   r   get_weights_by_namer  r   r   r  r   r   r   r  ^  s   
r  z/release_memory_occupationc              
      J   zt j| |I dH  W dS  ty$ } z
t|W  Y d}~S d}~ww )z*Release GPU memory occupation temporarily.N)r   r   release_memory_occupationr   r  r  r   r   r   r  l     r  z/resume_memory_occupationc              
      r  )zResume GPU memory occupation.N)r   r   resume_memory_occupationr   r  r  r   r   r   r  x  r  r  z/weights_checkerc                    s:   t j| |I d H \}}t||d|rddS tjdS r  )r   r   check_weightsr   r   r   )rY  r   r`  r   r   r   r   r    s   r  z
/slow_downc              
      r  )aB  Slow down the system deliberately. Only for testing. Example scenario:
    when we want to test performance of D in large-scale PD disaggregation and have no enough nodes for P,
    we can use this to slow down D to let it have enough running sequences, and then disable slowdown
    to let it run in full batch size.
    N)r   r   	slow_downr   r  r  r   r   r   r    s   r  z/load_lora_adapterc                    8   t j| |I dH }|jrt|tjdS t|tjdS z8Load a new LoRA adapter without re-launching the server.Nr  )r   r   load_lora_adapterr`  r   r   r  r   rY  r   rK  r   r   r   r       r  z/load_lora_adapter_from_tensorsc                    r  )zELoad a new LoRA adapter from tensors without re-launching the server.Nr  )r   r   load_lora_adapter_from_tensorsr`  r   r   r  r   r  r   r   r   r    s   
r  z/unload_lora_adapterc                    r  r  )r   r   unload_lora_adapterr`  r   r   r  r   r  r   r   r   r    r  r  z/open_sessionc              
      sZ   zt j| |I dH }|du rtd|W S  ty, } z
t|W  Y d}~S d}~ww )z1Open a session, and return its unique session id.NzNFailed to open the session. Check if a session with the same id is still open.)r   r   open_sessionr   r  )rY  r   
session_idrv  r   r   r   r    s   r  z/close_sessionc              
      sP   zt j| |I dH  tddW S  ty' } z
t|W  Y d}~S d}~ww )zClose the session.Nr  r  )r   r   close_sessionr   r   r  r  r   r   r   r    s   r  z/configure_loggingc                    s   t j|  tddS )z&Configure the request logging options.r  r  )r   r   configure_loggingr   rw  r   r   r   r    s   
r  z/abort_requestc              
      sP   zt jj| j| jd tddW S  ty' } z
t|W  Y d}~S d}~ww )zAbort a request.)r  r  r  r  N)r   r   r  r  r  r   r   r  r  r   r   r   r    s   r  z/parse_function_callc                    sB   t | j| jd}|| j\}}|dd |D d}t|ddS )zD
    A native API endpoint to parse function calls from a text.
    )toolstool_call_parserc                 S   s   g | ]}|  qS r   )r   )rb  callr   r   r   re  
  s    z/parse_function_call_request.<locals>.<listcomp>)normal_textcallsr  r   )r?   r  r  parse_non_streamtextr   )rY  r   parserr  r  response_datar   r   r   parse_function_call_request  s   r  z/separate_reasoningc                    s6   t | j|d}|| j\}}||d}t|ddS )zB
    A native API endpoint to separate reasoning from a text.
    )r@  r   )reasoning_textr  r  r   )rk   reasoning_parserr  r  r   )rY  r   r  r  r  r  r   r   r   separate_reasoning_request  s   r  z/pause_generationc                    &   t j| I dH  tdddddS )zPause generation.NzGeneration paused successfully.okr   statusr  r   )r   r   pause_generationr   rw  r   r   r   r  &     r  z/continue_generationc                    r  )zContinue generation.Nz"Generation continued successfully.r  r  r  r   )r   r   continue_generationr   rw  r   r   r   r  1  r  r  z/v1/completions)dependenciesc                       |j jj| |I dH S )z+OpenAI-compatible text completion endpoint.N)r   r   r   handle_requestr   r   r   r   r   openai_v1_completions?     

r  /v1/chat/completionsc                    r  z+OpenAI-compatible chat completion endpoint.Nr   r   r   r  r  r   r   r   openai_v1_chat_completionsG     

r   z/v1/embeddings)response_classr  c                    r  )z&OpenAI-compatible embeddings endpoint.N)r   r   r   r  r  r   r   r   openai_v1_embeddingsQ     

r  z/v1/classifyc                    r  )z*OpenAI-compatible classification endpoint.N)r   r   r   r  r  r   r   r   openai_v1_classify]  r  r  z/v1/tokenizez	/tokenizeF)r  r  include_in_schemac                    r  )z(OpenAI-compatible tokenization endpoint.N)r   r   r   r  r  r   r   r   openai_v1_tokenizei     

r  z/v1/detokenizez/detokenizec                    r  )z*OpenAI-compatible detokenization endpoint.N)r   r   r   r  r  r   r   r   openai_v1_detokenize{  r  r	  z
/v1/models)r  c               	      s   t jjg} g }| D ]}|t||t jjjd q
t jjjr=t jj	}|
  D ]\}}|t|j|j| d dd q)t|dS )z2Show available models. OpenAI-compatible endpoint.idrootmax_model_lenr   N)r  r  parentr  )data)r   r   served_model_nameappendr/   rG  context_lenr   enable_loralora_registryget_all_adaptersitems	lora_name	lora_pathr0   )served_model_namesmodel_cardsr  r  r  lora_refr   r   r   available_models  s.   

	
	r  z/v1/models/{model:path}modelc                    sJ   t jjg}| |vrtddd|  dddddid	S t| | t jjjd
S )zHRetrieves a model instance, providing basic information about the model.rL  r   zThe model 'z' does not existinvalid_request_errorr  model_not_foundr   r   r
  )r   r   r  r   r/   rG  r  )r  r  r   r   r   retrieve_model  s$   

r   z	/v1/scorec                    r  )zYEndpoint for the decoder-only scoring API. See Engine.score() for detailed documentation.N)r   r   r   r  r  r   r   r   v1_score_request  r  r!  r   c                    sJ   t di | }|jjj||I dH }t|tr#t|dddddS |S )z6Endpoint for the responses API with reasoning support.Nry  zno-cachez
keep-alive)zCache-Control
Connection)r  r	  r   )r1   r   r   r   create_responses
isinstancer   r   )r   r   request_objrK  r   r   r   v1_responses_request  s   


r&  z/v1/responses/{response_id}response_idc                       |j jj| I dH S )zRetrieve a response by ID.N)r   r   r   retrieve_responsesr'  r   r   r   r   v1_retrieve_responses     

r+  z"/v1/responses/{response_id}/cancelc                    r(  )zCancel a background response.N)r   r   r   cancel_responsesr*  r   r   r   v1_cancel_responses  r,  r.  z
/v1/rerank)rX  r  c                    r  )z:Endpoint for reranking documents based on query relevance.N)r   r   r   r  r  r   r   r   v1_rerank_request  r  r/  SGLANG_OLLAMA_ROOT_ROUTE/c                      s   dS )z1Ollama-compatible root endpoint for health check.zOllama is runningr   r   r   r   r   ollama_root  s   r2  SGLANG_OLLAMA_CHAT_ROUTEz	/api/chatc                    r  )z Ollama-compatible chat endpoint.N)r   r   r   handle_chatr  r   r   r   ollama_chat  s   r5  SGLANG_OLLAMA_GENERATE_ROUTEz/api/generatec                    r  )z$Ollama-compatible generate endpoint.N)r   r   r   handle_generater  r   r   r   ollama_generate  r  r8  SGLANG_OLLAMA_TAGS_ROUTEz	/api/tagsc                    s   | j jj S )z'Ollama-compatible list models endpoint.)r   r   r   get_tags)r   r   r   r   ollama_tags  s   r;  SGLANG_OLLAMA_SHOW_ROUTEz	/api/showc                    s   |j jj| jS )z+Ollama-compatible show model info endpoint.)r   r   r   get_showr  r  r   r   r   ollama_show  s   r>  z/v1/messagesc                    r  )z+Anthropic-compatible Messages API endpoint.N)r   r   r   handle_messagesr  r   r   r   anthropic_v1_messages#  r  r@  z/v1/messages/count_tokensc                    r  )z-Anthropic-compatible token counting endpoint.N)r   r   r   handle_count_tokensr  r   r   r   anthropic_v1_count_tokens-  r  rB  z/pingc                      s   t ddS )z$Check the health of the http server.r  r  )r   r   r   r   r   sagemaker_health8  s   
rC  z/invocationsc                    r  r  r  r  r   r   r   sagemaker_chat_completions>  r  rD  AIP_PREDICT_ROUTEz/vertex_generate
vertex_reqc                    s   | j sg S i }dD ] | j d  r" fdd| j D | <  nq
dd | j D p,d }tdi |d|i| jp:i }t||I d H }t|trL|S td|iS )	N)r  r  input_embedsr   c                    s   g | ]}|  qS r   r
  rb  instance	input_keyr   r   re  P  s    
z#vertex_generate.<locals>.<listcomp>c                 S   s$   g | ]}| d dur| d qS )
image_dataNrH  rI  r   r   r   re  T  s
    rM  predictionsr   )	instancesr
  rI   
parametersr  r$  r   r   )rF  r   inputsrM  reqr  r   rK  r   vertex_generateI  s6   


rS  c                 C   s   t ddt| iitjdS )Nr   r   r  )r   r   r   r   )rv  r   r   r   r  d  s   r  r   c                 C   s   t ddi| dS )Nr   zThis endpoint requires admin API key, but this server was started without one (admin-api-key). Restart with --admin-api-key to enable.r   )r   r  r   r   r   r  q  s   r  ziVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAbUlEQVRYhe3VsQ2AMAxE0Y/lIgNQULD/OqyCMgCihCKSG4yRuKuiNH6JLsoEbMACOGBcua9HOR7Y6w6swBwMy0qLTpkeI77qdEBpBFAHBBDAGH8WrwJKI4AAegUCfAKgEgpQDvh3CR3oQCuav58qlAw73kKCSgAAAABJRU5ErkJggg==r   c              
      s^  i }   } jrd j |d< d}tdD ]6}td ztj|d d|d}|jd	ks9J d
|d|jd}W  n t	tj
jfyM   t }Y qw |s`td|  tt  |S | }t|dd}|d r{|rx jsxd}	nd}	nd}	|d rdnd}
dd|
di} jrdd t jD |d<  jdkr|d d |d< n<|rЈ jdkr|d rtjjddddt idd d!d"gd#g|
dd$d%}nd&g j |d <  jdkr|d  d |d <  jr|d d  t j |d< d|d d'< t j! }z jdkr0tj"||	 |||dkr|nd(d)}|jd	ks(J |j t#j$tj_%W |S t&d* d$ddd+t'g j  fd,dt jD g d-g j d.}tj"||	 |||dkrb|nd/d)}|jd	krt&d0|j d1|   t#j$tj_%W |S t&d2(|j t#j)tj_%W |S  t*y   t }td|  tt  Y dS w )3NzBearer AuthorizationFr~   r  r?     )timeoutr	  r  zres=z, res.text=Tz%Initialization failed. warmup error: rE  r&  r  rn  r     r  r   )r  r  c                 S   s   g | ]}g d qS ))
         r   )rb  r  r   r   r   re    s    z*_execute_server_warmup.<locals>.<listcomp>r  nulluser	image_urlr   zdata:image/png;base64,)r   r]  r  zDescribe the image.)r   r  )roler   r  )r  messages
max_tokensr{  r  zThe capital city of France isr  iX  )rh  r	  rV  z%Start of pd disaggregation warmup ...)r  r  
ignore_eosc                    s$   g | ]}|d  j   | j  qS )l            )dp_sizetp_size)rb  ir   r   r   re    s    )rX  rY  rZ     )r  r(  r)  r  i  z6End of prefill disaggregation mode warmup with status z, resp: z;Prefill disaggregation mode warm Up Failed, status code: {})+r   r   ranger#  r-  requestsr
  r   r  AssertionError
exceptionsRequestExceptionry   r   r   rt   r   r   rh  boolskip_tokenizer_initrb  r   r   r   r  MINIMUM_PNG_PICTURE_BASE64debug_tensor_dump_input_filer1  nploadtolistr>   SGLANG_WARMUP_TIMEOUTpostrg   r2  r   r   r   formatr5  r   )r   r	  r   r`  r  r[  last_tracebackr<  is_vlmrequest_namer  	json_datawarmup_timeoutr   r   r   _execute_server_warmup  s   
 





0



rz  launch_callbackexecute_warmup_funcc                 C   sl   | j rt  | js|| sd S ntjtj_t	d | j
r"t| j | jr+tt  |d ur4|  d S d S )Nz)The server is fired up and ready to roll!)+checkpoint_engine_wait_weights_before_ready_wait_weights_readyskip_server_warmuprg   r2  r   r   r   r   r   delete_ckpt_after_loadingrr   r   rn  rt   r   r   r   r{  r|  r   r   r   r     s   



r   c                  C   sl   t } t }t| D ]}tjjr!tdt | dd  dS td q
t	d|  dtjj  dS )z:Wait for weights to be ready within the specified timeout.zWeights are ready after z.2fz secondsNr  z$Weights are not ready after waiting z} seconds. Consider increasing SGLANG_WAIT_WEIGHTS_READY_TIMEOUT environment variable. Current status: initial_weights_loaded=)
WAIT_WEIGHTS_READY_TIMEOUTr#  rf  r   r   r  r   r   r-  r   )rV  
start_timer  r   r   r   r~  8  s   r~  init_tokenizer_manager_funcrun_scheduler_process_funcrun_detokenizer_process_funcc              
   C   s  t | |||d\}}}}	t|}
tt|||d |
d | jr#tt | jdkrPdt_| t_	t
| ||dt_| js@| js@ttrOddlm} |t| j| jd nd	t_t|	| |d }zZt|  | jdkrytjt| j| j| j| jps| jd
dd n)ddlm} dgdd	d|d d< t  tjd| j| j| j| jp| jd
d| jd W | jdkr|  tjj !  dS dS | jdkr|  tjj !  w w )a3  
    Launch SRT (SGLang Runtime) Server.

    The SRT server consists of an HTTP server and an SRT engine.

    - HTTP server: A FastAPI server that routes requests to the engine.
    - The engine consists of three components:
        1. TokenizerManager: Tokenizes the requests and sends them to the scheduler.
        2. Scheduler (subprocess): Receives requests from the Tokenizer Manager, schedules batches, forwards them, and sends the output tokens to the Detokenizer Manager.
        3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.

    Note:
    1. The HTTP server, Engine, and TokenizerManager all run in the main process.
    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
    )r   r  r  r  r   )r   r   r   r   r  Tr  )add_api_key_middleware)r   r  FrU  uvloop)hostport	root_path	log_leveltimeout_keep_aliveloop)LOGGING_CONFIGdefaultINFO)handlerslevel	propagateloggersz"sglang.srt.entrypoints.http_serverz&sglang.srt.entrypoints.http_server:app)r  r  r  r  r  r  workersN)"r!   rj   r   r   r   rq   r   tokenizer_worker_numr   r   r   r   r   r  rw   sglang.srt.utils.authr  re   ru   uvicornrunr  r  fastapi_root_pathlog_level_httpr  uvicorn.configr  rc   unlinkr   r   socket_mappingclear_all_sockets)r   r  r  r  r|  r{  r   r   scheduler_infosr   r   r  multi_tokenizer_args_shmr  r   r   r   launch_serverM  s   
	





r  r   (  __doc__r*  rS  loggingr   r   r   r#  
contextlibr   httpr   typingr   r   r   r   r   r	   r
   r   setattrnumpyro  rq  rg  r  r  fastapir   r   r   r   fastapi.exceptionsr   fastapi.middleware.corsr   fastapi.responsesr   r   r   sglang.srt.disaggregation.utilsr   r   )sglang.srt.entrypoints.anthropic.protocolr   r   (sglang.srt.entrypoints.anthropic.servingr    sglang.srt.entrypoints.enginer!   r"   r#   r$   &sglang.srt.entrypoints.ollama.protocolr%   r&   r'   %sglang.srt.entrypoints.ollama.servingr(   &sglang.srt.entrypoints.openai.protocolr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   *sglang.srt.entrypoints.openai.serving_chatr5   .sglang.srt.entrypoints.openai.serving_classifyr6   1sglang.srt.entrypoints.openai.serving_completionsr7   /sglang.srt.entrypoints.openai.serving_embeddingr8   ,sglang.srt.entrypoints.openai.serving_rerankr9   +sglang.srt.entrypoints.openai.serving_scorer:   .sglang.srt.entrypoints.openai.serving_tokenizer;   r<   sglang.srt.entrypoints.warmupr=   sglang.srt.environr>   -sglang.srt.function_call.function_call_parserr?   sglang.srt.managers.io_structr@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   )sglang.srt.managers.multi_tokenizer_mixinr`   ra   rb   rc   rd   re   $sglang.srt.managers.template_managerrf   %sglang.srt.managers.tokenizer_managerrg   rh   sglang.srt.metrics.func_timerri   ;sglang.srt.model_loader.remote_instance_weight_loader_utilsrj   "sglang.srt.parser.reasoning_parserrk   sglang.srt.server_argsrl   rm   sglang.srt.tracing.tracern   ro   sglang.srt.utilsrp   rq   rr   rs   rt   ru   r  rv   rw   rx   sglang.utilsry   sglang.versionrz   	getLoggerr   r   set_event_loop_policyEventLoopPolicyintgetenvr,  r  	dataclassr   r   r   r   r   r   r   r   add_middlewaresglang.srt.entrypoints.v1_loadsr   v1_loads_routerinclude_routerexception_handlerr   r  r
  r:  r=  r<  rD  rN  rM  rU  	api_routeADMIN_OPTIONALrZ  environr   rm  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rs  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r	  r  r   r!  r   r&  r+  r.  r/  headr2  r5  r8  r;  r>  r@  rB  rC  rD  rS  r  r   r  rm  rz  r   r~  r  r   r   r   r   <module>   s  (8 " 
3o	
#M
	

	

$
				

			


	



 