o
    پix                    @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlZddlZddl Z ddl!Z dd	lm"Z" dd
l#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZK ddlLmMZMmNZN ddlOmPZPmQZQ ddlRmSZS ddlTmUZUmVZV ddlWmXZX ddlYmZZZ ddl[m\Z\ ddl]m^Z^ ddl_m`Z` ddlambZb ddlcmdZd ddlemfZfmgZgmhZh ddlimjZj dd lkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZs dd!ltmuZumvZvmwZwmxZxmyZymzZz dd"l{m|Z| dd#l}m~Z~mZmZ dd$lmZ dd%lmZ dd&lmZmZ ee  e*j ZeeZejG d'd( d(ZG d)d* d*eZG d+d, d,e\e^ZG d-d. d.eZd/d0 Zd1d2 Zd3egd4eMfd5d6ZG d7d8 d8ZdS )96TokenizerManager is a process that tokenizes the text.    N)deque)nullcontext)datetime)Enum)
HTTPStatus)Any	AwaitableDictListOptionalTupleUnion)BackgroundTasks)ModelConfig)MMReceiverHTTP)DisaggregationMode)envs)LoRARefLoRARegistry)AsyncDynamicbatchTokenizer)AsyncMMDataProcessor)start_disagg_service)AbortReqActiveRanksOutputBatchEmbeddingOutputBatchMultimodalOutputBatchStrOutputBatchTokenIDOutputBatchTokenizedEmbeddingReqInputBatchTokenizedGenerateReqInputConfigureLoggingReqContinueGenerationReqInputEmbeddingReqInputFreezeGCReqGenerateReqInputHealthCheckOutputLoadLoRAAdapterReqInputOpenSessionReqOutputPauseGenerationReqInputSessionParamsTokenizedEmbeddingReqInputTokenizedGenerateReqInputUpdateWeightFromDiskReqInputUpdateWeightFromDiskReqOutputWatchLoadUpdateReq)TensorTransportModewrap_shm_features)get_mm_processorimport_processors)RequestMetricsExporterManager)MultimodalDataItemRequestStage)is_health_check_generate_req)input_blocker_guard_region)TokenizerCommunicatorMixin)TokenizerManagerMultiItemMixin)TokenizerMetricsCollector)start_cpu_monitor_thread)SamplingParams)PortArgs
ServerArgs$set_global_server_args_for_tokenizer)SpeculativeAlgorithm)SpanAttributesextract_trace_headers trace_get_proc_propagate_contexttrace_req_finishtrace_req_start"trace_set_remote_propagate_contexttrace_slice_endtrace_slice_start)configure_gc_warning	freeze_gcget_bool_env_varget_or_create_event_loopget_zmq_socketkill_process_tree)RWLock)get_processorget_tokenizerget_tokenizer_from_processor)RequestLogger)Watchdog)TypeBasedDispatcherget_exception_tracebackc                   @   s  e Zd ZU dZeeeef  ed< eed< e	j
ed< eeef ed< eed< dZeed< dZeed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< ejedZee ed< ejedZee ed< ejedZ ee ed< ejedZ!ee ed< ejedZ"ee ed< ejedZ#eee  ed< ejedZ$eee  ed< ejedZ%eee  ed< ejedZ&eee  ed< ejedZ'eed< ejedZ(eed < ejedZ)eed!< ejedZ*eed"< ejedZ+ee ed#< ejedZ,ee ed$< ejedZ-ee ed%< ejedZ.ee ed&< ejedZ/ee ed'< ejedZ0ee ed(< d)S )*ReqStatezStore the state a request.out_listfinishedeventobjcreated_time        finished_timefirst_token_time	last_time   last_completion_tokensfinished_time_perffirst_token_time_perfrequest_sent_to_scheduler_tsresponse_sent_to_client_tsr   last_output_offset text)default_factory
output_idsinput_token_logprobs_valinput_token_logprobs_idxoutput_token_logprobs_valoutput_token_logprobs_idxinput_top_logprobs_valinput_top_logprobs_idxoutput_top_logprobs_valoutput_top_logprobs_idxinput_token_ids_logprobs_valinput_token_ids_logprobs_idxoutput_token_ids_logprobs_valoutput_token_ids_logprobs_idxinput_token_logprobsoutput_token_logprobsinput_top_logprobsoutput_top_logprobsinput_token_ids_logprobsoutput_token_ids_logprobsN)1__name__
__module____qualname____doc__r   r
   r   __annotations__boolasyncioEventr   r%   r#   floatr_   r`   ra   rc   intrd   re   rf   rg   rh   rj   strdataclassesfieldlistrl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~    r   r   Y/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/managers/tokenizer_manager.pyrX      sH   
 
rX   c                   @   s   e Zd ZdZdZdZdZdS )InputFormatz-Input format types for tokenization handling.rb         N)r   r   r   r   SINGLE_STRINGBATCH_STRINGSCROSS_ENCODER_PAIRSr   r   r   r   r      s
    r   c                   @   s  e Zd ZdZdedefddZdd Zdd	 Zdefd
dZ	dd Z
dd Zdd Zdd Zdd Zdd Zdd Z	ddeeef deej fddZdeeee f d ed!efd"d#Zdeeee f d$ed!eee eee  f fd%d&Zd'eee  d(eeee   d$ed)ed!eeee eee  f eeee  eeee   f f f
d*d+Z 	,ddeeee f d ed!eeee eee  f eeee  eeee   f f fd-d.Z!deeef fd/d0Z"deeef d'ee d!dfd1d2Z#deeef d!dfd3d4Z$ded!dfd5d6Z%d'eee eee  f d7ed!dfd8d9Z&			ddeeef d:ed'ee d;eeee' df  d<ee( d(eee  d!ee)e*f fd=d>Z+d?edeeef d!eee)e*f  fd@dAZ,d?edeeef d!dfdBdCZ-d?edeeef d!efdDdEZ.d!efdFdGZ/	ddeeef dHee)e*f dIee' fdJdKZ0	ddeeef dLeee)e*f  dIee' fdMdNZ1	ddeeef dOe2deej fdPdQZ3		ddeeef deej dIee' fdRdSZ4ddUedVefdWdXZ5de6fdYdZZ7de8fd[d\Z9	dde:deej d!eeef fd]d^Z;d_ed`efdadbZ<de:d!eeef fdcddZ=de>fdedfZ?dgdh Z@defdidjZAdkdl ZBdmdn ZCdoeeDeEeFeGf fdpdqZHdreIdOe2dsedtee duef
dvdwZJdreIdOe2dsedtee duedoeDdxefdydzZKd{ee' d|ee d}efd~dZLd{ee' d|ee d}efddZMdre(eeNf doeeDeEeFeGf ded!dfddZOdre(eeNf dOe2doeeDeEeFeGf ded!df
ddZPdoeNdedre(eeNf ded!df
ddZQded!efddZRdOe2doeDdefddZSdOe2deIfddZTdOe2deIfddZUdee dedefddZVeWXdeYZ fdefddZ[dd Z\dd Z]doe^fddZ_de`fddZadd Zbdd Zcdeeef d!dfddZddeeef fddZe		ddeeef dIee' deej fddZfdeeef fddZgdOe2doeeDeEeFeGf ded!e(eeNf fddZhdS )TokenizerManagerr   server_args	port_argsc                 C   s   || _ |j| _|j| _|j| _|j| _t| |   |   | | | 	  | 
  |   |   |   |   | jrCtd |   d S )N	tokenizer)r   enable_metricspreferred_sampling_paramscrash_dump_folderenable_tracer@   init_model_configinit_tokenizer_and_processorinit_ipc_channelsinit_running_status init_request_logging_and_dumpinginit_weight_update	init_lorainit_disaggregationinit_metric_collector_watchdogr<   init_request_dispatcher)selfr   r   r   r   r   __init__   s$   
zTokenizerManager.__init__c                 C   s   | j }t| dt}|j| _|j| _||| _| jj| _| jj| _| jj	| _	| jj
| _
d | _t|j}| rDt|j|j |j| _nd| _d| _d S )Nmodel_config_classr   T)r   getattrr   
model_pathserved_model_namefrom_server_argsmodel_configis_generationis_image_gencontext_lenimage_token_idmax_req_input_lenrA   from_stringspeculative_algorithmis_eaglemaxspeculative_eagle_topkspeculative_num_stepsspeculative_num_draft_tokensnum_reserved_tokensvalidate_total_tokens)r   r   r   r   r   r   r   r      s(   





z"TokenizerManager.init_model_configc                 C   s  | j }| jjrUtd tj  }rt|dd t|}t| j }t	| jj
|||| _t| j| j j| j jd| _|jrBd  | _| _n1|| _t| j| _dtjd< |   nd  | _| _|jrbd | _nt|j|j|j|jd| _|   |jr|jst| j|j|jd| _ d S d | _ d S )	Nz sglang.srt.multimodal.processorsT)	overwrite)max_concurrent_calls	timeout_sfalseTOKENIZERS_PARALLELISM)tokenizer_modetrust_remote_coderevision)max_batch_sizebatch_wait_timeout_s)!r   r   is_multimodalr3   r   $SGLANG_EXTERNAL_MM_PROCESSOR_PACKAGEget_get_processor_wrapper _determine_tensor_transport_moder2   	hf_configmm_processorr   mm_max_concurrent_callsmm_per_request_timeoutmm_data_processorskip_tokenizer_initr   	processorrS   osenviron%_initialize_multi_item_delimiter_textrR   tokenizer_pathr   r   r   enable_dynamic_batch_tokenizerr   "dynamic_batch_tokenizer_batch_size%dynamic_batch_tokenizer_batch_timeoutasync_dynamic_batch_tokenizer)r   r   mm_process_pkg
_processortransport_moder   r   r   r     sR   



z-TokenizerManager.init_tokenizer_and_processorc                 C   sr   t jd}t|t j|jd| _| jjdkr"t|t j	|j
d| _d S ddlm} t|t j	|jd}|||| _d S )Nr   Trb   r   )SenderWrapperF)zmqr   ContextrN   PULLtokenizer_ipc_namerecv_from_detokenizerr   tokenizer_worker_numPUSHscheduler_input_ipc_namesend_to_scheduler)sglang.srt.managers.multi_tokenizer_mixinr   tokenizer_worker_ipc_name)r   r   contextr   r   r   r   r   r   A  s   
z"TokenizerManager.init_ipc_channelsc                 C   sB   i | _ d | _t | _tj| _d| _d| _d| _	t
 | _i | _d S )NFr   )rid_to_state
event_loopsetasyncio_tasksServerStatusStartingserver_statusgracefully_exitlast_receive_tstampcurrent_loadr   Lockcurrent_load_locksession_futuresr   r   r   r   r   U  s   

z$TokenizerManager.init_running_statusc                 C   sj   t | jj| jj| jj| jjd| _d| _d| _g | _	t
 | _d| _g | _| jj\}}}t| j||| _d S )N)log_requestslog_requests_levellog_requests_formatlog_requests_targetri   i  F)rT   r   r   r   r   r   request_loggerdump_requests_folderdump_requests_thresholddump_request_listr   crash_dump_request_listcrash_dump_performedstraggler_request_listmetadatar4    request_metrics_exporter_manager)r   _obj_skip_namesout_skip_namesr   r   r   r   g  s    
z1TokenizerManager.init_request_logging_and_dumpingc                 C   s6   d| _ | jjr
d| _ t | _d | _d| _t | _	d S )NTF)
initial_weights_loadedr   +checkpoint_engine_wait_weights_before_readyrP   model_update_lockmodel_update_resultis_pauser   	Conditionis_pause_condr   r   r   r   r   ~  s   z#TokenizerManager.init_weight_updatec                 C   sL   t | jj| _t | _i | _| jjd ur"| jjD ]
}|| j|j< qd S d S N)	r   r   
lora_pathslora_registryr   r   lora_update_locklora_ref_cache	lora_name)r   lora_refr   r   r   r     s   
zTokenizerManager.init_lorac                 C   s>   t | jj| _t| j| _| jjrt| j| jjd| _	d S d S )N)dtype)
r   r   disaggregation_moder   bootstrap_serverlanguage_onlyr   r   r  mm_receiverr   r   r   r   r     s   z$TokenizerManager.init_disaggregationc                 C   s   | j r7d| jji}| jjr| jjD ]}d||< q| jjr#|| jj t| j|| jj| jj| jj	| jj
d| _| jjdkrCt| jj tjd| jjdtj d| _d S )N
model_nameri   )r   labelsbucket_time_to_first_tokenbucket_e2e_request_latencybucket_inter_token_latencycollect_tokens_histogramr^   r   T)
debug_namewatchdog_timeoutsofttest_stuck_time)r   r   r   'tokenizer_metrics_allowed_custom_labelsextra_metric_labelsupdater;   r  r  r   r!  metrics_collectorgc_warning_threshold_secsrJ   rU   createsoft_watchdog_timeoutr   SGLANG_TEST_STUCK_TOKENIZERr   soft_watchdog)r   r  labelr   r   r   r     s0   
	z/TokenizerManager.init_metric_collector_watchdogc              	   C   sp   t ttttf| jft| jft| j	ft
| jftdd ftdd ft| jfg| _| | j t| _t| _t| _d S )Nc                 S      d S r  r   xr   r   r   <lambda>      z:TokenizerManager.init_request_dispatcher.<locals>.<lambda>c                 S   r0  r  r   r1  r   r   r   r3    r4  )rV   r   r   r   r   _handle_batch_outputr   _handle_abort_reqr(   _handle_open_session_req_outputr.   +_handle_update_weights_from_disk_req_outputr$   r&   r   update_active_ranks_result_dispatcherinit_communicatorsr   r=   sampling_params_classSignalHandlersignal_handler_classrX   req_state_classr   r   r   r   r     s,   	


z(TokenizerManager.init_request_dispatcherNr\   requestc              	     s  |j r|j nt }   |   jr |||  jjr& |  jj	dkr1 
|  j| j|  j4 I d H   j fddI d H  W d   I d H  n1 I d H s_w   Y   jj4 I d H K  |I d H  |jr |I d H } |||} |||2 z	3 d H W }|V  q6 n |||2 z	3 d H W }|V  q6 W d   I d H  d S 1 I d H sw   Y  d S )Nrb   c                      s    j  S r  )r  r   r   r   r   r3    s    z3TokenizerManager.generate_request.<locals>.<lambda>)received_timetimeauto_create_handle_loopnormalize_batch_and_argumentsr   _trace_request_startr   r  )_handle_epd_disaggregation_encode_requestr   _attach_multi_http_worker_infor   log_received_requestr   r  wait_forr  reader_lock_validate_and_resolve_lora	is_single_tokenize_one_request_send_one_request_wait_one_response_handle_batch_request)r   r\   r@  r]   tokenized_objstateresponser   r   r   generate_request  s8   

(.z!TokenizerManager.generate_requesttextsis_cross_encoderreturnc                 C   sJ   t |trtjS |r"t|dkr"t |d tr"t|d dkr"tjS tjS )a\  Detect the format of input texts for proper tokenization handling.

        Returns:
            - InputFormat.SINGLE_STRING: Regular single text like "Hello world"
            - InputFormat.BATCH_STRINGS: Regular batch like ["Hello", "World"]
            - InputFormat.CROSS_ENCODER_PAIRS: Cross-encoder pairs like [["query", "document"]]
        r   r   )
isinstancer   r   r   lenr   r   r   )r   rU  rV  r   r   r   _detect_input_format  s   

z%TokenizerManager._detect_input_formatinput_formatc                 C   s"   |t jkr|gS |t jkr|S |S )z9Prepare input for the tokenizer based on detected format.r   r   r   )r   rU  r[  r   r   r   _prepare_tokenizer_input'  s
   

z)TokenizerManager._prepare_tokenizer_input	input_idstoken_type_idsoriginal_batch_sizec                 C   sH   |t jt jfv r |dkr |r|d ng }|r|d nd}||fS ||fS )z<Extract results from tokenizer output based on input format.rb   r   Nr\  )r   r^  r_  r[  r`  single_input_idssingle_token_type_idsr   r   r   _extract_tokenizer_results2  s   z+TokenizerManager._extract_tokenizer_resultsFc                    s  |r| j du rtd| ||}| ||}t|ts!t|nd}|r)d|ini }| jduo4|tj	k}|r_t
d | jj|d fi |I dH }|d g}	|r\|dr\|d gnd}
n!t
d	t| d
 | j |fi |}|d }	|r~|dnd}
| |	|
||S )a  
        Tokenize text(s) using the appropriate tokenizer strategy.

        This method handles multiple input formats and chooses between async dynamic
        batch tokenizer (for single texts only) and regular tokenizer.

        Args:
            texts: Text input in various formats:

                   Regular cases:
                   - Single string: "How are you?"
                   - Batch of strings: ["Hello", "World", "How are you?"]

                   Cross-encoder cases (sentence pairs for similarity/ranking):
                   - Single pair: [["query text", "document text"]]
                   - Multiple pairs: [["q1", "d1"], ["q2", "d2"], ["q3", "d3"]]

            is_cross_encoder: Whether to return token_type_ids for cross-encoder models.
                             Enables proper handling of sentence pairs with segment IDs.

        Returns:
            Single input cases:
                Tuple[List[int], Optional[List[int]]]: (input_ids, token_type_ids)
                Example: ([101, 2129, 102], [0, 0, 0]) for single text
                Example: ([101, 2129, 102, 4068, 102], [0, 0, 0, 1, 1]) for cross-encoder pair

            Batch input cases:
                Tuple[List[List[int]], Optional[List[List[int]]]]: (batch_input_ids, batch_token_type_ids)
                Example: ([[101, 2129, 102], [101, 4068, 102]], None) for regular batch

            Note: token_type_ids is None unless is_cross_encoder=True.
        Nz7texts cannot be empty and tokenizer must be initializedrb   return_token_type_idsz3Using async dynamic batch tokenizer for single textr   r^  r_  zUsing regular tokenizer for z inputs)r   
ValueErrorrZ  r]  rX  r   rY  r   r   r   loggerdebugencoder   rc  )r   rU  rV  r[  tokenizer_inputr`  tokenizer_kwargsuse_async_tokenizerresultr^  r_  encodedr   r   r   _tokenize_textsJ  s@   &


z TokenizerManager._tokenize_textsc           	         s  d}|j }d}t|to|j}|jdur$| jjstd|j}|j}n|jdur-|j}n| j	du r6td| 
||I dH \}}| jr| r|jdurXt|jtsX|jg|_|jdurht|jtsh|jg|_|jdurxt|jtsx|jg|_| | d}| jjr| jjdv r| jjr| jj|j| j|p|dI dH }|du r| jj|j|j|p||| jdI dH }|rd|v r|d }tj r|rd|v r|d D ]}t|tr|  qnd}| || tt j!|j" | #||||||S )	zTokenize one request.Nzinput_embeds is provided while disable_radix_cache is False. Please add `--disable-radix-cache` when you launch the server if you want to use input_embeds as inputs.zThe engine initialized with skip_tokenizer_init=True cannot accept text prompts. Please provide input_ids or re-initialize the engine with skip_tokenizer_init=False.)zmq_to_tokenizermooncake)img_datar   prompt)
image_data
audio_datainput_text_or_idsrequest_objr   r^  mm_items)$rj   rX  r#   is_cross_encoder_requestinput_embedsr   disable_radix_cachere  r^  r   rn  r   contains_mm_inputrs  r   
video_datart  _validate_mm_limitsr  encoder_transfer_backendr  recv_mm_datar   processr   r   SGLANG_MM_PRECOMPUTE_HASHr   r5   set_pad_value_validate_one_requestrH   r6   TOKENIZErid_create_tokenized_object)	r   r\   ry  
input_textr_  rx  r^  	mm_inputsitemr   r   r   rM    s   







z&TokenizerManager._tokenize_one_requestc              	   C   s~  | j }|durt|nd}|| j7 }|| j kr>| jjr2td| d| j  d ||d= t|}ntd| d| j  d|j	d}| j
r|dur|| |kr| jjrptd| d	| d
| j  d td|| |jd< n|| }d| j  d| d| d| d	}t|t|tr| jrtdt|tr| | t|tr|jr| jjstd|jr| jjstddS dS dS )zmValidates that the input token count and the requested token count doesn't exceed the model's context length.Nr   zThe input (z4 tokens) is longer than the model's context length (z tokens). Truncating the input.z	 tokens).max_new_tokenszRequested token count (z	 input + z* new) exceeds the model's context length (z$ tokens). Truncating max_new_tokens.zDRequested token count exceeds the model's maximum context length of z" tokens. You requested a total of z	 tokens: z$ tokens from the input messages and z tokens for the completion. Please reduce the number of tokens in the input messages or the completion to fit within the limit.zThis model does not appear to be an embedding model by default. Please add `--is-embedding` when launching the server or try another model.z|The server is not configured to return the hidden states. Please set `--enable-return-hidden-states` to enable this feature.zThe server is not configured to enable custom logit processor. Please set `--enable-custom-logit-processor` to enable this feature.)r   rY  r   r   allow_auto_truncaterf  warningre  sampling_paramsr   r   r   rX  r#   r   _validate_for_matryoshka_dimr%   return_hidden_statesenable_return_hidden_statescustom_logit_processorenable_custom_logit_processor)r   r\   r^  _max_req_leninput_token_numr  total_tokens	error_msgr   r   r   r    s   






	z&TokenizerManager._validate_one_requestc                 C   sx   | j jsd S | j j D ]-\}}t|| dd }|r9t|tr$t|nd}||kr9t|  d| d| dqd S )N_datarb   z count z exceeds limit z per request.)	r   limit_mm_data_per_requestitemsr   rX  r   rY  re  
capitalize)r   r\   modalitylimitdatacountr   r   r   r}  A  s   z$TokenizerManager._validate_mm_limitsc                 C   s   |j du rdS | jjstd| jj d|j dk rtd| jjr8|j | jjvr8td| jj d| jj d|j | jjkrHtd| jj dS )	z@Validate the request for Matryoshka dim if it has the field set.NzModel 'zc' does not support matryoshka representation, changing output dimensions will lead to poor results.rb   z+Requested dimensions must be greater than 0z' only supports zP matryoshka dimensions, using other output dimensions will lead to poor results.z>Provided dimensions are greater than max embedding dimension: )
dimensionsr   is_matryoshkare  r   matryoshka_dimensionshidden_sizer   r\   r   r   r   r  P  s&   

z-TokenizerManager._validate_for_matryoshka_dim
vocab_sizec                    sx   t |d tr$|D ]}t fdd|D r!td| d  dq	d S t fdd|D r:td| d  dd S )Nr   c                 3       | ]}| kV  qd S r  r   .0idr  r   r   	<genexpr>s      z@TokenizerManager._validate_input_ids_in_vocab.<locals>.<genexpr>zThe input_ids z. contains values greater than the vocab size (z).c                 3   r  r  r   r  r  r   r   r  y  r  )rX  r   anyre  )r   r^  r  seqr   r  r   _validate_input_ids_in_vocabl  s   z-TokenizerManager._validate_input_ids_in_vocabr  ry  r  c                 C   sV  | j ri | j |j}n|j}| jdi |}|| j || jj t|t	r|j
r4tdi |j
nd}	t|||||j|j|j|j|jf	i d|jd|jd|jd|jd|jd|jd|d	|	d
|jd|jd|jd|jd|jd|jd|jd|jd|j d|j!}
|
S t|t"rt#||||||j|j|j$|j|jd
}
|
S )z9Create a tokenized request object from common parameters.Nr  http_worker_ipcbootstrap_hostbootstrap_portbootstrap_roomlora_idry  session_paramsr  require_reasoningr  return_routed_expertsdata_parallel_rankpriority	extra_keyrouting_keyneed_wait_for_imagenum_items_assigned)r  r  r  r  r  r   )%r   r  r<  	normalizer   verifyr   r  rX  r%   r  r*   r,   return_logproblogprob_start_lentop_logprobs_numtoken_ids_logprobstreamr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r#   r+   r  )r   r\   r  r^  ry  r  r_  sampling_kwargsr  r  rQ  r   r   r   r  ~  s   

+
z)TokenizerManager._create_tokenized_object
batch_sizec                    s  t d| d | s fddt|D I dH S |   fddt|D }dd |D }tdd	 |D }||I dH \}}g }t|D ]/\}	}
 |	 ||	  |durg||	 nd}|	
|
|
j||	 dd| ttj|
j qQt d
| d |S )z/Handle batch tokenization for text inputs only.z Starting batch tokenization for z text requestsc                    s"   g | ]}  | I d H qS r  rM  r  ir\   r   r   r   
<listcomp>  s     z@TokenizerManager._batch_tokenize_and_process.<locals>.<listcomp>Nc                       g | ]} | qS r   r   r  r\   r   r   r        c                 S      g | ]}|j qS r   )rj   r  reqr   r   r   r        c                 s   s     | ]}t |to|jV  qd S r  )rX  r#   rx  r  r   r   r   r    s
    
z?TokenizerManager._batch_tokenize_and_process.<locals>.<genexpr>zCompleted batch processing for z	 requests)rf  rg  _batch_has_textrange(_validate_batch_tokenization_constraintsr  rn  	enumerater  appendr  rj   rH   r6   r  r  )r   r  r\   requestsrU  rx  input_ids_listtoken_type_ids_listtokenized_objsr  r  r_  r   r  r   _batch_tokenize_and_process  s4   z,TokenizerManager._batch_tokenize_and_processc                 C   sX   t |D ]%}| jr||  rtd|| jdurtd|| jdur)tdqdS )z7Validate constraints for batch tokenization processing.zKFor multimodal input processing do not set `enable_tokenizer_batch_encode`.NziBatch tokenization is not needed for pre-tokenized input_ids. Do not set `enable_tokenizer_batch_encode`.z^Batch tokenization is not needed for input_embeds. Do not set `enable_tokenizer_batch_encode`.)r  r   r{  re  r^  ry  r   r  r\   r  r   r   r   r    s   z9TokenizerManager._validate_batch_tokenization_constraintsc                 C   s:   t |D ]}|| jr dS | jr||  r dS qdS )z6Check if any request in the batch contains text input.TF)r  rj   r   r{  r  r   r   r   r    s   
z TokenizerManager._batch_has_textc                 C   s(   |dko| j jp| j j o| || S )a  Return True if we should run the tokenizer in batch mode.

        Current policy:
        - Respect explicit server flag `enable_tokenizer_batch_encode`.
        - Or, if no request has text or multimodal input (all use pre-tokenized input_ids or input_embeds), batch the requests without tokenization.
        - Batch tokenization does not support DP attention yet, and it will make everything goes to the first rank currently
        r   )r   enable_tokenizer_batch_encodeenable_dp_attentionr  )r   r  r  r   r   r   _should_use_batch_tokenization  s
   
z/TokenizerManager._should_use_batch_tokenizationrQ  r]   c                 C   sr   t tj|j t|j|_t|}| j| | j	g dt
 ||d}t |_|| j|j< ttj|jdd |S )NFr]   T)thread_finish_flag)rI   r6   TOKENIZER_DISPATCHr  rD   trace_contextr1   r   
send_pyobjr?  r   r   rB  rf   r   rH   )r   r\   rQ  r]   rR  r   r   r   rN  !  s   

z"TokenizerManager._send_one_requestr  c           	      C   sr   t |d trt|d}nt|d}| j| t|D ]\}}|| }| jg dt	 ||d}|| j
|j< qdS )zPSend a batch of tokenized requests as a single batched request to the scheduler.r   )batchFr  N)rX  r,   r    r   r   r  r  r?  r   r   r   r  )	r   r\   r  r]   	batch_reqr  rQ  tmp_objrR  r   r   r   _send_batch_request5  s   	
z$TokenizerManager._send_batch_requestrR  c                 C  sF  t |dd}	 ztj|j tdI dH  W n& tjy=   |dur;|js;| I dH r;| 	|j
 td|j
Y qw |jd }g |_|jr|jsXt |_|j|d d	< | jj||| jj|d
 | j rst| j|| t|d dtr|d d }|ddkr|dtjkr|st|d |V  dS |ddkr|dtjtjfv r|jj
| j v r| j |jj
= | j!j"r|jj#r| j$%|jj&I dH  |st'j(|d |d d|V  dS |V  dS |j)  |r|jst |_|j|d d	< |V  n|dur"|js"| I dH r"| 	|j
 td|j
q)z%Wait for the response of one request.r  FT)timeoutNzMRequest is disconnected from the client side (type 1). Abort request obj.rid=	meta_inforg   )is_multimodal_genr@  finish_reasontypeabortstatus_codemessage)r  detailzMRequest is disconnected from the client side (type 3). Abort request obj.rid=)*r   r   rI  r[   wait_REQUEST_STATE_WAIT_TIMEOUTTimeoutError
backgroundis_disconnectedabort_requestr  re  rY   rZ   rg   rB  r   log_finished_requestr   r  r  exporter_enabledcreate_taskwrite_recordrX  r   dictr   BAD_REQUESTSERVICE_UNAVAILABLEINTERNAL_SERVER_ERRORr\   r   r   enable_lora	lora_pathr  releaser  fastapiHTTPExceptionclear)r   r\   rR  r@  	is_streamoutr  r   r   r   rO  L  s   




	



z#TokenizerManager._wait_one_responsec              	     s@   j }g }g }t dddkr| rF| I d H } || t|D ]} | }	||	j|	j	 | ||	j	 q*nt
drPtjdnt 2 t|D ]%} | }	|	I d H }
|	|
|}||	|| ||	j	 qXW d    n1 sw   Y  n|dkrtd  fddt|D }tjfd	d
|D  I d H }t|D ]6}t|| }	t|| }
|	 |
_	t|
j|
_d|
j_d|
_|	|
|}|	|| I d H  qt|D ]4}t jD ],}t|| }	t|| }
|	 |
_	|	|
|}||	|| ||	j	 qqt do- j}|sCtjdd
 |D  I d H }|V  d S dd t|D }dd |D }|rtj| tjdI d H \}}|D ]1}| |}z|! }||d d  |d< |V  t"| }|||< W qg t#y   Y qgw |sVd S d S )Nparallel_sample_numrb   !SGLANG_ENABLE_COLOCATED_BATCH_GEN)r      zSending a single large batch with parallel sampling (n > 1) has not been well optimized. The performance might be better if you just duplicate the requests n times or use many threads to send them one by one with parallel sampling (n > 1).c                    r  r   r   r  r  r   r   r    r  z:TokenizerManager._handle_batch_request.<locals>.<listcomp>c                 3   s    | ]}  |V  qd S r  r  )r  r\   r   r   r   r    s    z9TokenizerManager._handle_batch_request.<locals>.<genexpr>r   Fr  c                 s   s    | ]}|  V  qd S r  )	__anext__r  genr   r   r   r    r  c                 S   s   i | ]\}}||qS r   r   )r  r  r  r   r   r   
<dictcomp>  s    z:TokenizerManager._handle_batch_request.<locals>.<dictcomp>c                 S   s   i | ]
}t | |qS r   )r   r  r  r  r   r   r   r    s    )return_whenr  r  index)$r  r   r  r  r  r  r  rO  r   r  rL   r8   r   r   rM  rN  rf  r  r   gathercopyregenerate_ridr  r  r  r  r	  hasattrr  r  keysFIRST_COMPLETEDpoprl  r  StopAsyncIteration)r   r\   r@  r]   r  
generatorsridsr  r  r  rQ  rR  objsr  r  outputsrid_to_indextask_mapdonetaskr  rl  new_taskr   r  r   rP    s   


	


z&TokenizerManager._handle_batch_requestri   r  	abort_allc                 C   sH   |s	|| j vr	d S t||d}| j| | jr"| j| jj d S d S )N)r  r#  )r   r   r   r  r   r)  observe_one_aborted_requestr  )r   r  r#  r  r   r   r   r  #  s   zTokenizerManager.abort_requestc              	      s   | j 4 I d H 9 d| _|jdkr| j|I d H  n	 | jdd | j I d H }|s-n	t	dI d H  qW d   I d H  d S 1 I d H sGw   Y  d S )NTr  r#  g      ?)
r  r  moder   r  r  r  	is_lockedr   sleep)r   r\   r'  r   r   r   pause_generation.  s   
.z!TokenizerManager.pause_generationc              	      sd   | j 4 I d H  d| _| j|I d H  | j   W d   I d H  d S 1 I d H s+w   Y  d S )NF)r  r  r   r  
notify_allr  r   r   r   continue_generation=  s   .z$TokenizerManager.continue_generationc              	      s  |    |jd u r| jj|_td|j |jr| jdd | j4 I d H  | j}W d   I d H  n1 I d H s:w   Y  |sE| j	j
nt }|4 I d H  | |I d H \}}}W d   I d H  n1 I d H sjw   Y  |r|jd ur| |j |d|j d7 }|||fS )Nz$Start update_weights. Load format=%sTr%  z Weight version updated to .)rC  load_formatr   rf  infoabort_all_requestsr  r  r  r  writer_lockr    _wait_for_model_update_from_diskweight_version"_update_weight_version_if_provided)r   r\   r@  	is_pausedlock_contextsuccessr  num_paused_requestsr   r   r   update_weights_from_diskC  s(   

(
(
z)TokenizerManager.update_weights_from_diskr   r-  c                 C   s    || _ || j_|| j_|| _d S r  )r   r   r   r-  )r   r   r-  r   r   r   _update_model_path_infod  s   
z(TokenizerManager._update_model_path_infoc                    s   | j | t | _| jjdkr+| jI d H }|jr#| |j	|j
 |j|j|jfS g | _| jI d H }tdd |D }|du rI| |j	|j
 dd |D }d|}dd |D }|||fS )Nrb   c                 S   r  r   )r6  r  rr   r   r   r  x  r  zETokenizerManager._wait_for_model_update_from_disk.<locals>.<listcomp>Tc                 S   r  r   )r  r:  r   r   r   r  {  r  z | c                 S   r  r   )r7  r:  r   r   r   r  }  r  )r   r  r   Futurer  r   dp_sizer6  r9  r   r-  r  r7  model_update_tmpalljoin)r   r\   rl  all_successall_messageall_paused_requestsr   r   r   r1  j  s"   


z1TokenizerManager._wait_for_model_update_from_diskc                 C   sb   | j j|j|j|jd |jd ur|j| _|jd ur|j| _|jd ur'|j| _t	d| d S )N)r   r   r   zConfig logging: obj=)
r   	configurer   r   r   r   r   r   loggingr.  r  r   r   r   configure_logging  s   


z"TokenizerManager.configure_loggingc                    s   | j t  td dS )zESend a freeze_gc message to the scheduler first, then freeze locally.zTokenizer ManagerN)r   r  r$   rK   r   r   r   r   rK     s   zTokenizerManager.freeze_gcc                    s"    fdd}t  }|| |S )Nc                     sB   t dI d H   jr j d S  jD ]} |  qd S )Nr   )r   r(  rL  r  r  )r  r  r   r   r    s   
z9TokenizerManager.create_abort_task.<locals>.abort_request)r   add_task)r   r\   r  background_tasksr   r  r   create_abort_task  s   
z"TokenizerManager.create_abort_taskc                 C   s   | j d urd S t }| j|t| j || _ t t	 u r6| 
| }|tj|j |tj|j | j|t| j d S r  )r   rM   r   addr  print_exception_wrapperhandle_loop	threadingcurrent_threadmain_threadr>  add_signal_handlersignalSIGTERMsigterm_handlerSIGQUITrunning_phase_sigquit_handlersigterm_watchdog)r   loopsignal_handlerr   r   r   rC    s    

z(TokenizerManager.auto_create_handle_loopc                    s^   	 | j   | j I dH }W d   n1 sw   Y  | | t | _| j   q)z$The event loop that handles requestsTN)r.  disabler   
recv_pyobjr:  rB  r   feedr   recv_objr   r   r   rL    s   


zTokenizerManager.handle_loopr]  c              
   C   sl  t |jD ]\}}| j|d }|d u rtd|d q||j| |j| | jj	|j
| d}| jrU| |d|| | |d|| | |d|| | |d|| t|jdd	rr| |||jj|jj|jjon| jj || t|ts||j| |j| d
 t|dr|jr|j| |d< t|dd r|j| |d< t|dd r|j| |d< t|dd r|j D ]
\}}|| ||< qt|tr| j |j!| 7  _ t|jdd	}| jj"r|r|j#$|j#|  |j#|j%d  }	t&|j#|_%n|j#$|j#|  |j#' }	|j |	|d}
n[t|t(rPt|jdd	}| jj"r<|r<|j#$|j#|  |j#|j%d  }	t&|j#|_%n|j#$|j#|  |j#' }	|	|d}
nt|t)rZt*dt|tsbJ |j+| |d}
|j| d u|_,|j,rt-- |_.t-/ |_0|j.|j1 |d< | jj2r| 3||| | jr| 4|||| t5|t6|j.d | 7|||d | j|= | jj8r|jj9rt:;| j<=|jj> |j?@|
 |jAB  | jr|jjCr| D||| | jEr|j,r|jjCr| F||
 | jGr|j,r|jjCr| H||
 q| jjIdkr0t|tt(fr2|jJd ur4tK|jJgd}| jLM| d S d S d S d S )NzReceived output for rid=z/ but the state was deleted in TokenizerManager.)r  r  prompt_tokensr2  total_retractions
queue_timeprefill_launch_delayprefill_launch_latencyprefill_finished_tsr  F)completion_tokenscached_tokenscached_tokens_detailsoutput_hidden_stateshidden_statesrouted_expertscustomized_infor  rj   rl   r  )rl   r  z"BatchMultimodalOut not implemented)	embeddingr  e2e_latency    eA)tsattrsrb   )loads)Nr  r  r   r   rf  errorfinished_reasonsr^  r   r2  retraction_countsr   _add_metric_if_presentr   r\   convert_logprob_styler  r  return_text_in_logprobsr   rX  r   r(  rd  re  r  rf  rg  ri  rj  r  r   rj   output_strsstream_outputrl   extendrh   rY  r  r   r   NotImplementedError
embeddingsrZ   rB  r_   perf_counterrd   r]   r    _calculate_spec_decoding_metrics_calculate_timing_metricsrE   r   convert_to_span_attrsr  r  r   r  r  r  r  rY   r  r[   r   log_metricscollect_metricsr   dump_requestsr   record_request_for_crash_dumpr=  loadr/   r   r  )r   r]  r  r  rR  r  kvr  output_token_idsout_dictload_update_reqr   r   r   r5    s   	







z%TokenizerManager._handle_batch_outputr  r  r  rw  c                 C   s  t |jt |jkr%|j| |jt |jd  |jt |jd  | t |jt |jkrJ|j| |jt |jd  |jt |jd  | |j|d< |j|d< |dkrt |j	t |j
kr}|j
| |j	t |j
d  |jt |j
d  | t |jt |jkr|j| |jt |jd  |jt |jd  | |j
|d< |j|d< |d urt |jt |jkr|j| |jt |jd  |jt |jd  | t |jt |jkr|j| |jt |jd  |jt |jd  | |j|d< |j|d< d S d S )Nry   rz   r   r{   r|   r}   r~   )rY  rm   ry   rz  detokenize_logprob_tokensrn   ro   rz   rp   rq   r{   detokenize_top_logprobs_tokensrr   rs   r|   rt   ru   r}   rv   rw   r~   rx   )r   r  rR  r  r  rw  r   r   r   add_logprob_to_meta_infoc  s   	







z)TokenizerManager.add_logprob_to_meta_inforecv_obj_indexc                 C   sL  |j d u rd S t|j dkr'|j | d ur'|j |j |  |j|j|  |j|j|  |j|j|  |dkrht|jdkrV|j|j|  |j|j|  |j|j|  |j	|j	|  |d urt|j
dkr|j
|j
|  |j|j|  |j|j|  |j|j|  | |||jj|jj| d S )Nr   )rm   rY  rz  rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   r  r\   r  r  )r   r  rR  r  r  rw  r]  r  r   r   r   rv    sf   

z&TokenizerManager.convert_logprob_styletoken_logprobs_valtoken_logprobs_idxdecode_to_textc                 C   sB   |sdd t ||D S | jd usJ | j|}tt |||S )Nc                 S   s   g | ]	\}}||d fqS r  r   )r  logprobtoken_idr   r   r   r    s    z>TokenizerManager.detokenize_logprob_tokens.<locals>.<listcomp>)zipr   batch_decoder   )r   r  r  r  token_textsr   r   r   r    s   z*TokenizerManager.detokenize_logprob_tokensc                 C   sJ   g }t t|D ]}|| r|| || || | q|d  q|S r  )r  rY  r  r  )r   r  r  r  retr  r   r   r   r    s   z/TokenizerManager.detokenize_top_logprobs_tokensr  c                 C   s   t |drg|j| dkrit |drkt|j|krm| jjd }|j| | }|j| }|dkrO|| |d< |j| |j|  |d< ||d< ||d< |j| |d< |jrot|j|krq|j| rs|j| |d	< d
S d
S d
S d
S d
S d
S d
S d
S )z^Calculate speculative decoding metrics, such as acceptance rate and acceptance length metrics.spec_verify_ctr   spec_accepted_tokensrb   spec_accept_ratespec_accept_lengthspec_accept_token_numspec_draft_token_numspec_accept_histogramN)r  r  rY  r  r   r   rd  spec_acceptance_histogram)r   r  r]  r  num_guess_tokenstotal_draft_tokensaccepted_tokensr   r   r   r~  !  s:   
z1TokenizerManager._calculate_spec_decoding_metricsc                 C   s   |j dkr
|j |d< |jdkr|j|d< |jdkr|j|d< |jdkr(|j|d< t|drH|jrH|j| durH|jdkrH|j|j|  }||d	< |jdkrq|jdkrst|t	su|j
| dkrw|j|j }|j
| }|| |d
< dS dS dS dS dS )zfCalculate request-level timing metrics, such as inference time, decode throughput, and time per token.r   request_received_tsrf   rg   decode_finished_tsforward_entry_timeNr^   inference_timedecode_throughput)r]   rf   rg   r_   r  r  rd   re   rX  r   rd  )r   r  rR  r]  r  r  decode_timerd  r   r   r   r  M  s8   










z*TokenizerManager._calculate_timing_metrics	attr_namer  c                 C   sH   t ||rt||r t||| dur"t||| ||< dS dS dS dS )ab  Add a metric to meta_info if it exists and is not None.

        Args:
            recv_obj: The received object that may contain the metric attribute
            attr_name: The name of the attribute to check
            meta_info: The dictionary to add the metric to
            index: The index to access the metric value in the attribute list
        N)r  r   )r   r]  r  r  r  r   r   r   ru  {  s   z'TokenizerManager._add_metric_if_presentc                 C   s8   |j dd p|j dd p|j dd p|j dd S )Njson_schemaregexebnfstructural_tag)r  r   r  r   r   r   _request_has_grammar  s   z%TokenizerManager._request_has_grammarc              
   C   sX  t |dd r|j| nd}t |jdd }|ri | jj|n| jj}|jdkrI| jtjkrIt		  |_|_
t	 |_||_| j||j|j  n||j }|rgt		 }||j
 }	| j||	| ||_
||_|jrt |dd r||t|jk r||j| nd}
d }t|dr|jr|j| }| j||j| ||j| |j|j | |j|
| d S d S )Nrd  r   custom_labelsr^   rt  rf  )r   rd  r\   r)  r  r`   r  r   PREFILLrB  ra   r}  re   rc   observe_time_to_first_tokenr]   observe_inter_token_latencyrZ   rY  rt  r  rf  observe_one_finished_requestr^  re  r_   r  )r   rR  r]  r  rd  r  r  num_new_tokensnew_timeintervalretraction_countrf  r   r   r   r    sh   









z TokenizerManager.collect_metricsr  c                 C   s|   | j |j||jt f t| j | jkr<tj	| j
t dd }| j| j |dt| j  d| d g | _ d S d S )N%Y-%m-%d_%H-%M-%S.pklzDump z requests to )	data_listfilenamelog_message)r   r  r\   r]   rB  rY  r   r   pathr@  r   r   nowstrftime_dump_data_to_file)r   rR  r  r  r   r   r   r    s   
zTokenizerManager.dump_requestsc                 C   sr   t   }| j|j||j|f | jr3|| jd d  dkr7| j  | jr5|| jd d  dksd S d S d S d S )Nr   r   i,  )rB  r  r  r\   r]   popleft)r   rR  r  current_timer   r   r   r    s   
z.TokenizerManager.record_request_for_crash_dumpr  r  r  c                    s<   t | | j| d fdd}tt| d S )N)r   r  c                     sR   t jt j dd t d} t|  W d    d S 1 s"w   Y  d S )NTexist_okwb)r   makedirsr  dirnameopenpickledump)fr  to_dump_with_server_argsr   r   background_task  s   "z<TokenizerManager._dump_data_to_file.<locals>.background_task)rf  r.  r   r  r   r  	to_thread)r   r  r  r  r  r   r  r   r    s   
z#TokenizerManager._dump_data_to_fileHOSTNAMEhostnamec           	      C   sZ  | j sd S | jrtd d S d| _td| j  g }| jr&|| j g }| j D ]\}}|j	sI|
|j|jr@|jd ni |jt f q-|rQ|| |sUd S tj| j |dt d d}tjtj|dd | j|d	tjd
}t|d}t|| W d    n1 sw   Y  tdt| j dt| d|  |S )NzPSIGTERM/SIGQUIT/Exception triggered, but crash dump already performed, skipping.Tz6Dumping requests before crash. self.crash_dump_folder=r  crash_dump_r  r  r   )r   r  launch_commandr  zDumped z finished and z% unfinished requests before crash to )r   r  rf  r.  rr  r  rz  r   r  rZ   r  r\   rY   r]   rB  r   r  r@  r   r  r  r  r  r   sysargvr  r  r  rY  )	r   r  data_to_dumpunfinished_requestsr  rR  r  data_to_dump_with_server_argsr  r   r   r   dump_requests_before_crash  sX   

z+TokenizerManager.dump_requests_before_crashc                    s   | j stdI d H  | j r	 t| j}t| j }| jtj	kr0t
d |   |   n-tdr>t
d |   nt
d| d|d |d	krWtdI d H  n|   nqtt dd
 td	 d S )N   TzASignal SIGTERM received while health check failed. Force exiting.SGL_FORCE_SHUTDOWNzESignal SIGTERM received while force shutdown flag set. Force exiting.z3Gracefully exiting... Remaining number of requests z$. Remaining requests remaining_rids=r,  r   include_parent)r   r   r(  rY  r   r   r  r   r   	UnHealthyrf  rr  r  force_exit_handlerrL   r.  rO   r   getpidr  exit)r   remain_num_reqremaining_ridsr   r   r   rV  A  s<   
z!TokenizerManager.sigterm_watchdogc                 C   s   dS )z&Put some custom force exit logic here.Nr   r   r   r   r   r  g  s   z#TokenizerManager.force_exit_handlerc           	      C   s   t |rd S | j|j }d|_t |_|jpd}d|d}|jr$|j}|j|| jj	|j|j
 d}t|jdd}t|jddrT| |||jj|jj|jjoR| jj  |j}t||d	< |rlt|d
krj|d gng }|j||d}|j| |j  d S )NTzAbort in waiting queuer  )r  r  )r  r  r2  rm  r  Fr  rd  r   r  rk  )r7   r   r  rZ   rB  r_   abort_messagefinished_reasonr   r2  r]   r   r\   r  r  r  rw  r   rl   rY  rj   rY   r  r[   r   )	r   r]  rR  r  r  r  r  rl   r  r   r   r   r6  k  sH   


	z"TokenizerManager._handle_abort_reqranksc                 C   s   | j | d S r  )r   r  )r   r  r   r   r   r9    s   z$TokenizerManager.update_active_ranksc                 C   s(   | j |j |jr|j d S d  d S r  )r   
session_id
set_resultr6  r\  r   r   r   r7    s
   
z0TokenizerManager._handle_open_session_req_outputc                 C   sP   | j jdkr| j| d S | j| t| j| j jkr&| j| j d S d S )Nrb   )r   r=  r  r  r>  r  rY  r\  r   r   r   r8    s   z<TokenizerManager._handle_update_weights_from_disk_req_outputc                    s`   |j sd S | jjs&t|j tr|j n
tdd |j D d }td| d| |I d H  d S )Nc                 s   s    | ]}|r|V  qd S r  r   )r  ar   r   r   r    r  z>TokenizerManager._validate_and_resolve_lora.<locals>.<genexpr>zLoRA adapter 'z' was requested, but LoRA is not enabled. Please launch the server with --enable-lora flag and preload adapters using --lora-paths or /load_lora_adapter endpoint.)r  r   r  rX  r   nextre  _resolve_lora_path)r   r\   first_adapterr   r   r   rK    s   

z+TokenizerManager._validate_and_resolve_lorac                    s$  t |jtrt|jg}nt|j}| jjd ur/t|| jjkr/tdt| d| jj | j	|I d H }|D ]J}|d u rAq:|| j
vrTtd| d| j
  dtd|  | j
| }| t|j|j|jdI d H }|jsd|jvrtd	| d
|j q:| j|jI d H |_d S )NzReceived request with z0 unique loras requested but max loaded loras is z-Got LoRA adapter that has never been loaded: z
All loaded adapters: r,  zReloading evicted adapter: )r  r  pinnedzalready loadedz'Failed to implicitly load LoRA adapter z: )rX  r  r   r   r   max_loaded_lorasrY  re  r  get_unregistered_lorasr  r  rf  r.  load_lora_adapterr'   r  r  r6  error_messageacquirer  )r   r\   unique_lora_pathsunregistered_lorasr  new_lora_refload_resultr   r   r   r    sP   





z#TokenizerManager._resolve_lora_pathc                 C   s
  d }|rd|j v rt|j d  nt|j }n|jr|j}|jrIt|dr(|jnd }t|j|t	|d | j
j|d td|jt	|d dd d S tt|jD ]2}t|dr_|jr_|j| nd }t|j| |t	|d | j
j|d td|j| t	|d dd qPd S )Nr  r  rn  )ro  roleexternal_trace_headerri   T)ro  	anonymous)headersrG   rC   r  rL  r  r  rF   r  r   r   r  rI   r  rY  )r   r\   r]   r@  r  r  r  r   r   r   rE    sH   


z%TokenizerManager._trace_request_startc                 C   s:   t |tr| jjdkr| r| j| dS dS dS dS )z0Handle EPD-disaggregation mode encoding request.zmq_to_schedulerN)rX  r%   r   r~  r{  r  send_encode_requestr  r   r   r   rF  	  s   z:TokenizerManager._handle_epd_disaggregation_encode_requestc                 C   s  i }| j s|S |j| |tj< |j| |tj< |j| |tj< |jj	r)t
|jj	nd|tj< |jjp3i }|d }r@||tj< |d }rL||tj< |d }rX||tj< |d }	rd|	|tj< |d }
rp|
|tj< | j|tj< |j| r|j| dnd}|rt|g|tj< |jr|jr|j|j |tj< |jr|jr|j|j |tj< |jr|jr|j|j |tj < |j!r|jr|j|j! |tj"< |j!r|jr|j|j! |tj#< |S )z&Convert attributes to span attributes.Nr  top_ptemperaturetop_knr  )$r   rd  rB   GEN_AI_USAGE_COMPLETION_TOKENSr^  GEN_AI_USAGE_PROMPT_TOKENSre  GEN_AI_USAGE_CACHED_TOKENSr\   r  r   GEN_AI_REQUEST_IDr  r   GEN_AI_REQUEST_MAX_TOKENSGEN_AI_REQUEST_TOP_PGEN_AI_REQUEST_TEMPERATUREGEN_AI_REQUEST_TOP_KGEN_AI_REQUEST_Nr   GEN_AI_RESPONSE_MODELrs  jsondumpsGEN_AI_RESPONSE_FINISH_REASONSr`   r]   "GEN_AI_LATENCY_TIME_TO_FIRST_TOKENr_   GEN_AI_LATENCY_E2Ere   rd   #GEN_AI_LATENCY_TIME_IN_MODEL_DECODErf   &GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE$GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL)r   rR  r]  r  
span_attrsr  r  r  r  r   r  r  r   r   r   r  (	  sb   












z&TokenizerManager.convert_to_span_attrsr  )F)NNNNN)ri   F)ir   r   r   r   r?   r>   r   r   r   r   r   r   r   r   r   r   r   r   r%   r#   r   r  RequestrT  r   r   r   r   rZ  r]  r   r   rc  rn  rM  r  r}  r  r  r   r
   r,   r+   r  r  r  r  r  rN  r  rX   rO  rP  r  r)   r)  r"   r+  r-   r8  r9  r1  r!   rF  rK   rI  rC  rL  r   r   r   r   r5  r  r  rv  r  r  r   r~  r  ru  r  r  r  r  r  r   getenvsocketgethostnamer  rV  r  r   r6  r   r9  r7  r8  rK  r  rE  rF  r  r   r   r   r   r      s   
.9!

&




P

U

R






H

-











u

e	

!

	
  
V
D


	

,


.

?

<&+	

4

0

	

r   c                   @   s   e Zd ZdZdZdZdS )r   Upr   r  N)r   r   r   r  r   r  r   r   r   r   r   	  s    r   c                    sz   z	|  I dH  W dS  t y<   t }td|  t| dr,t| jtr,| j  t	t
 dd td Y dS w )zt
    Sometimes an asyncio function does not print exception.
    We do another wrapper to handle the exception.
    Nz#TokenizerManager hit an exception: __self__Tr  rb   )	ExceptionrW   rf  rr  r  rX  r  r   r  rO   r   r  r  r  )func	tracebackr   r   r   rK  	  s   
rK  c              
   C   s   zt | j| j| j| j| j d}W |S  tyG } z)t|}d|v r:t	d| j d t | j| j| j| jdd}n|W Y d }~|S d }~ww )N)r   r   r   use_fastzdoes not have a slow versionz
Processor z= does not have a slow version. Automatically use fast versionT)
rQ   r   r   r   r   disable_fast_image_processorre  r   rf  r.  )r   r   er  r   r   r   r   	  s6   
	r   r   rW  c                 C   s   | j }|rdS dS )Ndefaultcuda_ipc)dist_init_addr)r   is_cross_noder   r   r   r   	  s   r   c                   @   s.   e Zd ZdefddZd	ddZd	ddZdS )
r=  tokenizer_managerc                 C   s
   || _ d S r  )r&  )r   r&  r   r   r   r   	  s   
zSignalHandler.__init__Nc                 C   s$   t d|d|d d| j_d S )NzSIGTERM received. signum=z frame=z(. Draining requests and shutting down...T)rf  r  r&  r   r   signumframer   r   r   rS  	  s   zSignalHandler.sigterm_handlerc                 C   s2   t d|d|d | j  tt  d S )NzSIGQUIT received. signum=z, frame=z$. It usually means one child failed.)rf  rr  r&  r  rO   r   r  r'  r   r   r   rU  	  s
   
z+SignalHandler.running_phase_sigquit_handlerr  )r   r   r   r   r   rS  rU  r   r   r   r   r=  	  s    
r=  )r   r   r  r   r  rE  r   r  rQ  r  r  rM  rB  collectionsr   
contextlibr   r   enumr   httpr   typingr   r	   r
   r   r   r   r   r  uvloopr   zmq.asyncior   sglang.srt.configs.model_configr   )sglang.srt.disaggregation.encode_receiverr   sglang.srt.disaggregation.utilsr   sglang.srt.environr   sglang.srt.lora.lora_registryr   r   1sglang.srt.managers.async_dynamic_batch_tokenizerr   +sglang.srt.managers.async_mm_data_processorr   "sglang.srt.managers.disagg_servicer   sglang.srt.managers.io_structr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   sglang.srt.managers.mm_utilsr0   r1   (sglang.srt.managers.multimodal_processorr2   r3   ,sglang.srt.managers.request_metrics_exporterr4   "sglang.srt.managers.schedule_batchr5   r6   sglang.srt.managers.schedulerr7   +sglang.srt.managers.scheduler_input_blockerr8   0sglang.srt.managers.tokenizer_communicator_mixinr9   5sglang.srt.managers.tokenizer_manager_multiitem_mixinr:   sglang.srt.metrics.collectorr;   sglang.srt.metrics.cpu_monitorr<   #sglang.srt.sampling.sampling_paramsr=   sglang.srt.server_argsr>   r?   r@    sglang.srt.speculative.spec_inforA   sglang.srt.tracing.tracerB   rC   rD   rE   rF   rG   rH   rI   sglang.srt.utilsrJ   rK   rL   rM   rN   rO   sglang.srt.utils.aio_rwlockrP   &sglang.srt.utils.hf_transformers_utilsrQ   rR   rS   sglang.srt.utils.request_loggerrT   sglang.srt.utils.watchdogrU   sglang.utilsrV   rW   set_event_loop_policyEventLoopPolicy!SGLANG_REQUEST_STATE_WAIT_TIMEOUTr   r  	getLoggerr   rf  	dataclassrX   r   r   r   rK  r   r   r=  r   r   r   r   <module>   s   $d( 


3                 Z
