o
    پi                    @   s|  d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZ ddlZddlZddlZddlZddlZddlmZ ddlmZ dd	lmZ dd
l m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZF ddlGmHZH ddlImJZJ ddlKmLZL ddlMmNZN ddlOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZ dd lmZmZmZmZmZmZ dd!lmZmZmZ dd"lmZ dd#lmZ dd$lmZmZmZ dd%lmZ dd&lmZ dd'lmZ dd(lmZ dd)lmZmZ dd*lmZ dd+lmZ dd,lmZmZ dd-lmZ dd.lmZ dd/lmZ dd0lmZmZ dd1lmZ dd2lmZ dd3lmZmZmZ dd4lmZ dd5lmZmZmZmZmZmZmZ dd6lmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ dd7lmZmZmZ dd8lmZ dd9lmZmZ eeZe>j Ze>j Ze>j Ze
G d:d; d;ZG d<d= d=eeeee'e.eeeee<ZG d>d? d?Zd@dA ZdBdC ZG dDdE dEZdFedGedHedIedJedKedLedMedNee fdOdPZdS )Q6A scheduler that manages a tensor parallel GPU worker.    N)deque)	dataclass)
HTTPStatus)AnyDequeDictListOptionalTupleUnion)Stream)StreamContext)barrier)ModelConfig)GrammarManager)DecodePreallocQueueDecodeTransferQueue"SchedulerDisaggregationDecodeMixin)DecodeKVCacheOffloadManager)MMReceiverHTTP)PrefillBootstrapQueue#SchedulerDisaggregationPrefillMixinrelease_req_to_metadata_buffer)DisaggregationModeMetadataBuffersReqToMetadataIdxAllocatorTransferBackendprepare_abort)get_pp_groupget_world_group)get_tp_group)SchedulerDllmMixin)envs)'get_global_expert_distribution_recorder)/initialize_mamba_selective_state_update_backend)compute_dp_attention_world_infoget_attention_cp_groupget_attention_tp_group)initialize_moe_config)initialize_fp4_gemm_config)initialize_fp8_gemm_config)LoRAOverlapLoader);AbortReqActiveRanksOutputAttachHiCacheStorageReqInputAttachHiCacheStorageReqOutputBaseBatchReqBaseReqBatchTokenizedEmbeddingReqInputBatchTokenizedGenerateReqInputCheckWeightsReqInputClearHiCacheReqInputClearHiCacheReqOutputCloseSessionReqInputContinueGenerationReqInput!DestroyWeightsUpdateGroupReqInputDetachHiCacheStorageReqInputDetachHiCacheStorageReqOutputDumperControlReqInputDumperControlReqOutputExpertDistributionReqExpertDistributionReqOutputExpertDistributionReqTypeFlushCacheReqInputFlushCacheReqOutputFreezeGCReqGetInternalStateReqGetInternalStateReqOutputGetLoadReqInputGetLoadsReqInputGetWeightsByNameReqInputHealthCheckOutput-InitWeightsSendGroupForRemoteInstanceReqInput.InitWeightsSendGroupForRemoteInstanceReqOutputInitWeightsUpdateGroupReqInput"LoadLoRAAdapterFromTensorsReqInput#LoadLoRAAdapterFromTensorsReqOutputLoadLoRAAdapterReqInputLoadLoRAAdapterReqOutputOpenSessionReqInputOpenSessionReqOutputPauseGenerationReqInput
ProfileReqReleaseMemoryOccupationReqInputResumeMemoryOccupationReqInputRpcReqInputRpcReqOutput#SendWeightsToRemoteInstanceReqInput$SendWeightsToRemoteInstanceReqOutputSetInternalStateReqSetInternalStateReqOutputSlowDownReqInputSlowDownReqOutputTokenizedEmbeddingReqInputTokenizedGenerateReqInputUnloadLoRAAdapterReqInputUnloadLoRAAdapterReqOutputUpdateWeightFromDiskReqInput$UpdateWeightsFromDistributedReqInputUpdateWeightsFromIPCReqInputUpdateWeightsFromTensorReqInput)init_mm_embedding_cacheunwrap_shm_features)	FutureMap)PrefillDelayer PrefillDelayerSinglePassExecutor)FINISH_ABORTModelWorkerBatchMultimodalInputsReqRequestStageScheduleBatch)AddReqResultPrefillAdderSchedulePolicy)SchedulerDPAttnMixin)SchedulerInputBlocker)RECORD_STEP_TIMEPrefillStatsSchedulerMetricsMixin)SchedulerOutputProcessorMixin)SchedulerPPMixin)SchedulerProfilerMixin)SchedulerRecvSkipper)SchedulerRuntimeCheckerMixincreate_scheduler_watchdog)SchedulerUpdateWeightsMixin)Session)GenerationBatchResultvalidate_input_length)CacheInitParams)release_kv_cache)
RadixCache)ForwardModePPProxyTensors)SchedulerMultiplexMixin)ReasoningParser)PortArgs
ServerArgsget_global_server_args)SpeculativeAlgorithm)process_tracing_inittrace_event_batch trace_set_proc_propagate_contexttrace_set_thread_infotrace_slice_batchtrace_slice_endtrace_slice_start)DynamicGradModebroadcast_pyobjconfigure_gc_loggerconfigure_logger	freeze_gcget_available_gpu_memoryget_bool_env_varget_int_env_varget_zmq_socketkill_itself_when_parent_diednuma_bind_to_nodepoint_to_point_pyobjrequire_mlp_syncset_gpu_proc_affinityset_random_seedsuppress_other_loggers)get_processorget_tokenizerget_tokenizer_from_processor)TorchMemorySaverAdapter)TypeBasedDispatcherget_exception_tracebackc                   @   s4   e Zd ZU ejed< dZeejj	 ed< dd Z
dS )EmbeddingBatchResult
embeddingsN	copy_donec                 C   s   t | jtjrt| jj | _| jjddd| _n&t | jt	s#J t
| jdkr,dS t| jd j | _dd | jD | _| j  dS )z4Copy embeddings tensor to CPU in overlap scheduling.cpuTnon_blockingr   Nc                 S   s   g | ]	}|j d ddqS )r   Tr   )to).0emb r   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/managers/scheduler.py
<listcomp>   s    z4EmbeddingBatchResult.copy_to_cpu.<locals>.<listcomp>)
isinstancer   torchTensorget_device_moduledeviceEventr   r   listlenrecordselfr   r   r   copy_to_cpu   s   z EmbeddingBatchResult.copy_to_cpu)__name__
__module____qualname__r   r   __annotations__r   r
   cudar   r   r   r   r   r   r      s   
 
r   c                   @   s  e Zd ZdZdededededededed	ed
ee fddZdd Z	defddZ
dd ZdddZdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zdefd'd(Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Ze d5d6 Ze d7d8 Zd9ede fd:d;Z!d<ede fd=d>Z"de#e$e%e&e'f  fd?d@Z(dAe#fdBdCZ)dAe#fdDdEZ*dFdG Z+dHee, fdIdJZ-dKe,fdLdMZ.d9eddfdNdOZ/dPe%fdQdRZ0dPe1fdSdTZ2dUe3fdVdWZ4ddUe3dYe fdZd[Z5dUe3de fd\d]Z6dPe3de fd^d_Z7d`da Z8dPe&fdbdcZ9dPe:fdddeZ;dUe3fdfdgZ<dee fdhdiZ=djdk Z>dee fdldmZ?dnee@ dee fdodpZAd9edee fdqdrZBdseCfdtduZD	dd9edveeE de$eFeGf fdwdxZHdyeFde$eF fdzd{ZId9ed|e$eFeGf fd}d~ZJdd ZKdPeLfddZMdPeNfddZOde fddZPdPeQdeRfddZSdPeTdeUfddZVdd ZWdd ZXdPeYfddZZdPe[fddZ\dPe]fddZ^dPe_fddZ`deae#e3 ef fddZbdPecfddZddPeefddZfdPegdehfddZidPejdekfddZldPemdenfddZodPepfddZqdPerfddZsdPetfddZudPevfddZwdPexfddZydPezfddZ{dd Z|dPe}fddZ~dPefddZdedyeFfddZdd ZdS )	Schedulerr   server_args	port_argsgpu_idtp_rankmoe_ep_rankpp_rankattn_cp_rankmoe_dp_rankdp_rankc
                 C   sR  d| _ | | || _|| _|| _|| _|| _|j| _|| _|j	| _	|	| _
|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j | _|j| _|j| _|j| _|j| _|j| _|j | _ t!"|j#| _$|| _%|j&| _&|j'| _'|j(d u| _)t*j+, | _-t.|j/| j| j| j| j\| _0| _1| _2t3|j4o| j0dk| _5| 6  | 7|||	 | 8| | jr| 9  | :  | ;  | <  | =  t*j>,  }
dkrt?@|
 | A  | B  | C  | D  | E  | F  | G  | H  | I  | J  | K  | jrtL| jMjNjO| _PtQ| | _Rd| _ d S )NTr   F)Sis_initializinginit_soft_watchdogr   r   r   r   r   attn_cp_sizer   moe_dp_sizer   tp_sizeep_sizemoe_ep_sizepp_sizedp_size	nccl_portschedule_policyenable_priority_schedulingabort_on_priority_when_disabled"schedule_low_priority_values_first(priority_scheduling_preemption_thresholdenable_loraenable_lora_overlap_loadingmax_loras_per_batchdisable_overlap_scheduleenable_overlapenable_pdmuxskip_tokenizer_initenable_metrics!enable_metrics_for_all_schedulersenable_tracestream_intervalr   from_stringspeculative_algorithmspec_algorithmr   	page_sizeenable_hierarchical_cachehicache_storage_backendenable_hicache_storager#   "SGLANG_SCHEDULER_MAX_RECV_PER_POLLgetmax_recv_per_pollr&   enable_dp_attentionattn_tp_rankattn_tp_sizeattn_dp_rankboolkv_events_configenable_kv_cache_eventsinit_model_configinit_metricsinit_ipc_channels
init_pdmuxinit_tokenizerinit_moe_gemm_configinit_mamba_backendinit_model_worker SGLANG_TEST_STUCK_SCHEDULER_INITtimesleepinit_cache_with_memory_poolinit_running_statusinit_chunked_prefillinit_diffusion_llminit_schedule_policy)init_watch_dog_memory_saver_input_blockerinit_profilerinit_disaggregationinit_overlap#init_deterministic_inference_configinit_request_dispatcherr,   	tp_workermodel_runnerlora_managerlora_overlap_loaderr   grammar_manager)r   r   r   r   r   r   r   r   r   r   tr   r   r   __init__  s   






zScheduler.__init__c                 C   s   t | j| _d S N)r   from_server_argsr   model_configr   r   r   r   r     s   zScheduler.init_model_configc                 C   s  t d}d | _| jdkrc| jdkrc| jdkrct|t j|jd| _	t|t j
|jd| _t|t j|jd}| jjrBt|t j|jd}n	t|t j|jd}t|| _t|| _| jjrbt| j	| jg| _nd | _	d | _td | _td | _| jrt|t j|jd| _d S d S )N   r   F)zmqContextidle_sleeperr   r   r   r   PULLscheduler_input_ipc_namerecv_from_tokenizerDEALERrpc_ipc_namerecv_from_rpcPUSHtokenizer_ipc_namer   r   detokenizer_ipc_nameSenderWrappersend_to_tokenizersend_to_detokenizersleep_on_idleIdleSleeper!current_scheduler_metrics_enabledmetrics_ipc_namesend_metrics_from_scheduler)r   r   contextr-  r.  r   r   r   r    sJ   





zScheduler.init_ipc_channelsc                 C   s   | j }| jj| _|jrd  | _| _n(| jjr-t|j|j	|j
|j|j d| _t| j| _nt|j|j	|j
|jd| _| j jrY| jr[t| j jdd}| jj|jjddd | j_d S d S d S )N)tokenizer_modetrust_remote_coderevisionuse_fast)r5  r6  r7  F)
model_typestream_reasoning)add_special_tokensr   )r   r  is_generationr   	tokenizer	processoris_multimodalr   tokenizer_pathr5  r6  r7  disable_fast_image_processorr   r   reasoning_parserr   encodedetectorthink_end_tokenthink_end_id)r   r   rB  r   r   r   r    s:   
zScheduler.init_tokenizerreturnNc                 C   s   t | j d S r  )r%   r   r   r   r   r   r    s   zScheduler.init_mamba_backendc                 C   sL   t | jjd| jj}t|drt| j t| j t| j t| j| _d S )Ntext_confignum_experts_per_tok)	getattrr  	hf_confighasattrr)   r   r+   r*   r   )r   config_to_checkr   r   r   r    s   



zScheduler.init_moe_gemm_configc                 C   s>   ddl m} || j| j| j| j| j| j| j| j	| j
d	| _d S )Nr   )TpModelWorker)	r   r   r   r   r   r   r   r   r   )sglang.srt.managers.tp_workerrN  r   r   r   r   r   r   r   r   r   r  )r   rN  r   r   r   init_tp_model_worker  s   zScheduler.init_tp_model_workerc                 C   s   | j  r
d | _d S t| j| j| j| j| j| j	| j
| j| jd	}| jjd ur7| jj| j_td| jj d | j | j}|di || _d S )N)	r   r   r   r   r   target_workerr   r   r   z Using draft model load_format: ''r   )r   is_nonedraft_workerdictr   r   r   r   r   r  r   r   r   speculative_draft_load_formatload_formatloggerinfocreate_worker)r   draft_worker_kwargsDraftWorkerClassr   r   r   maybe_init_draft_worker  s,   
z!Scheduler.maybe_init_draft_workerc                 C   s  |    |   | j r| j| _n| j| _| j \| _| _	| _
| _| _| _| _| _| _}}}t jd u rAt| j
| j dt _t | _| jj| _t | _| jj| _t | _| jj| _t | _ t! | _"| j#j$rk| jn| j| _%| j%j| _&| j' | _(t)| j | j*dkrt+| j| j,dd}t-.d| j d| j#j/ d| j	 d| j
 d	| j0j1 d
| jdkrdnd d|dd | j2rt3| dr| j45| j6| j| j6  d S d S d S )N   r   F)empty_cachezmax_total_num_tokens=z, chunked_prefill_size=z, max_prefill_tokens=z, max_running_requests=z, context_len=z, r   available_cpu_memavailable_gpu_mem=z.2fz GBmetrics_collector)7rP  r]  r   rS  r  model_workerrT  get_worker_infomax_total_num_tokensmax_prefill_tokensmax_running_requestsmax_queued_requestsmax_req_lenmax_req_input_lenrandom_seedr   forward_streamr   pp_max_micro_batch_sizemaxr   r!   tp_group	cpu_grouptp_cpu_groupr(   attn_tp_groupattn_tp_cpu_groupr'   attn_cp_groupattn_cp_cpu_groupr   pp_groupr    world_groupr   r   dp_tp_groupdp_tp_cpu_groupget_pad_input_ids_funcpad_input_ids_funcr   r   r   r   rX  rY  chunked_prefill_sizer  context_lenr   rL  rc  emit_cache_config_infor   )r   _	avail_memr   r   r   r  3  s|   




	




	zScheduler.init_model_workerc                 C   s6  | j }| jj| _| jjjd up| jjjd u| _d | _| jr+| jj| _| j \| _	| _
| j \| _| _t|j| j| j| j| j | j jrH| jn| j|j| j| j| | j| j|j| jd}|jd ur|jr| jsuddlm} ||| _nzddlm } ||| _nnt!j"# rddl$m%} t&'d |||d| _nV| j(rddl)m*} |||d| _| j+| jj,j- n<| jrdd	l.m/} ||d
| _n,| jrddl0m1} ||| _n|j2rddl3m4}	 |	|| j5| j6| j7| j8d| _nt9|| _|j:dkr	|j;r	t<| j| j|j=| j| j d| _>nd | _>t!j?# }
t@|
d d  d S )N)disablereq_to_token_pooltoken_to_kv_pool_allocatorr   is_eagletp_cache_groupeviction_policyr   r   enable_mamba_extra_bufferr   r   r}  sliding_window_sizer   )
ChunkCache)SWAChunkCache)RadixCacheCppz1Using experimental C++ radix tree implementation.)paramsr   )HiRadixCache)SWARadixCache)r  )MambaRadixCache)LMCRadixCache)r  r  r   rankrp  decode)r  r  rp  
tree_cacher   i   )Ar   r  is_hybrid_swar  hybrid_gdn_configmamba2_configis_hybrid_ssmr  get_tokens_per_layer_infofull_tokens_per_layerswa_tokens_per_layerget_memory_poolr  r  r   disable_radix_cacher   r   r  r   rt  rr  radix_eviction_policyr   r   r  r   r   r}   sglang.srt.mem_cache.chunk_cacher  r  r  r#   "SGLANG_EXPERIMENTAL_CPP_RADIX_TREEr   $sglang.srt.mem_cache.radix_cache_cppr  rX  rY  r   "sglang.srt.mem_cache.hiradix_cacher  'register_hicache_layer_transfer_countercache_controllerlayer_done_counter$sglang.srt.mem_cache.swa_radix_cacher  &sglang.srt.mem_cache.mamba_radix_cacher  enable_lmcache4sglang.srt.mem_cache.storage.lmcache.lmc_radix_cacher  r  r   r   rp  r   disaggregation_mode,disaggregation_decode_enable_offload_kvcacher   r  decode_offload_managerSGLANG_VLM_CACHE_SIZE_MBrh   )r   r   r  r  r  r  r  r  r  r  embedding_cache_sizer   r   r   r
  |  s   










z%Scheduler.init_cache_with_memory_poolc                 C   sN   g | _ tg dd| _d | _d | _d| _d| _d| _d| _i | _	d | _
d| _d S )NFreqsbatch_is_fullr   )waiting_queuerr   running_batch	cur_batch
last_batch
forward_ctreturn_health_check_ctnum_retracted_reqsnum_paused_reqssessionsforward_sleep_time_engine_pausedr   r   r   r   r    s   
zScheduler.init_running_statusc              
   C   s   | j j| _| jdkrd | _d | _| jd uo| j j| _| j jo"| jdk| _| jrNz|   W d S  tyM } zt	
d| d d| _W Y d }~d S d }~ww d S )Nr   r^  z6[PP Dynamic Chunk] Failed to profile prefill latency: z$. Dynamic chunking will be disabled.F)r   r}  chunked_reqenable_mixed_chunkis_mixed_chunkenable_dynamic_chunkingr   profile_and_init_predictor	ExceptionrX  warning)r   er   r   r   r    s(   



zScheduler.init_chunked_prefillc              	   C   s   t | j| j| j| j| j| _d | _| jj	r/t
| j| j| j| j| jr$| jnd | jj| jjd| _| j| _ttj | jj d| _t| jtj  d| _| j| j tj  | _| j| _d S )N)r   r   rq  r   rc  max_delay_passestoken_usage_low_watermarkg      ?)ru   r   r  r   r   r   policyprefill_delayerr   enable_prefill_delayerrk   r   r   rr  r   rc   prefill_delayer_max_delay_passes)prefill_delayer_token_usage_low_watermarktry_preemptionminr#   SGLANG_INIT_NEW_TOKEN_RATIOr   schedule_conservativenessinit_new_token_ratio!SGLANG_MIN_NEW_TOKEN_RATIO_FACTORmin_new_token_ratio"SGLANG_NEW_TOKEN_RATIO_DECAY_STEPSnew_token_ratio_decaynew_token_ratior   r   r   r   r    sB   
zScheduler.init_schedule_policyc                 C   s&   |j  }d urt| |dd| _d S d S )NT)watchdog_timeoutsoft)soft_watchdog_timeoutr   soft_watchdog)r   r   xr   r   r   r   9  s
   zScheduler.init_soft_watchdogc                 C   sp   t | | jjd| _tj| jjd| _t | _	t
| j| _tdr)t| jdkdnd | _tj r6t  d S d S )N)r  )enable!SGLANG_ENABLE_COLOCATED_BATCH_GENr   )noop)r   r   r  watchdogr   createenable_memory_savermemory_saver_adaptersetoffload_tagsr~   maybe_createrecv_skipperr   rw   r   input_blockerr#   SGLANG_LOG_GCr   r   r   r   r   r   r  ?  s   

z3Scheduler.init_watch_dog_memory_saver_input_blockerc                 C   s  t | jj| _t| jj| _| jd u s| j rd }n)| j	 r;| j
r;| jjr/| jjjd }n| jjj}|j}|j}n	| jjj}| jj}| jt jkr| jjd }t|| _t|| j r_|jnd| j rh|jntj| j  d| _t | j!| j| j"| j| | j#d| _$t%di d| jd| jd|d	| jd
| jd| d| j$d| j#d| j!d| j"d| j&d| jj'd| j(d| jj)d| j*d| jj+d| j,d| jj-d| j| _.n| jt j/kra| j0d }t|| _t|| j s| j1 r|jnd| j s| j1 r	|jntj| j  d| _t2di d| j d|d	| jd
| jd| j"d| j&d| j(d| jj)d| j!d| j*d| jj3d| jj4d| d| j,d| j5d| j| _6g | _7| jj8r| jj9dkrt:| j| jj;| j,| j"| j<| d| _=d S d S d S ) Nr   r     )hidden_sizehidden_states_dtypecustom_mem_pool)
gloo_group$req_to_metadata_buffer_idx_allocatorr   metadata_buffers	schedulerr  r  r  draft_token_to_kv_poolr  r  r  transfer_queuer  r  r   r   r   r   bootstrap_portrf  prefill_pp_sizer   num_reserved_decode_tokenstransfer_backendtoken_to_kv_pooldecode_tp_sizedecode_dp_sizer   zmq_to_scheduler)rK  r   r   rp  r  r   )>r   r   r  r   disaggregation_transfer_backendr  rT  r   is_ngramsupports_spec_v2r   enable_multi_layer_eagledraft_runner_listdraft_runnerr  r  r  DECODEr  sizer   r  r   r  r  dtyper   float32r  get_kvcachemaybe_get_custom_mem_pooldisagg_metadata_buffersr   rt  r   r  disagg_decode_transfer_queuer   r   r   r   disaggregation_bootstrap_portrf  disaggregation_prefill_ppr   r  disagg_decode_prealloc_queuePREFILLrh  is_standaloner   disaggregation_decode_tpdisaggregation_decode_dpr   disagg_prefill_bootstrap_queuedisagg_prefill_inflight_queuelanguage_onlyencoder_transfer_backendr   rK  rp  mm_receiver)r   r  r  r  buffer_sizer   r   r   r  W  s   


	



	
zScheduler.init_disaggregationc                 C   s   t | j| _| j | _| jdkrdd | j_| j| j| _	| j
 | _| j| j| _| js6d | _d S t| j| j| jj| j| j| _d gd | _d| _d S )Nr   c                   S      d S r  r   r   r   r   r   <lambda>  s    z(Scheduler.init_overlap.<locals>.<lambda>r  r   )r   r   r   device_modulecurrent_streamdefault_streamsynchronizestreamrm  forward_stream_ctxr   copy_streamcopy_stream_ctxr   
future_maprj   rh  r}  r  r~  r   batch_record_bufbatch_record_ctr   r   r   r   r    s.   

zScheduler.init_overlapc                 C   sN   | j js	d| _dS ddd}|| j jd\}}|r"t||| _dS d| _dS )zRInitialize deterministic inference configuration for different attention backends.N))SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE   )+SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZEr#  )
flashinfertriton)NN)r   enable_deterministic_inferencetruncation_align_sizer   attention_backendr   )r   backend_sizesenv_vardefault_sizer   r   r   r    s   z-Scheduler.init_deterministic_inference_configc                 C   s  t g t| jft| jft| jft| jft	| j
ft| jft| jft| jft| jft| jft| jft| jft| jft| jft| jft| j ft!| j"ft#| j$ft%| j&ft'| j(ft)| j*ft+| j,ft-| j.ft/| j0ft1| j2ft3| j4ft5| j6ft7| j8ft9| j:ft;| j<ft=| j>ft?| j@ftA| jBftC| jDftE| jFftG| jHftI| jJftK| jLf| _Md S r  )Nr   ra   handle_generate_requestr`   handle_embedding_requestr4   handle_batch_generate_requestr3   handle_batch_embedding_requestrB   flush_cache_wrappedr6   clear_hicache_storage_wrappedr/   attach_hicache_storage_wrappedr;   detach_hicache_storage_wrappedr-   abort_requestrR   open_sessionr8   close_sessionrd   update_weights_from_diskrM   init_weights_update_groupr:   destroy_weights_update_grouprK   +init_weights_send_group_for_remote_instancerZ   send_weights_to_remote_instancere   update_weights_from_distributedrg   update_weights_from_tensorrf   update_weights_from_ipcrI   get_weights_by_namerV   release_memory_occupationrW   resume_memory_occupationr5   check_weightsr^   	slow_downrU   profilerD   handle_freeze_gcrE   get_internal_stater\   set_internal_staterX   handle_rpc_requestr?   expert_distribution_handlerP   load_lora_adapterrN   load_lora_adapter_from_tensorsrb   unload_lora_adapterrG   get_loadrH   	get_loadsrT   pause_generationr9   continue_generationr=   handle_dumper_control_request_dispatcherr   r   r   r   r    s   	
 !"#$%&'(*-./012
z!Scheduler.init_request_dispatcherc                 C   st   t j }|dkrd S | j rd S t | }| jjD ]}| s7d|j	j
  k r.|k r7n qtdtj|_qd S )Nr   z Request running timeout reached.)r#   SGLANG_REQ_RUNNING_TIMEOUTr   r  is_emptyr  perf_counterr  finished
time_statsforward_entry_timerm   r   SERVICE_UNAVAILABLE	to_finish)r   	timeout_sdeadlinereqr   r   r   _abort_on_running_timeoutC  s   

$z#Scheduler._abort_on_running_timeoutc                 C   sh   	 |   }| | | jrq |  }|| _|r#| |}| || n|   || _t	j
 r3|   q)zA normal scheduler loop.)recv_requestsprocess_input_requestsr  get_next_batch_to_runr  	run_batchprocess_batch_resultself_check_during_idler  r#   *SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_BUSYr   self_check_during_busy)r   	recv_reqsbatchresultr   r   r   event_loop_normalS  s   


zScheduler.event_loop_normalc                    s   t   _ fdd}	   } |  jrq
  }| _ |}|r)|  |r; |} j	|
 |f nd} jrF|sE|  n|du rN    jrV | | _tj rb   q)zFA scheduler loop that overlaps the CPU processing and GPU computation.c                     s    j  \} } | | d S r  )result_queuepopleftrd  )	tmp_batch
tmp_resultr   r   r   pop_and_processu  s   z5Scheduler.event_loop_overlap.<locals>.pop_and_processTN)r   rl  r`  ra  r  rb  r  is_disable_overlap_for_batchrc  appendcopyr  re  r<  launch_batch_sample_if_neededr#   rf  r   rg  )r   rp  rh  ri  disable_overlap_for_batchbatch_resultr   r   r   event_loop_overlapn  s:   




zScheduler.event_loop_overlapri  c                 C   sZ   t j o|o|j o| jo| jj }|o(|jo(|jo(|j o(t	| j
dk}|p,|S Nr   )r#   *SGLANG_DISABLE_CONSECUTIVE_PREFILL_OVERLAPr   forward_mode	is_extendr  
is_spec_v2has_grammar	is_decoder   rl  )r   ri  ru  need_grammar_syncr   r   r   rq    s&   

z&Scheduler.is_disable_overlap_for_batchnum_recv_reqsc                 C   s   | j dk rdS || j kS )Nr   F)r   )r   r  r   r   r   recv_limit_reached  s   

zScheduler.recv_limit_reachedc                 C   s  | j dur| jdur| jjnd}| j |sg S | jdkry| jdkrv| jdkrvg }	 z| t|r4W n| j	
tj}t|}W n
 tjyJ   Y nw || q*	 z| t|r\W n| j
tj}W n
 tjyn   Y nw || qRn4d}n1| jdkr| jdkr| j| j }tg | j| j | | jj| jd | j | | j| j | }nd}| jdur| j|}| jjr| jdkr| jdkr| |\}}nd}d}| jdkrt|| jj| j| jjd d}| j dkrt|| j!j| j"| j!jd d}| jdkrt|| j#j| j$| j#jd d}|| }n| jdkr't|| j#j| j$| j#jd d}| jdkrd| jj%rd| jj&dkrd| j'(|\}}|D ] \}	}
}|dkrPt)j*nt)j+}t,|	|
|d | -|	g|	j. qC| j/r|D ]}	t0|	t1t2frt3|	j4|	j5 t6d	|	j4dd
 qj|S )zFReceive results at tp_rank = 0 and broadcast it to all other TP ranks.Nr   Tr^  )srcr  i  status_code )	anonymous)7r  r  rz  handler   r   r   r  r   r%  
recv_pyobjr   NOBLOCKri   ZMQErrorrr  r(  r   r   r   r   rx  rq  r  r   r   _split_work_and_control_reqsr   rs  r  rt  ranksr   ru  rv  rp  rr  r  r  r  process_waiting_requestsr   BAD_REQUESTINTERNAL_SERVER_ERRORr   stream_outputreturn_logprobr   r   ra   r`   r   ridtrace_contextr   )r   last_forward_moderh  recv_reqrecv_rpc	dp_offset	work_reqscontrol_reqs
abort_reqsr^  	error_msg
error_coder  r   r   r   r`    s   




	








	
zScheduler.recv_requestsrh  c                 C   s$   dd |D }dd |D }||fS )Nc                 S   s"   g | ]}t |ttttfr|qS r   r   ra   r`   r4   r3   r   r^  r   r   r   r   8      z:Scheduler._split_work_and_control_reqs.<locals>.<listcomp>c                 S   s"   g | ]}t |ttttfs|qS r   r  r  r   r   r   r   E  r  r   )r   rh  r  r  r   r   r   r  7  s   z&Scheduler._split_work_and_control_reqsc                 C   s   |D ]E}t |r&| jd us| j s| j rt| jdkr&|  jd7  _q| 	|}|d urGt
|ts<| j|| q| jd urG| j| qd S )Nr   r^  )is_health_check_generate_reqr  dllm_managerany_staging_reqsr  rU  r   r  r  rS  r   rY   r-  send_outputr(  
send_pyobj)r   rh  r  outputr   r   r   ra  T  s$   



z Scheduler.process_input_requestsc                 C   s6   t |jjd ur|jjnd| jt|j d |j_d S )Ni   @r^  )r  sampling_paramsmax_new_tokensrj  r   origin_input_idsr   r^  r   r   r   init_req_max_new_tokensi  s   z!Scheduler.init_req_max_new_tokensraw_mm_inputsc              
   C   s  |du rdS d}zt j r t j r | jdur t jj| jd}W n ty< } ztd| d W Y d}~nd}~ww | j	j
dkrat|}|dkr_|g}t jj|| j	j| jd |d }|S |dkrzdg}t jj|| j	j| jd |d }|S t|}|S )a  Materialize MultimodalInputs once on the entry rank and broadcast to others.

        Entry rank:
        - constructs MultimodalInputs.from_dict(raw_mm_inputs) once
        - broadcasts to other ranks in self.cpu_group (if world_size > 1)

        Non-entry ranks:
        - receive the object via broadcast (if world_size > 1)
        - otherwise (single-rank / no group) fall back to local from_dict

        Returns:
            MultimodalInputs | None
        Nr^  )groupz4Failed to get world size in mm_inputs handling with z, fallback to 1.r   )r  r  )r   distributedis_availableis_initializedrz  get_world_sizer  rX  r  ry  rank_in_groupro   	from_dictbroadcast_object_list
first_rank)r   r  group_world_sizer  image_inputsobj_listr   r   r    _process_and_broadcast_mm_inputss  sR   



z*Scheduler._process_and_broadcast_mm_inputsmm_inputs_dictc                 C   s   | j jr	| |S t|S r  )r   "enable_broadcast_mm_inputs_processr  ro   r  )r   r  r   r   r   _get_multimodal_inputs  s   

z Scheduler._get_multimodal_inputsc                 C   sD   |j D ]}| r|j }sq|jrq|jD ]}d |_qd |_qd S r  )r  rW  multimodal_inputs
session_idmm_itemsfeature)r   ri  r^  	mm_inputsitemr   r   r   _maybe_clear_mm_inputs  s   

z Scheduler._maybe_clear_mm_inputsr  c           
         s(  |j d u s|j jd u s|j j| jvr|jd ur$t|j}dg| }||_|jd u r.| jj|_t	|j
|j|j|jfi d|jd|jd|jd|jd|jd|jd|jd	|jd
|jd|jd| jjd|jd|jd|jd| jd|jd| jjd|jd| jr| jnd d|j d|j!d| j"}| j#|_#| jt$j%kr|jd u rd|j
}t&'| t(||t)j*d | +|g|j d S |j d ur|j jd ur|,d|j j d | -| | .| d S n$| j|j j }|/|| j#| jj}t0|j1t2r| -| | .| d S |j3d ur| 4|j3}t|jt|j5k rN|j j| jv s.J t|j5t|j  |j6D ]}|j7rL fdd|j7D |_7q;| 8|j5||_5|9| t|j5| j:kr|j,dt|j;dt|j5 d | j: d!d" | -| | .| d S | -| t<|| j:| jj=}|r|,| | .| d S |js|j>d#krd#|_>|j>d#kr|jr|jd u rt|j5d |_>n|j?rt|j5|_>nd#|_>n|j>|_>|j>t|j5krd$|j>d%t|j5d&}d#|_>|,| | .| d S | j@A|}	|	s| .| d S d S )'Nr^  r  top_logprobs_numtoken_ids_logprobr  lora_idinput_embedscustom_logit_processorrequire_reasoningreturn_hidden_statesreturn_routed_expertseos_token_idsbootstrap_hostr  bootstrap_roomdisagg_modedata_parallel_rank
vocab_sizepriorityrc  routing_keyhttp_worker_ipcdllm_configzSInvalid request: Disaggregated request received without bootstrap room id. req.rid=r  zInvalid request: session id z does not existc                    s    g | ]\}}|  |  fqS r   r   )r   startend
prefix_lenr   r   r   *  s    z5Scheduler.handle_generate_request.<locals>.<listcomp>tMultimodal prompt is too long after expanding multimodal tokens. After expanding len(req.origin_input_ids_unpadded)= =>  >= .r  zreq.logprob_start_len=zE is higher than the number of input tokens len(req.origin_input_ids)=z). Please use a smaller logprob_start_len.)Bsession_paramsidr  r  r   	input_idsr  r   r  rp   r  
input_textr  r  r  r  r  r  r  r  r  r  r  hf_eos_token_idr  r  r  r  r  r  r   rc  r  r  r  r=  r   NULLrX  errorr   r   r  r  set_finish_with_abortr  _add_request_to_queue
create_reqr   finished_reasonrm   r  r  r  r  offsetsr|  extend_image_inputsrk  origin_input_ids_unpaddedr   allow_auto_truncatelogprob_start_lenis_prefill_onlyr  process_req_with_grammar)
r   r  
seq_lengthfake_input_idsr^  r  sessionr  mm_itemadded_to_grammar_queuer   r  r   r-    s"  





	

















z!Scheduler.handle_generate_requestc                 C   .   t dt| d |D ]}| | qdS )z(Handle optimized batch generate request.z'Processing batch generate request with 	 requestsN)rX  debugr   r-  r   r  tokenized_reqr   r   r   r/  m  s   z'Scheduler.handle_batch_generate_requestr^  c                 C   s   | j r=|| j |jjr?|j }t|j|j	 }|j
|d  }| jjr-|j|jjnd }| j|j|j||| d S d S d S r  )r   init_next_round_inputr  	last_nodebackupedlast_host_nodeget_last_hash_valuer   prefix_indiceshost_hit_lengthfill_ids hicache_storage_pass_prefix_keysget_prefix_hash_valuesparentprefetch_from_storager  )r   r^  	last_hashmatched_lennew_input_tokensprefix_keysr   r   r   _prefetch_kvcachex  s&   
zScheduler._prefetch_kvcacheFis_retractedc                 C   s   | j tjkr0| |sd S | |rd S | | | j| t	 |j
_ttj|jdd d S | j tjkrL| | | j|| jj t	 |j
_d S | j tjkrf| jj||d |sdt	 |j
_d S d S td| j )NT)auto_next_anonr  z!Invalid self.disaggregation_mode=)r  r   r  _set_or_validate_priority_abort_on_queued_limitr  r  rr  r  rV  rX  wait_queue_entry_timer   rq   REQUEST_PROCESSr  r  r  addr  num_key_value_heads"prefill_bootstrap_queue_entry_timer   r
   decode_prealloc_queue_entry_time
ValueError)r   r^  r  r   r   r   r    s*   



zScheduler._add_request_to_queuec                 C   sx   | j r|jdu r| jrtj|_dS tj d |_dS | j s:|jdur:| jr:tdtjdd|j	d}| j
|| dS dS )	z[Set the default priority value, or abort the request based on the priority scheduling mode.Nr^  abortzYUsing priority is disabled for this server. Please send a new request without a priority.typer  messager  r  FT)r   r  r   sysmaxsizer   r-   r   rZ  r  r-  r  )r   r^  	abort_reqr   r   r   r    s*   
z#Scheduler._set_or_validate_priorityc                    s   | j du st| jd | j krdS |}d}| jr[| jrdnd  fdd}tt| j|d\}} |j  |j k }|r[| jrG| j	
|j n
| jrQ| j	|j | j| |}d	}| jtd
tj|d|jd| |j|jkS )ztAbort an incoming or existing request if the waiting queue is full. Returns True if the incoming request is aborted.Nr^  FzThe request queue is full.r  c                    s    | d j  | d jjfS Nr^  )r  rX  r  )r  	directionr   r   r    s   
z2Scheduler._abort_on_queued_limit.<locals>.<lambda>)keyz4The request is aborted by a higher priority request.r  r  r  )ri  r   r  r   r   ro  	enumerater  r   r  release_aborted_requestr  r   terminate_prefetchpopr-  r  r-   r   rZ  )r   r  req_to_abortr  key_fnidxcandidate_reqabort_existing_reqr   r!  r   r    s<   
z Scheduler._abort_on_queued_limitc                    s   t j  }dkrd S t  t | }| jD ]2}|jj}d|  k r'|k rIn q| j	r3| j
|j | jtdtjdd|jd|  | q rY fdd| jD | _d S d S )Nr   r  z Request waiting timeout reached.r  r  c                    s   g | ]}| vr|qS r   r   r  deleted_reqsr   r   r     s    z7Scheduler._abort_on_waiting_timeout.<locals>.<listcomp>)r#   SGLANG_REQ_WAITING_TIMEOUTr   r  r  rV  r  rX  r  r   r  r%  r  r-  r  r-   r   rZ  r  )r   r\  r]  r^  
entry_timer   r-  r   _abort_on_waiting_timeout  s4   


z#Scheduler._abort_on_waiting_timeoutc                 C   s   t |j|j|j|j|j|j|j|j|j	d	}| j
|_
|jd ur[| |j}| jr0| |j||_|| t|j| jkr[|jdt|jdt|j d| j dd | | d S t|| j| jj}|rm| | d S d|_| | d S )N)token_type_idsr  
dimensionsr  r  r  r  r  r  r  r  )rp   r  r  r  r  r2  r  r3  r  r  r=  r  r  r|  r  r  r   rk  r  r  r  r   r   r  r  )r   r  r^  r  r  r   r   r   r.    sR   



z"Scheduler.handle_embedding_requestc                 C   r  )z)Handle optimized batch embedding request.z(Processing batch embedding request with r  N)rX  r  r   r.  r  r   r   r   r0  C  s   z(Scheduler.handle_batch_embedding_requestc                 C   s   | j j|dd d S )NT)chunked)r  cache_unfinished_reqr  r   r   r   stash_chunked_requestP  s   zScheduler.stash_chunked_requestc                 C   s  |    |   | jd ur| j  t }| jd ur2| j r2|| jj | jjD ]}| 	| q*| j
d urC|| j
 | 	| j
 | jr| jj r| jj
d urY|| jj
 | jd uri| jjri|| jj | j }| jjt|d | j |k rd| j_| j s| jjs| j r| j| _n| j| j | jd ur|  }n|  }| j}|r| j s| |}|d u }|d ur|}n| j s| | j| _| j s| jnd }nd }| j||d}|rtd|j |S )Nchunked_req_to_excludeF)	need_syncschedule) r1  r_  r  r  filter_finished_reqsr  r  updatestaging_queuer6  r  r  r  rz  r{  r  
batch_sizefilter_batchr   r  r  rU  r  merge_batchget_new_batch_dllmget_new_batch_prefillr   r   rS  maybe_prepare_mlp_sync_batchupdate_running_batchr   )r   r8  r^  last_bs	new_batchneed_mlp_syncretr   r   r   rb  S  sX   









zScheduler.get_next_batch_to_runc                 C   s*   t  j| }| jdkrt|| j }|S r   )r   rn  r   r  r  available_size)r   
running_bsresr   r   r   get_num_allocatable_reqs  s   
z"Scheduler.get_num_allocatable_reqsc                 C   sN   d }| j r|  \}}}}t| j |d}| j|d}| j r%|j|d ud |S )N)token_usage)prefill_delayer_single_pass)actual_prefill)r  _get_token_inforl   _get_new_batch_prefill_rawfinalize)r   rN  r  rM  rH  r   r   r   rB    s   zScheduler.get_new_batch_prefillrN  c                    s  | j  r| j  }|D ]}| | q| jrd| j_| jjs&t| jdkr-| j	d u r-d S t| jj
}| |dkrH| j	d urH| jsHd| j_d S | jrP| j  | j| j| j tra|tkrad S | j}| j	d ur}| jr}t| j	j}| |}|d ur}|}t| j| j| j| j| j| j|| jr|nd| j| jj|| j d}| j	d ur| j	!  |"| j	| _	| j#rdd | jj
D }	| jD ]}| j#r|j$|	vr| j%r| j&'|j$|	}
|
sqn|j$h|	B }| j(j)j*+|sqt| jj
}t|j,| |krd| j_| j-t.j/krt|j,| j01 krd| j_| jjr!| jr|2|| js! n[| j3r8| j4|j5}|s0q| j6|j5|_7|!| j |j8|| j	d u| j9d}
| j#rT|	:|j$ |
t;j<kr{|
t;j=kry| jrut|j,dkpq| j>  | j_nd| j_ nq|j, t dkrd S | j?r D ]	}|@tAjB q fdd	| jD | _|jCr|jCD ]}| | q|jDd ur| j	d u sJ |jD| _	| j	d ur| j	 jEd
7  _E|| _F | _,t| jj
| _G D ]}|jHjIdkrtJK |jH_I| j?r| jLM|jHN  qtOjP | j0| j| j| jQ| jR| jS| j	d}| jr| jT |_U|V  tW|jX|jY|jt| jj
t d|_Z| jrp| j> sp|j[sp| jj[sp| jj\dd | j> se| j]  |^| j | jj
|__tOg | jjd| _|S d |__|S )NFr   T)prefill_max_requestsrN  r  c                 S   s   h | ]}|j qS r   )r  r  r   r   r   	<setcomp>  s    z7Scheduler._get_new_batch_prefill_raw.<locals>.<setcomp>)has_chunked_reqr(  c                    s   g | ]
}|t  vr|qS r   )r  r   r  can_run_listr   r   r   N  s    z8Scheduler._get_new_batch_prefill_raw.<locals>.<listcomp>r^  )r  )log_input_tokenslog_hit_tokensr  rJ  num_new_seqsv1_spec_info_filteredr  )`r  has_waiting_grammarsget_ready_grammar_requestsr  r  r  r  r   r  r  r  rL  r   r  check_hicache_eventsr  calc_priorityTEST_RETRACTTEST_RETRACT_NO_PREFILL_BSr}  r  r   predict_next_chunk_sizert   r   r  r  rg  r  r   r   rS  r  r  add_chunked_reqr   r  r   r  try_overlap_load_lorar  r  r  validate_lora_batchrX  r  r   r  r  rI  preempt_to_scheduler   check_prefetch_progressr  pop_prefetch_loaded_tokensstorage_hit_lengthadd_one_reqr(  r  rs   CONTINUENO_TOKENrU  r   add_latencyrq   PREFILL_WAITINGpreempt_listnew_chunked_req
is_chunkedadderrJ  rX  rY  r  rV  rc  observe_queue_timeget_queueing_timerr   init_newr  r   r   ready_to_load_host_cachehicache_consumer_indexprepare_for_extendry   rY  rZ  prefill_statsr  r?  prepare_for_decodemix_with_runningdecoding_reqs)r   rN  ready_grammar_requestsr^  rJ  r}  history_lendynamic_sizert  running_lorasrK  new_lora_setprefetch_donerF  r   rW  r   rQ    sR  


















z$Scheduler._get_new_batch_prefill_rawc                 C   s  |  }|jdd | rd|_|S |   }s#tr| jt dkr| j	 }| j
}|| j\}}}| j	 }	|	| }
t|| _| jrdt|dkrd| jjt|tdd |D tdd |D d || _
|D ]}|j}| jt|j|jd	| qi|rd
nd}dt| d|
 }|r|d|dd|d7 }t||  |D ]	}| j|dd qnt| j
| j | j| _
|  |k rd|_|  |S )z*Update the current running decoding batch.Tr\  Fr   c                 s       | ]}t |jV  qd S r  )r   r  r   rr   r   r   	<genexpr>      

z1Scheduler.update_running_batch.<locals>.<genexpr>c                 s   r  r  )r   
output_idsr  r   r   r   r    r  )r  num_retracted_input_tokensnum_retracted_output_tokens)abort_messager  z)KV cache pool is full. Retract requests. zTesting retraction. z#retracted_reqs: z, #new_tokens_gained: z, #new_token_ratio: z.4fz -> r  ) r>  r?  rU  r  check_decode_memrb  r  TEST_RETRACT_INTERVALr  rI  r  retract_decoder   r   r  r   rc  increment_retracted_reqssumr[  r-  r  r-   r  r  rX  r  r  ro  r  r  r|  )r   ri  
initial_bskv_full_retract_flagold_available_tokens	old_ratioretracted_reqsr  reqs_to_abortnew_available_tokensnew_token_gainedr^  abort_reason
msg_prefixmsg_detailsr   r   r   rD    sl   



	
zScheduler.update_running_batchmodel_worker_batchc                 C   s    | j d d | _ || j| j < d S )Nr^  r  )r!  r   )r   r  r   r   r   record_batch_in_overlap  s   z!Scheduler.record_batch_in_overlappp_proxy_tensorsc              	   C   s  |  j d7  _ | | | jdur!td| j d t| j |jtj	kr5t
 }|jD ]}||j_q.|j r?| |S | jrC| j sK| jrP| }n|}| jr|}| | |j |_t|j}| j|}| jM | j| j | j | | !| | j"#|}	W d   n1 sw   Y  | j$% |	_&|	j'du r| j(||	 |	j)|j*d n||	_+W d   n1 sw   Y  |j, }
|j-r|	j.|_/||j/_+|	j.j0|_nF| j1r|j2 r| j34|}	|	j5}
n4| j rd|ini }| !| | j"j#|fi |}	W d   n	1 sw   Y  |	j5}
| 6||	 |
|_7|j*r:dd |jD |	_8d	d |jD |	_9nd|	_8d|	_9|	}nC| }| jr{| | | j | j| j | j3:|}t;|d
}|)  W d   n	1 suw   Y  n| j3:|}t;|d
}|jtj	krt
 }|jD ]}||j_<q| j=j>r| j=j?dkr| j@jAB C D }| j@jEB D }||M }|F| jGdjHdd}| jIJtK|L d |S )zRun a batch.r^  NzScheduler.run_batch sleep sr  r  c                 S      g | ]}|j qS r   )extend_input_lenr  r   r   r   r   P	      z'Scheduler.run_batch.<locals>.<listcomp>c                 S   r  r   )extend_logprob_start_lenr  r   r   r   r   S	  r  )r   mooncaker  )axis)status)Mr  _profile_batch_predicater  rX  rY  r  r	  rz  r   EXTENDrV  r  rX  prefill_start_time_hostis_prebuilt_run_batch_prebuiltr<  r   rS  r   get_model_worker_batchr  sampling_infocopy_for_forwardr   seq_lensr  alloc_future_indicesr  rm  wait_streamr  resolve_futurerecord_forward_metricsrd  forward_batch_generationr  r   r   delay_sample_funcstore_to_mapr   r  future_indicesindicesr|  next_draft_input	spec_infonew_seq_lensr   is_split_prefillr  forward_batch_split_prefillnext_token_idsupdate_cache_from_schedulerr  extend_input_len_per_req extend_logprob_start_len_per_reqforward_batch_embeddingr   prefill_end_time_hostr   r   elastic_ep_backendrp  active_ranksdetachr   numpyactive_ranks_cpureshaper   prodr-  r  r.   tolist)r   ri  r  current_timer^  worker_batch_or_batchr  bsr  rv   future_indices_or_next_token_idskwargsrH  r   tp_active_rankstp_active_ranks_cpudp_active_ranksr   r   r   rc    s   










	






zScheduler.run_batchrv  c                 C   s   |d u s	|j d u rd S | j* | j| j |  }||u s J | j|j| |j| j	j
d W d    d S 1 s;w   Y  d S )Nr  )r  r  rm  r  r  r  r  r  r   r  r  )r   rv  _batch_resultr   r   r   rt  	  s   "z'Scheduler.launch_batch_sample_if_neededrj  c                 C   s   |j  r| || ttj|j n-|j  r*| r#| 	|| n| 
|| n|j  r5| | n|j  r@| || | || | | |   d S r  )rz  r~  process_batch_result_decoder   rq   DECODE_LOOPr  r{  is_dllmprocess_batch_result_dllmprocess_batch_result_prefillr  process_batch_result_prebuiltis_idleprocess_batch_result_idlelog_batch_result_statsr  maybe_send_health_check_signal)r   ri  rj  r   r   r   rd  	  s   




zScheduler.process_batch_resultc                 C   s*   | j r|  j d8  _ | jt  d S d S r   )r  r-  r  rJ   r   r   r   r   r  	  s   z(Scheduler.maybe_send_health_check_signalc                 C   s   |   }t|dS )Nsuccess)flush_cacherC   )r   r  r  r   r   r   r1  	  s   
zScheduler.flush_cache_wrappedc                 C   s8   | j r| j  td d}ntd d}t|dS )Nz(Hierarchical cache cleared successfully!T"Hierarchical cache is not enabled.Fr  )r   r  clear_storage_backendrX  rY  loggingr  r7   )r   r  
if_successr   r   r   r2  	  s   



z'Scheduler.clear_hicache_storage_wrappedc                 C   s6   |   sdS t| jdkrdS t| jjdkrdS dS )zStricter idle check for storage attach/detach.

        We require:
        - no running batches (including overlap/pp/disagg paths) via `_is_no_request()`
        - no queued requests in scheduler queues (waiting/grammar/disagg queues)
        Fr   T)_is_no_requestr   r  r  grammar_queuer   r   r   r   _is_idle_for_hicache_storage_op	  s   z)Scheduler._is_idle_for_hicache_storage_opc              
   C   s0  | j s	tdddS |  s tddt| j dt| jj dS t| jds,tdddS z| jj	|j
|j| jj|j|jd\}}W n ty` } ztd	 tdt|dW  Y d }~S d }~ww |rd
| _|j
| j_
|jd uru|j| j_|jd ur|j| j_|jd ur|j| j_td|j
  t||dS )NFr  r  r  z1Reject attach: scheduler is not idle. #queue-req= #running-req=attach_storage_backendzBCurrent tree_cache implementation does not support dynamic attach.)storage_backend!storage_backend_extra_config_jsonserved_model_namehicache_storage_prefetch_policyhicache_write_policyz5Attach HiCache storage backend failed with exception.Tz"Attached HiCache storage backend: )r   r0   r  r   r  r  r  rL  r  r  r   )hicache_storage_backend_extra_config_jsonr   r  r  r  r  rX  	exceptionstrr   $hicache_storage_backend_extra_configrY  r   r  okmsgr  r   r   r   r3  	  s\   
	






z(Scheduler.attach_hicache_storage_wrappedc              
   C   s   | j s	tdddS |  s tddt| j dt| jj dS t| jds,tdddS z	| j	 \}}W n t
yT } ztd tdt|dW  Y d }~S d }~ww |sZ| jsrd| _d | j_d | j_td	 td
|poddS td|dS )NFr  r  z1Reject detach: scheduler is not idle. #queue-req=r  detach_storage_backendzBCurrent tree_cache implementation does not support dynamic detach.z5Detach HiCache storage backend failed with exception.z!Detached HiCache storage backend.Tz$HiCache storage backend is detached.)r   r<   r  r   r  r  r  rL  r  r  r  rX  r  r  r   r   r   r  rY  r  r   r   r   r4  	  sD   
	


z(Scheduler.detach_hicache_storage_wrappedc                 C   s   | j  o2| jd u p| j o2| jd u p| j o2| j p#t| jdko2| jdkp2tdd | j	D }| j
tjkrJ|t| jjdkoHt| jdkM }| j
tjkrb|t| jjdko`t| jjdkM }|S )Nr   r^  c                 s   s    | ]}|  V  qd S r  )rU  rV  r   r   r   r  1
  s    z+Scheduler._is_no_request.<locals>.<genexpr>)r  rU  r  r  r   r   rl  r   allrunning_mbsr  r   r  r  queuer  r   r
  r  )r   
no_requestr   r   r   r  +
  s(   
zScheduler._is_no_requestc                 C   s   |   r8d| _d| _| j  | j  | j  | j  | 	  | j
r*| j
  tj  td d}|S tdt| j dt| jj  d}|S )z Flush the memory pool and cache.NzCache flushed successfully!TzBCache not flushed because there are pending requests. #queue-req: z, #running-req: F)r  r  r  r  resetr  clearr  r  reset_metricsrT  clear_cache_poolr   r   r_  rX  rY  r  r  r   r  r  r  )r   r  r   r   r   r  ?
  s.   







zScheduler.flush_cachec                 C   s   t t }| j|d< t| jjjdt| j j	dt
| jt| jjjdd|d< | j|d< | j s@| jdkr@| j| j |d< trG| j|d< |d	d  t|d
S )Nlast_gen_throughputr  )weightkvcachetoken_capacitygraphmemory_usage%effective_max_running_requests_per_dpr   avg_spec_accept_lengthstep_time_dictr  )internal_state)varsr   r  roundr  r  weight_load_mem_usager  r  	mem_usageintrf  graph_mem_usagerh  r   rS  spec_total_num_forward_ctspec_total_num_accepted_tokensrx   r  r'  rF   )r   r  rH  r   r   r   rG  Z
  s"   






zScheduler.get_internal_statec              
   C   s  |j }tg d}d}| D ]<\}}||vr$td| d d} n(|dkrK|| j| j ks4|dk rKtd| d| d	| j| j  d
 d} nq|r| j sf| j	dkrf| j
| j	 }td| d | _
| _	| D ]\}}tt || qptdt  tdtt dS )N)rn  #speculative_accept_threshold_single speculative_accept_threshold_accTz	Updating z is not supported.Frn  r^  z to z6 is rejected because it is out of the valid range [1, z].r   zavg_spec_accept_length=z5Global server args updated! get_global_server_args()=)updatedr   )r   r  itemsr  r  rh  r   r   rS  r  r  rX  rY  setattrr   r]   r  )r   r  server_args_dictargs_allow_updater  kvr  r   r   r   rH  t
  s>   
zScheduler.set_internal_statec              
   C   s   t d|j d|j  d}d }zt| |j}|jd ur&|di |j n|  W n$ tyN } zd}|}t d|j dt|  W Y d }~nd }~ww t  t	||sYdS t|S )	Nzhandle_rpc_request: z	, param: TFzFailed to call rpc z: r  r   )
rX  rY  method
parametersrJ  r  r  r  r   rY   )r   r  r  execfuncr  r   r   r   rI  
  s&   
(zScheduler.handle_rpc_requestc                 C   s  g }t | jD ]\}}|js|j|jr|| qt|D ]Q}| j|}| jr1| j	
|j | jt|jd| | jtjkrHt|| j	 | jtjkrTt|| j |jd urg| jtjkrgt|| j	dd td|j q| j| | jtjkr| jjD ] }|js|j|jrtd|j t|jdr|j  q| jD ] }|js|j|jrtd|j t|jdr|j  qn| jtjkrM| j jD ]}|js|j!j|jrtd|j!j |j"  q| j#jD ]}|js|j!j|jrtd	|j!j |j"  q| j j$rMg }| j j$D ]+}|js+|j|jrBt|d
s3J |`%| jt|jd| q|| q|| j _$| j&| j'u sZ| j&d u r_| j'j(}n| j'j(| j&j( }|D ]!}|) s|js||j|jrtd|j t* |_+qid S )N)r  F)	is_insertzAbort queued request. req.rid=z'Abort bootstrap queue request. req.rid=r  z&Abort inflight queue request. req.rid=z1Abort prealloc queue request. decode_req.req.rid=z1Abort transfer queue request. decode_req.req.rid=kv_cache_cpuzAbort running request. req.rid=),r$  r  	abort_allr  
startswithrr  reversedr'  r   r  r%  r-  r  r-   r  r   r   r   r  r   r  mamba_pool_idxrX  r  r  abort_requestsr  r  rL  disagg_kv_senderr  r  r
  r^  kv_receiverr  retracted_queuer$  r  r  r  rW  rm   r[  )r   r  to_delir^  
decode_reqremaining_retractedr  r   r   r   r5  
  s   









zScheduler.abort_requestc                 C   s   t  r  )NotImplementedErrorr   r   r   r   _pause_engine  s   zScheduler._pause_enginec                 C   s   d| _ | jr| jr| j \}}| || | jrB| jj rBt }|j	dkr2| j
d ur2|| j
 | jjt|d | j| j d | _d | _|j	dkrv| jjdd t| jjdkrm| j| j}|D ]}| | qed| j_d | _
d S d S )NTin_placer7  retractr\  r   F)r  r   r  rl  rm  rd  rz  r{  r  moder  r  r?  r   r  r@  r  r   r  retract_allr   r  r  )r   r  rn  ro  r8  r  r^  r   r   r   rP    s0   



zScheduler.pause_generationc                 C   s
   d| _ d S )NF)r  r   r  r   r   r   rQ  7     
zScheduler.continue_generationc                 C      | j |}|S )z=In-place loading a new lora adapter from disk or huggingface.)r  rK  r   r  rj  r   r   r   rK  :     zScheduler.load_lora_adapterc                 C   r9  )z<In-place loading a new lora adapter from serialized tensors.)r  rL  r:  r   r   r   rL  B  r;  z(Scheduler.load_lora_adapter_from_tensorsc                 C   r9  )zUnload the lora adapter.)r  rM  r:  r   r   r   rM  J  r;  zScheduler.unload_lora_adapterc                 C      | j |\}}t||S )z6Init the seed and client instance communication group.)r  r;  rL   r   r  r  r  r   r   r   r;  R  s   
z5Scheduler.init_weights_send_group_for_remote_instancec                 C   r<  )z;Send the seed instance weights to the destination instance.)r  r<  r[   r=  r   r   r   r<  [  s   
z)Scheduler.send_weights_to_remote_instancec                 C   s&   |j }|d ur|dkrd }|| _ t S rx  )r  r_   )r   r  r  r   r   r   rD  b  s
   zScheduler.slow_downc                 C   sb   |j }|tjkrt   t
 S |tjkrt   t
 S |tjkr*t   t
 S t	d|)Nz3Unrecognized ExpertDistributionReq value: recv_req=)actionrA   START_RECORDr$   start_recordSTOP_RECORDstop_recordDUMP_RECORDdump_recordr  r@   )r   r  r>  r   r   r   rJ  i  s   





z$Scheduler.expert_distribution_handlec                 C   sd   |j }|| jv rtd| d t|dS |d u r$td t|dS t|j|| j|< t|dS )Nsession id z already exist, cannot open.Fz session id is None, cannot open.T)r  r  rX  r  rS   r   capacity_of_str_lenr   r  r  r   r   r   r6  u  s   





zScheduler.open_sessionc                 C   s2   |j }|| jvrtd| d d S | j|= d S )NrE  z does not exist, cannot delete.)r  r  rX  r  rG  r   r   r   r7    s   
zScheduler.close_sessionc                 C   s   | j d ur| j   d S d S r  )r"  maybe_sleepr   r   r   r   maybe_sleep_on_idle  s   
zScheduler.maybe_sleep_on_idlec                 C   s   t d | j|| dS )zKHandle freeze_gc request: freeze scheduler's GC and forward to detokenizer.r   N)r   r.  r  r7  r   r   r   rF    s   zScheduler.handle_freeze_gcc              
   C   s   ddl m} z%g }tj rtj dkr|j|j|jd}| j	
td|d| W d S  tyU } ztd| dd | j	
tdg t|d	| W Y d }~d S d }~ww )
Nr   )dumper)r  bodyT)r  responsez)[Scheduler] handle_dumper_control error: )flushF)r  rL  r  )sglang.srt.debug_utils.dumperrJ  r   r  r  get_rank_handle_http_control_requestr  rK  r-  r  r>   r  printr  )r   r  rJ  rL  r  r   r   r   rR    s(   
zScheduler.handle_dumper_controlschedule_batchc                 C   r  r  r   )r   rR  rv  r   r   r   r    s   z%Scheduler.update_cache_from_schedulerc                 C   s
   | j  S r  )r  (get_remote_instance_transfer_engine_infor   r   r   r   rS    r8  z2Scheduler.get_remote_instance_transfer_engine_info)rG  N)Fr  )r   r   r   __doc__r   r   r  r
   r  r   r  r  r  r  rP  r]  r  r
  r  r  r  r   r  r  r  r  r  r_  r   rk  rw  rr   r   rq  r  r	   r   ra   r`   r   r`  r  ra  r  rU  r  r  r  r-  r4   r/  rp   r  r  r  r  r1  r.  r3   r0  r6  rb  rL  rB  rl   rQ  rD  rn   r  r   r   r   rc  rt  rd  r  rB   r1  r6   r2  r  r/   r0   r3  r;   r<   r4  r  r  rE   rG  r\   rH  rX   rI  r-   r5  r   r2  rT   rP  r9   rQ  rP   rQ   rK  rN   rO   rL  rb   rc   rM  rK   r;  rZ   r<  r^   rD  r?   rJ  rR   r6  r8   r7  rI  rD   rF  r=   rR  r  rS  r   r   r   r   r      sJ   	

 /
!In& 8

5
v

D
 %
0
7
O
 cB

 




5
,(c



	

r   c                   @   s    e Zd ZdZdd Zdd ZdS )r0  a  
    In setups which have long inactivity periods it is desirable to reduce
    system power consumption when sglang does nothing. This would lead not only
    to power savings, but also to more CPU thermal headroom when a request
    eventually comes. This is important in cases when multiple GPUs are connected
    as each GPU would otherwise pin one thread at 100% CPU usage.

    The simplest solution is to use zmq.Poller on all sockets that may receive
    data that needs handling immediately.
    c                 C   s>   t  | _t | _|D ]
}| j|t j qtj	 | _
d S r  )r   Pollerpollerr  last_empty_timeregisterPOLLINr#   SGLANG_EMPTY_CACHE_INTERVALr   empty_cache_interval)r   socketsr  r   r   r   r    s
   

zIdleSleeper.__init__c                 C   sJ   | j d | jdkr!t | j | jkr#t | _tj  d S d S d S )Ni  r   )rV  pollr[  r  rW  r   r   r_  r   r   r   r   rH    s   

zIdleSleeper.maybe_sleepN)r   r   r   rT  r  rH  r   r   r   r   r0    s    r0  c                 C   s   t | dd }|d uo|dS )Nr  HEALTH_CHECK)rJ  r&  )r  r  r   r   r   r    s   r  c                 C   s   t | ttttfS r  r  )r  r   r   r   is_work_request  s   r_  c                   @   sF   e Zd ZdejfddZ	d	deeef de	eeef  fddZ
dS )
r,  socketc                 C   s
   || _ d S r  )r`  )r   r`  r   r   r   r    r8  zSenderWrapper.__init__Nr  recv_objc                 C   sD   | j d u rd S t|tr|jd ur|jd u r|j|_| j | d S r  )r`  r   r2   r  r  )r   r  ra  r   r   r   r    s   


zSenderWrapper.send_outputr  )r   r   r   r   Socketr  r   r2   r1   r
   r  r   r   r   r   r,    s    
r,  r   r   r   r   r   r   r   r   r   c
              
   C   s  d}
|d u rdt jv rtt jd }|d ur|
d| 7 }
| jdkr)|
d| 7 }
| jdkr5|
d| 7 }
| jdkrA|
d| 7 }
| jdkrM|
d| 7 }
| jdkrY|
d	| 7 }
td
|
	dd  t
  t  t  }t| |
d t  tdrt| j| j| j| | j }d urtj st||  | jrt| jd d}| jdkrd}n| jdkrd}t||| zt| ||||||||	}d|j |j!d}| " r|# \}}|$|||d |	%| |j}|t&j'kr|j(r|)  W d S | jdkr|*  W d S |j+r|,  W d S |-  W d S |t&j.kr?| jdkr-|/  W d S |j+r8|0  W d S |1  W d S |t&j2krd| jdkrR|3  W d S |j+r]|4  W d S |5  W d S W d S  t6y   t7 }t89d|  |:t;j< Y d S w )Nr  SGLANG_DP_RANKz DPr^  z PPz ATTN_CPz MOE_DPz TPz EPzsglang::scheduler r  )prefixSGLANG_SET_CPU_AFFINITYsglangr   prefillzPrefill Schedulerr  zDecode Schedulerready)r  rf  rk  )r   *remote_instance_transfer_engine_session_id1remote_instance_transfer_engine_weights_info_dictzScheduler hit an exception: )=osenvironr  r   r   r   r   r   setproctitlereplacefaulthandlerr  r   psutilProcessr  r   r   r   r   nnodes	numa_noder#   SGLANG_NUMA_BIND_V2r   r   r   r   otlp_traces_endpointr  r   r   rf  rk  1remote_instance_weight_loader_use_transfer_enginerS  r<  sendr   r  r   event_loop_pdmuxevent_loop_ppr   rw  rk  r  event_loop_pp_disagg_prefill!event_loop_overlap_disagg_prefill event_loop_normal_disagg_prefillr   event_loop_pp_disagg_decode event_loop_overlap_disagg_decodeevent_loop_normal_disagg_decoder  r   rX  r  send_signalsignalSIGQUIT)r   r   r   r   r   r   r   r   r   pipe_writerre  parent_processrt  thread_labelr  result_dictrj  rk  r  	tracebackr   r   r   run_scheduler_process  s   







r  )rT  rp  r  rl  r  r  r  collectionsr   dataclassesr   httpr   typingr   r   r   r	   r
   r   r   rq  rn  r   torch.distributedr   
torch.cudar   
CudaStreamr   CudaStreamContextr   sglang.srt.configs.model_configr   &sglang.srt.constrained.grammar_managerr    sglang.srt.disaggregation.decoder   r   r   8sglang.srt.disaggregation.decode_kvcache_offload_managerr   )sglang.srt.disaggregation.encode_receiverr   !sglang.srt.disaggregation.prefillr   r   r   sglang.srt.disaggregation.utilsr   r   r   r   r   sglang.srt.distributedr   r    %sglang.srt.distributed.parallel_stater!   sglang.srt.dllm.mixin.schedulerr"   sglang.srt.environr#   #sglang.srt.eplb.expert_distributionr$   %sglang.srt.layers.attention.mamba.opsr%   sglang.srt.layers.dp_attentionr&   r'   r(   sglang.srt.layers.moer)   (sglang.srt.layers.quantization.fp4_utilsr*   (sglang.srt.layers.quantization.fp8_utilsr+   #sglang.srt.lora.lora_overlap_loaderr,   sglang.srt.managers.io_structr-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   sglang.srt.managers.mm_utilsrh   ri   !sglang.srt.managers.overlap_utilsrj   #sglang.srt.managers.prefill_delayerrk   rl   "sglang.srt.managers.schedule_batchrm   rn   ro   rp   rq   rr   #sglang.srt.managers.schedule_policyrs   rt   ru   +sglang.srt.managers.scheduler_dp_attn_mixinrv   +sglang.srt.managers.scheduler_input_blockerrw   +sglang.srt.managers.scheduler_metrics_mixinrx   ry   rz   4sglang.srt.managers.scheduler_output_processor_mixinr{   &sglang.srt.managers.scheduler_pp_mixinr|   ,sglang.srt.managers.scheduler_profiler_mixinr}   *sglang.srt.managers.scheduler_recv_skipperr~   3sglang.srt.managers.scheduler_runtime_checker_mixinr   r   2sglang.srt.managers.scheduler_update_weights_mixinr   &sglang.srt.managers.session_controllerr   sglang.srt.managers.utilsr   r   &sglang.srt.mem_cache.cache_init_paramsr   sglang.srt.mem_cache.commonr    sglang.srt.mem_cache.radix_cacher   ,sglang.srt.model_executor.forward_batch_infor   r   'sglang.srt.multiplex.multiplexing_mixinr   "sglang.srt.parser.reasoning_parserr   sglang.srt.server_argsr   r   r    sglang.srt.speculative.spec_infor   sglang.srt.tracing.tracer   r   r   r   r   r   r   sglang.srt.utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   &sglang.srt.utils.hf_transformers_utilsr   r   r   +sglang.srt.utils.torch_memory_saver_adapterr   sglang.utilsr   r   	getLoggerr   rX  SGLANG_TEST_RETRACTr   rb  SGLANG_TEST_RETRACT_INTERVALr  !SGLANG_TEST_RETRACT_NO_PREFILL_BSrc  r   r   r0  r  r_  r,  r  r  r   r   r   r   <module>   s  $= $H	




                     N	