o
    پi*                    @  s,  U d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlmZmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA ddlBmCZC ddlDmEZE eFeGZHdZIh dZJg dZKg dZLg eLdZMg dZNg dZOg dZPg dZQg dZRg dZSddgZTddgZUdZVg dZWddgZXd gZYg d!ZZg d"Z[g d#Z\g d$Z]g d%Z^g d&Z_dd'gZ`d(d) Zad*d+ Zbd,d- Zcd.d/ Zdd0d1 Zed2d3 Zfd4d5 Zgd6d7 Zhd8d9 Zid:d; Zjd<d= Zkd>d? Zld@dA ZmejnG dBdC dCZodapdDeqdE< d_dGdHZrerZsd`dJdKZtdadNdOZudPZvdQZwejnG dRdS dSZxG dTdU dUejyZzdbdXdYZ{G dZd[ d[ejyZ|dcd]d^Z}dS )dzThe arguments of the server.    )annotationsN)AnyCallableDictListLiteralOptionalUnion)ConnectorType)envs)FunctionCallParser)
CHUNK_SIZELoRARef)ReasoningParser) LORA_TARGET_ALL_MODULESSUPPORTED_LORA_TARGET_MODULEScheck_pkg_version_at_leastconfigure_ipv6cpu_has_amx_supportget_bool_env_var
get_deviceget_device_memory_capacityget_device_nameget_device_smget_free_portget_int_env_varget_quantization_configis_blackwell_supportedis_cudais_flashinfer_availableis_hipis_hopper_with_cuda_12_3is_no_spec_infer_or_topk_oneis_npuis_remote_urlis_sm90_supportedis_sm100_supportedis_sm120_supportedis_triton_kernels_availableis_valid_ipv6_addressjson_list_typenullable_strparse_connector_typetorch_releasewait_port_availablexpu_has_xmx_support)check_gguf_file)is_in_ci >   ascendpytorch
flashinfer)autoptsafetensorsnpcachedummysharded_stateggufbitsandbyteslayeredflash_rlremoteremote_instancefastsafetensorsprivate)awqfp8mxfp8gptqmarlingptq_marlin
awq_marlinr>   r=   modeloptmodelopt_fp8modelopt_fp4petit_nvfp4	w8a8_int8w8a8_fp8	moe_wna16qoqw4afp8mxfp4z
auto-roundcompressed-tensors	modelslimquark_int4fp8_moeunquant)tritontorch_nativeflex_attentionnsacutlass_mlafa3fa4r6   flashmla
trtllm_mla
trtllm_mhadual_chunk_flash_attnaiterwave	intel_amxr4   	intel_xpu)rZ   csgmvr4   r[   )mooncakenixlr4   fakemori)zmq_to_schedulerzmq_to_tokenizerrj   )xgrammaroutlines
llguidancenone)r6   r_   rZ   r_   rZ   in-seq-splitround-robin-splitlru)flashmla_sparseflashmla_kvflashmla_autor_   tilelangre   trtllmlfufsdp)	r7   	deep_gemmrZ   triton_kernelflashinfer_trtllmflashinfer_cutlassflashinfer_mxfp4flashinfer_cutedslcutlass)rs   deepeprj   rm   ascend_fuseepr6   )r7   r~   r   flashinfer_deepgemmr   rZ   re   )r7   flashinfer_cudnnr   r   )float32bfloat16float16)r7   	no_bufferextra_bufferr6   c                 C     t |  d S N)LOAD_FORMAT_CHOICESextendchoicesr3   r3   J/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/server_args.pyadd_load_format_choices      r   c                 C  r   r   )QUANTIZATION_CHOICESr   r   r3   r3   r   add_quantization_method_choices   r   r   c                 C  r   r   )ATTENTION_BACKEND_CHOICESr   r   r3   r3   r   add_attention_backend_choices   r   r   c                 C  r   r   )DISAGG_TRANSFER_BACKEND_CHOICESr   r   r3   r3   r   #add_disagg_transfer_backend_choices   r   r   c                 C  r   r   )GRAMMAR_BACKEND_CHOICESr   r   r3   r3   r   add_grammar_backend_choices   r   r   c                 C  r   r   )MOE_RUNNER_BACKEND_CHOICESr   r   r3   r3   r   add_moe_runner_backend_choices   r   r   c                 C  r   r   )FP8_GEMM_RUNNER_BACKEND_CHOICESr   r   r3   r3   r   #add_fp8_gemm_runner_backend_choices   r   r   c                 C  r   r   )FP4_GEMM_RUNNER_BACKEND_CHOICESr   r   r3   r3   r   #add_fp4_gemm_runner_backend_choices   r   r   c                 C  r   r   )'DETERMINISTIC_ATTENTION_BACKEND_CHOICESr   r   r3   r3   r   +add_deterministic_attention_backend_choices   r   r   c                 C  r   r   )/RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKENDr   r   r3   r3   r   ;add_radix_supported_deterministic_attention_backend_choices  r   r   c                 C  r   r   )RADIX_EVICTION_POLICY_CHOICESr   r   r3   r3   r   !add_radix_eviction_policy_choices  r   r   c                 C  r   r   )RL_ON_POLICY_TARGET_CHOICESr   r   r3   r3   r   add_rl_on_policy_target_choices
  r   r   c                 C  r   r   )MAMBA_SSM_DTYPE_CHOICESr   r   r3   r3   r   add_mamba_ssm_dtype_choices  r   r   c                   @  s  e Zd ZU dZded< dZded< dZded< d	Zd
ed< dZded< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZd
ed< dZded< dZded < dZded!< dZded"< dZded#< dZded$< dZded%< dZded&< dZded'< dZded(< dZded)< dZd*ed+< dZded,< dZ ded-< dZ!ded.< dZ"ded/< dZ#ded0< dZ$d1ed2< dZ%ded3< dZ&ded4< dZ'ded5< dZ(ded6< dZ)ded7< d8Z*d
ed9< dZ+ded:< d;Z,ded<< dZ-ded=< dZ.ded>< dZ/ded?< d@Z0d
edA< dBZ1dCedD< dZ2dedE< dFZ3dCedG< dZ4dedH< dIZ5dedJ< dZ6dedK< dLZ7d
edM< dZ8d1edN< dZ9dOedP< dZ:dOedQ< dZ;dedR< d	Z<d
edS< d	Z=d
edT< dZ>dedU< dVZ?d
edW< d	Z@d
edX< dZAdedY< dZBdedZ< dZCded[< dZDded\< d]ZEdCed^< dZFd1ed_< dZGded`< dZHdeda< dZIdedb< dVZJd
edc< d	ZKd
edd< dZLdede< dZMdfedg< dhZNdedi< dZOdedj< dZPdedk< dlZQd
edm< dnZRdedo< dZSdpedq< eTjUdrds dtZVduedv< dZWdedw< dZXdedx< dZYdedy< dZZdedz< d{Z[ded|< dZ\dped}< dZ]d~ed< dZ^dOed< dZ_dOed< dZ`dOed< dZaded< dZbdped< dZcdped< dZddCed< dZed
ed< dZfded< dZgded< dZhded< dZided< dZjded< dZkded< dZlded< dZmded< dZnded< dZoded< dZpded< dZqded< dZrded< dZsded< dZtded< dZuded< dZvded< dZwded< dZxded< d	Zyd
ed< dZzded< d	Z{d
ed< d	Z|d
ed< dZ}ded< d	Z~d
ed< dVZd
ed< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZd
ed< dIZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dBZdCed< dBZdCed< dZded< dZded< dZded< dZded< dZded< dZded< d	Zd
ed< dZd
ed< d	Zd
ed< d@Zd
ed< dZded< dZd
ed< dZd
ed< dZded< d	Zd
ed< dZded< dZded< dZded< dZded< dZded< dVZd
ed< dZded< dZded< dZded< dZded< dZd
ed< dZded< dBZdCed< dZded< dZded< dZded< dZded< dZded< dZded < dZded< dZded< dZded< dZdCed< dZded< dZd
ed< dZded	< d
ZdCed< dVZd
ed< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded < dZded!< dZded"< dZded#< d$Zd
ed%< dZd
ed&< d'Zded(< d)Zd
ed*< dVZd
ed+< d,Zd
ed-< d	Zd
ed.< d	Zd
ed/< d0Zded1< dZd2ed3< dZded4< dZded5< dZd6ed7< dZded8< dZded9< dZded:< dZded;< dZded<< dZded=< dZded>< dZded?< dZded@< dZdedA< dZdedB< dZdedC< dZdedD< dZdedE< dZdedF< dZdedG< dZdedH< dZdedI< dZdedJ< dZdedK< dLZ dCedM< dZdedN< dZdedO< dZdedP< d$Zd
edQ< dZdedR< dZd6edS< dTZdedU< dZdedV< dZ	dedW< dZ
dedX< dZdedY< dZd
edZ< dZded[< d	Zd
ed\< dZded]< dZded^< dZded_< dZded`< dZdeda< dZdedb< dZdedc< dZdedd< dZdede< dZdedf< dZdedg< dZdedh< dZdedi< d	Zd
edj< dZd6edk< dZdedl< dZdedm< dZ dedn< dZ!dedo< dpZ"dedq< dZ#dedr< dZ$deds< dZ%dedt< d$Z&d
edu< dvZ'dCedw< dZ(dedx< dZ)d6edy< dZ*dedz< dZ+ded{< d|Z,d}ed~< dZ-ded< dZ.d
ed< dZ/ded< dZ0ded< d	Z1ded< dZ2ded< dZ3ded< dZ4d
ed< d	Z5d
ed< dZ6ded< dZ7ded< e8dV Z9ded< eTjUe:dtZ;dued< dZ<dped< dZ=ded< dZ>ded< dZ?ded< dZ@d6ed< dZAded< dZBded< dZCded< dZDded< dZEd
ed< d$ZFd
ed< dZGdCed< dZHded< dZIded< dZJded< dZKded< dZLded< dZMded< dZNded< dZOded< dd ZPdd ZQdd ZRdd ZSdd ZTdd ZUdd ZVdd ZWdd ZXdd ZYdd ZZd1ddZ[d2dÐdĄZ\dŐdƄ Z]	ǐ		d3d4d̐d̈́Z^dΐdτ Z_dАdф Z`dҐdӄ ZadԐdՄ Zbd֐dׄ Zcdؐdل Zddڐdۄ Zedܐd݄ Zfdސd߄ Zgdd Zhdd Zidd Zjdd Zkdd Zldd Zmdd Zndd Zodd Zpdd Zqdd Zrd5ddZsdd Ztdd Zudd Zvdd  Zwdd Zxdd Zydd Zze{d6d	d
Z|e}d7ddZ~dd Zdd Zdd Zdd Zdd Zd8ddZed9ddZdd Zdd  Zd!d" Zd:d%d&Zd;d)d*Zd+d, Zd-d. Zd/d0 ZdS (<  
ServerArgsa8  
    The arguments of the server.

    NOTE: When you add new arguments, please make sure the order
    in this class definition the same as the order in the the function
    `ServerArgs.add_cli_args`.
    Please follow the existing style to group the new arguments into related groups or create new groups.
    str
model_pathNOptional[str]tokenizer_pathr7   tokenizer_mode   inttokenizer_worker_numFboolskip_tokenizer_initload_formatz{}model_loader_extra_configtrust_remote_codeOptional[int]context_lengthis_embeddingzOptional[bool]enable_multimodalrevision
model_impl	127.0.0.1hosti0u  port fastapi_root_path	grpc_modeskip_server_warmupwarmups	nccl_port+checkpoint_engine_wait_weights_before_readydtypequantizationquantization_param_pathkv_cache_dtypeenable_fp32_lm_headzOptional[Union[str, Dict]]modelopt_quant modelopt_checkpoint_restore_pathmodelopt_checkpoint_save_pathmodelopt_export_pathquantize_and_serverl_quant_profilezOptional[float]mem_fraction_staticmax_running_requestsmax_queued_requestsmax_total_tokenschunked_prefill_sizeenable_dynamic_chunking @  max_prefill_tokensprefill_max_requestsfcfsschedule_policyenable_priority_schedulingabort_on_priority_when_disabled"schedule_low_priority_values_first
   (priority_scheduling_preemption_threshold      ?floatschedule_conservativeness	page_size皙?swa_full_tokens_ratiodisable_hybrid_swa_memoryrv   radix_eviction_policyenable_prefill_delayer    prefill_delayer_max_delay_passes)prefill_delayer_token_usage_low_watermarkzOptional[List[float]]&prefill_delayer_forward_passes_buckets$prefill_delayer_wait_seconds_bucketsdevicetp_sizepp_sizepp_max_micro_batch_sizer   pp_async_batch_depthstream_intervalstream_outputrandom_seed#constrained_json_whitespace_pattern'constrained_json_disable_any_whitespace,  watchdog_timeoutsoft_watchdog_timeoutdist_timeoutdownload_dirmodel_checksumbase_gpu_idgpu_id_stepsleep_on_idlezOptional[Callable]custom_sigquit_handlerinfo	log_levellog_level_httplog_requests   log_requests_leveltextlog_requests_formatzOptional[List[str]]log_requests_targetc                   C  s   t tS r   )list+DEFAULT_UVICORN_ACCESS_LOG_EXCLUDE_PREFIXESr3   r3   r3   r   <lambda>y  s    zServerArgs.<lambda>)default_factory	List[str]#uvicorn_access_log_exclude_prefixescrash_dump_foldershow_time_costenable_metrics!enable_metrics_for_all_schedulerszx-custom-labels&tokenizer_metrics_custom_labels_header'tokenizer_metrics_allowed_custom_labelszOptional[Dict[str, str]]extra_metric_labelsbucket_time_to_first_tokenbucket_inter_token_latencybucket_e2e_request_latencycollect_tokens_histogramprompt_tokens_bucketsgeneration_tokens_bucketsg        gc_warning_threshold_secs(   decode_log_interval!enable_request_time_stats_loggingkv_events_configenable_tracelocalhost:4317otlp_traces_endpointexport_metrics_to_fileexport_metrics_to_file_dirapi_keyadmin_api_keyserved_model_namedefaultweight_versionchat_templatehf_chat_template_namecompletion_templatesglang_storagefile_storage_pathenable_cache_reportreasoning_parsertool_call_parsertool_servermodelsampling_defaultsdp_sizeload_balance_methodattn_cp_sizemoe_dp_sizedist_init_addrnnodes	node_rankjson_model_override_argspreferred_sampling_paramsenable_loraenable_lora_overlap_loadingmax_lora_rankz$Optional[Union[set[str], List[str]]]lora_target_moduleszOOptional[Union[dict[str, str], List[dict[str, str]], List[str], List[LoRARef]]]
lora_pathsmax_loaded_loras   max_loras_per_batchlora_eviction_policyri   lora_backend   max_lora_chunk_sizeattention_backenddecode_attention_backendprefill_attention_backendsampling_backendgrammar_backendmm_attention_backendfp8_gemm_runner_backendr   fp4_gemm_runner_backendnsa_prefill_backendnsa_decode_backenddisable_flashinfer_autotunerZ   mamba_backendspeculative_algorithmspeculative_draft_model_path speculative_draft_model_revisionspeculative_draft_load_formatspeculative_num_stepsspeculative_eagle_topkspeculative_num_draft_tokens#speculative_accept_threshold_single speculative_accept_threshold_accspeculative_token_mapprefillspeculative_attention_mode#speculative_draft_attention_backendspeculative_moe_runner_backendspeculative_moe_a2a_backend$speculative_draft_model_quantization'speculative_ngram_min_match_window_size   'speculative_ngram_max_match_window_size!speculative_ngram_min_bfs_breadth!speculative_ngram_max_bfs_breadthBFSzLiteral['BFS', 'PROB']speculative_ngram_match_type   speculative_ngram_branch_lengthi speculative_ngram_capacityenable_multi_layer_eagleep_sizers   zLLiteral['none', 'deepep', 'mooncake', 'mori', 'ascend_fuseep', 'flashinfer']moe_a2a_backendmoe_runner_backendzLiteral['default', 'bf16']flashinfer_mxfp4_moe_precision"enable_flashinfer_allreduce_fusionz(Literal['auto', 'normal', 'low_latency']deepep_modeep_num_redundant_expertsz.Optional[Literal['static', 'dynamic', 'fake']]ep_dispatch_algorithmtrivialinit_expert_locationenable_eplbeplb_algorithm  eplb_rebalance_num_iterationseplb_rebalance_layers_per_chunk*eplb_min_rebalancing_utilization_thresholdzAOptional[Literal['stat', 'stat_approx', 'per_pass', 'per_token']]!expert_distribution_recorder_mode(expert_distribution_recorder_buffer_size"enable_expert_distribution_metricsdeepep_configmoe_dense_tp_sizezLiteral[None, 'mooncake']elastic_ep_backendmooncake_ib_devicemax_mamba_cache_sizemamba_ssm_dtypeg?mamba_full_memory_ratiomamba_scheduler_strategy   mamba_track_intervalenable_hierarchical_cacheg       @hicache_ratiohicache_sizewrite_throughhicache_write_policykernelhicache_io_backendlayer_firsthicache_mem_layoutdisable_hicache_numa_detecthicache_storage_backendbest_efforthicache_storage_prefetch_policy$hicache_storage_backend_extra_config*hierarchical_sparse_attention_extra_configenable_lmcachekt_weight_path	kt_methodkt_cpuinferkt_threadpool_countkt_num_gpu_experts!kt_max_deferred_experts_per_tokendllm_algorithmdllm_algorithm_configenable_double_sparsityds_channel_config_path    ds_heavy_channel_numds_heavy_token_numqkds_heavy_channel_type   ds_sparse_decode_thresholdcpu_offload_gboffload_group_sizeoffload_num_in_groupoffload_prefetch_stepcpuoffload_modezOptional[Union[int]]multi_item_scoring_delimiterdisable_radix_cachecuda_graph_max_bsOptional[List[int]]cuda_graph_bsdisable_cuda_graphdisable_cuda_graph_paddingenable_profile_cuda_graphenable_cudagraph_gcenable_layerwise_nvtx_markerenable_nccl_nvlsenable_symm_mem,disable_flashinfer_cutlass_moe_fp4_allgatherenable_tokenizer_batch_encodedisable_tokenizer_batch_decodedisable_outlines_disk_cachedisable_custom_all_reduceenable_mscclppenable_torch_symm_memdisable_overlap_scheduleenable_mixed_chunkenable_dp_attentionenable_dp_lm_headenable_two_batch_overlapenable_single_batch_overlapgQ? tbo_token_distribution_thresholdenable_torch_compileenable_piecewise_cuda_graphenable_torch_compile_debug_modetorch_compile_max_bspiecewise_cuda_graph_max_tokenspiecewise_cuda_graph_tokenseagerpiecewise_cuda_graph_compilertorchao_configenable_nan_detectionenable_p2p_checktriton_attention_reduce_in_fp32triton_attention_num_kv_splits triton_attention_split_tile_sizenum_continuous_decode_stepsdelete_ckpt_after_loadingenable_memory_saverenable_weights_cpu_backupenable_draft_weights_cpu_backupallow_auto_truncateenable_custom_logit_processorflashinfer_mla_disable_raggeddisable_shared_experts_fusiondisable_chunked_prefix_cachedisable_fast_image_processorkeep_mm_feature_on_deviceenable_return_hidden_statesenable_return_routed_expertsscheduler_recv_interval	numa_nodeenable_deterministic_inferencerl_on_policy_targetenable_attn_tp_input_scattered#enable_nsa_prefill_context_parallelru   nsa_prefill_cp_modeenable_fused_qk_norm_rope&enable_precise_embedding_interpolationenable_dynamic_batch_tokenizer"dynamic_batch_tokenizer_batch_sizegMb`?%dynamic_batch_tokenizer_batch_timeoutdebug_tensor_dump_output_folderdebug_tensor_dump_layersdebug_tensor_dump_input_filedebug_tensor_dump_injectnullz$Literal['null', 'prefill', 'decode']disaggregation_moderj   disaggregation_transfer_backendi&#  disaggregation_bootstrap_portdisaggregation_decode_tpdisaggregation_decode_dpdisaggregation_prefill_ppdisaggregation_ib_device,disaggregation_decode_enable_offload_kvcache   num_reserved_decode_tokens&disaggregation_decode_polling_intervalencoder_onlylanguage_onlyencoder_transfer_backendencoder_urlscustom_weight_loaderweight_loader_disable_mmap.remote_instance_weight_loader_seed_instance_ip8remote_instance_weight_loader_seed_instance_service_port6remote_instance_weight_loader_send_weights_group_portsncclz"Literal['transfer_engine', 'nccl']%remote_instance_weight_loader_backend<remote_instance_weight_loader_start_seed_via_transfer_engineenable_pdmuxpdmux_config_pathsm_group_nummm_max_concurrent_callsg      $@mm_per_request_timeout"enable_broadcast_mm_inputs_processenable_prefix_mm_cachemm_enable_dp_encoderzOptional[Dict[str, Any]]mm_process_configz$Optional[Union[str, Dict[str, int]]]limit_mm_data_per_requestdecrypted_config_filedecrypted_draft_config_filezOptional[List[dict[str, Any]]]forward_hooksc                 C  sB  |    | j dv rdS |   |   |   |   |   |   t	| j
}| | |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |   |    | !  | "  | #  | $  | %  | &  | '  dS )zv
        Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
        )rs   r;   N)(_handle_load_balance_methodr   lower_handle_deprecated_args"_handle_prefill_delayer_env_compat_handle_missing_default_values_handle_hpu_backends_handle_cpu_backends_handle_npu_backendsr   r   _handle_gpu_memory_settings"_handle_model_specific_adjustments_handle_sampling_backend'_handle_attention_backend_compatibility_handle_mamba_backend_handle_kv4_compatibility_handle_page_size_handle_amd_specifics_handle_grammar_backend_handle_hicache_handle_data_parallelism_handle_context_parallelism_handle_moe_kernel_config_handle_a2a_moe_handle_eplb_and_dispatch#_handle_expert_distribution_metrics_handle_elastic_ep_handle_pipeline_parallelism_handle_speculative_decoding_handle_load_format_handle_pd_disaggregation_handle_encoder_disaggregation_handle_tokenizer_batching_handle_environment_variables_handle_cache_compatibility_handle_deterministic_inference_handle_dllm_inference_handle_debug_utils_handle_other_validations)selfgpu_memr3   r3   r   __post_init__  sN   

zServerArgs.__post_init__c                 C  sr   | j dvrtd| j | jdkr!| j dkrd| _d S d| _d S | j dkr5| jdkr7td d| _d S d S d S )Nr  rk  decodezInvalid disaggregation_mode=r7   rk  follow_bootstrap_roomround_robinzIn PD-disaggregation prefill mode, the 'round_robin' load balancing method means `bootstrap_room` routing (use 'follow_bootstrap_room' instead). Falling back to 'follow_bootstrap_room' for backward compatibility.)r  
ValueErrorrA  loggerwarningrQ  r3   r3   r   r,    s(   






z&ServerArgs._handle_load_balance_methodc                 C  sH   ddd}| j |v r"td| j  d|| j   d || j  | _ d S d S )Nqwenglm)qwen25glm45zThe tool_call_parser 'z' is deprecated. Please use 'z
' instead.)r<  rY  rZ  )rQ  deprecated_tool_call_parsersr3   r3   r   r.  =  s   

z"ServerArgs._handle_deprecated_argsc                 C  s@   t j rd| _t j  }r|| _t j  }r|| _d S d S )NT)r   &SGLANG_SCHEDULER_DECREASE_PREFILL_IDLEgetr   'SGLANG_PREFILL_DELAYER_MAX_DELAY_PASSESr   0SGLANG_PREFILL_DELAYER_TOKEN_USAGE_LOW_WATERMARKr   rQ  xr3   r3   r   r/  F  s   

z-ServerArgs._handle_prefill_delayer_env_compatc                 C  s   | j d u r	| j| _ | jd u r| j| _| jd u rt | _| jd u r'tdd| _| jd u r/i | _t	drYt
j| jsYddlm} || j| j| jd| _|| j | j| jddgd| _ | jd	krad
| _| jd u rl| j| _d S | jdkrvd | _d S d S )Nr   i   @SGLANG_USE_MODELSCOPE)snapshot_download)	cache_dirr   z*.binz*.safetensors)ri  r   ignore_patternsr7   r   rY   )r   r   r2  r   r   r   randomrandintr'  r   ospathexists
modelscoperh  r  r   r  rp  r   )rQ  rh  r3   r3   r   r0  N  s:   








z)ServerArgs._handle_missing_default_valuesc                 C  s   | j dkrd| _d| _d S d S )Nhpur[   r5   r   rU  rX  r[  r3   r3   r   r1  w  s   

zServerArgs._handle_hpu_backendsc                 C  s(   | j dkr| jd u rd| _d| _d S d S )Nr  rg   r5   rr  r[  r3   r3   r   r2  |  s
   


zServerArgs._handle_cpu_backendsc                 C  sD   | j dkrddlm} ||  | jdkr td d| _d S d S d S )Nnpur   )set_default_server_argsr  zAt this moment Ascend platform only support prefill graph compilation with piecewise_cuda_graph_compiler='eager', change piecewise_cuda_graph_compiler to 'eager'.)r   %sglang.srt.hardware_backend.npu.utilsrt  r  rY  rZ  )rQ  rt  r3   r3   r   r3    s   


zServerArgs._handle_npu_backendsc                 C  sJ  |dur|dk r| j du rd| _ | jdu rd| _n|dk r7| j du r%d| _ | jdu r6| jdk r3d| _nd| _n{|d	k rU| j du rCd
| _ | jdu rT| jdk rQd| _nad| _n]|dk rs| j du rad| _ | jdu rr| jdk rod| _nCd| _n?|dk r| j du rd| _ | jdu r| jdk rd| _n%d| _n!| j du rd| _ | jdu rd| _n| j du rd
| _ | jdu rd| _| jdu r|  | _nt| j| _| jdu r|  s| j | _nd| _| jdu r| 	 | _| j
du rd}| j dkr|t| j dd 7 }n
|t| jdd 7 }|| jd 7 }|| j| j d d 7 }| jr1|| j| j d 7 }| jdkr1|| j| j d 7 }| jr>|t| jd 7 }|durM|d	krMt|d}| jdurh| jdkr^|d7 }n
| jdkrh|d7 }|durvt|| | dnd| _
|  }|jr| js| | | jrtj stjd td dS dS dS )a#  
        Configure GPU memory-dependent settings including
        chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.

        Here are our heuristics:
        - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
          This is because GPUs with more memory are generally more powerful, we need to use a larger
          chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
        - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.

          GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers

          The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
          or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.

          In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
          The activation memory is proportional to the chunked_prefill_size.
          The cuda graph memory is proportional to the cuda_graph_max_bs.
          We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
          and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.

          The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
        Ni P  i   rO  i         P   i   r  r     i h     r  r  i  r   r   g      ?r        r   i (  
STANDALONEi   NGRAMg)\(?zSymmetric memory is enabled, setting symmetric memory prealloc size to 4GB as default.Use environment variable SGLANG_SYMM_MEM_PREALLOC_GB_SIZE to change the prealloc size.)r   r  r   r   _generate_cuda_graph_batch_sizesmaxr  use_mla_backendr  %_generate_piecewise_cuda_graph_tokensr   r   r   r  r@  r  lenra  roundget_model_configis_multimodalr  adjust_mem_fraction_for_vlmr  r    SGLANG_SYMM_MEM_PREALLOC_GB_SIZEis_setsetrY  rZ  )rQ  rR  reserved_memmodel_configr3   r3   r   r4    s   


























z&ServerArgs._handle_gpu_memory_settingsc                   s    j rttd jd }nP jdu r3g dttddd ttddd ttd jd d	 }n+ttdd
dttddd ttddd ttddd ttd jd d } fdd|D }|S )z
        Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
        This integrates the logic from cuda_graph_runner.py.
        r   N)r   r  rv  rO  rr  rS    rO  i  r  r  	   r   !   r  r'  A   rv  H   c                      g | ]	}| j kr|qS r3   )r  ).0bsr[  r3   r   
<listcomp>Q  s    z?ServerArgs._generate_cuda_graph_batch_sizes.<locals>.<listcomp>)r  r  ranger  ra  )rQ  
capture_bsr3   r[  r   r  7  s.   
	z+ServerArgs._generate_cuda_graph_batch_sizesc                   s|   t tdddt tddd t tddd t td	d
d t tddd t td jd d } fdd|D }|S )z
        Generate the list of batch sizes for piecewise CUDA graph capture
        based on piecewise_cuda_graph_max_tokens.
        rv  r  0   r  rS  i   i  r  i@  i  @   i   i  r  i   r   r  c                   r  r3   )r  )r  sr[  r3   r   r  c  s    zDServerArgs._generate_piecewise_cuda_graph_tokens.<locals>.<listcomp>)r  r  r  )rQ  capture_sizesr3   r[  r   r  U  s    
	z0ServerArgs._generate_piecewise_cuda_graph_tokensmajorreturnc                 C  s   | j d u}| jd u}|s|r| jdkrtd | jdkr3|dkr#dnd| _td| j d| d | jd	kr;d| _| jd
v sDJ dd S )Nr7   zWhen specifying --nsa-prefill-backend or --nsa-decode-backend, you should also explicitly set --kv-cache-dtype (e.g., 'fp8_e4m3' or 'bfloat16'). DeepSeek V3.2 defaults to FP8 KV cache which may not be compatible with all backends.r   fp8_e4m3r   zSetting KV cache dtype to z for DeepSeek DSA on SMz device.bf16)r   r  zCDeepSeek DSA only supports bf16/bfloat16 or fp8_e4m3 kv_cache_dtype)r]  r^  r   rY  rZ  )rQ  r  user_set_prefilluser_set_decoder3   r3   r   _set_default_nsa_kv_cache_dtypei  s    



z*ServerArgs._set_default_nsa_kv_cache_dtypec              	   C  s   | j d u}| jd u}|dkr|sd| _ |sd| _n|dkr(|s"d| _ |s'd| _n
|s-d| _ |s2d| _td| j d	| j  d
| j d d S )Nr  ry   rx   r   rw   r{   r_   zSet NSA backends for z KV Cache: prefill=z	, decode=.)r]  r^  rY  rZ  r   )rQ  r   r  r  r  r3   r3   r   _set_default_nsa_backends  s*   

z$ServerArgs._set_default_nsa_backendsc                 C  s>  ddl m} t| jtjkrd S |  j}|jd }|dv r!d| _	|dv r||r|dkr<t
 r<tjd td |  rHd	| _td
 t s| jrtd | jdkrnd| _d| _d| _| j| _d| _td nd| _d| _| jdks}J d| jdksJ dtd| j d| j d| j d| j d| j d| j d n| j| jk rtd| j d| j d| j d| j d	 t rd| _td  nd!| _td" dd l}|j ! \}}| "| | #| j| | jr| j$d#ksJ d$n'| j%rtd% t& r | jd u r | j'd u r | j(d u r d&| _td' t& rt)|}| j*d u rD|d u rA|d(v rAd)| _*td* n|| _*| jd+kr^| j+d,kr^| j*d-v r^d.| _+td/ | j*d0kr| j,d1kr| j-d u sv| j.d u rtj/0 rd2| _-d| _.td3 nd4| _-d+| _.td5 n|d6v r\|  rt& rd7| _nt1 rd8| _nd4| _g d9}| 2 \}	}
|	|v r|
|v sJ d:| d;|	 d<|
 d=|	d7ks|
d7krd| _3td> t)|}|d?k}|rd| _	| j+d,krJt
 r|rd@| _+tdA n?t r t4dBr |r d,| _+tdC n*t r2t4dBr2d4| _+tdD n| jdkrJt5 rJ| j*d u rJdE| _+tdF | j+dEkrZ| jdksZJ dGndH|v r| j,d1krd| _6tdI tj70 stj7d tdJ | j8rdK| _9tdL d| _3tdM ndN|v r| j,d1krd| _6tdO tj70 stj7d tdJ | j8rdK| _9tdP d| _3tdQ n{dR|v rE| j:dSkrE| jd u rt& rdT\| _}n%t1 rdU\| _}nt rdV\| _}n| j:dWkrdX\| _}ndY\| _}tdZ| j d[| d\ | jd]v s+J d^| j t& rC| j+d,krC| j*d_v rCd.| _+td` n|dav rXtdb| dc d| _3n|ddv r|j;d urtde| dc d| _3g df}| j|v sJ dg| dh| di| j n|djv rtde| dc d| _3| jd u rt< rt& rd7| _nt< rt= dkkrd8| _nd4| _| jdlksJ dmtdn| j do| dp nu|dqv r| j>|drds ng|dtv r-|  }|j*duv r|jj?dvksJ |j*dwkr|jj@dx dykrd0ndz| _*n|j*| _*d{| _+| j>|ddrdld| | jd4ks+J d}n|d~v rlt& rkt)|}| j*d u rH|d urH|| _*| j*dv sT| j*d u rk| jd+krk| j+d,krkd.| _+td|  n|dv rt& rt)|}| j*d u r|d ur|| _*| j*dv s| j*d u r| jd+kr| j+d,krd.| _+td|  | j>|ddd4d| n|dv rt& rtA|dd }|d ur|0dnd }| j*d u r|d ur|| _*| j*d0kr| jd+kr| j+d,krtBddrd.| _+td nM|dv r| j>|ddrd4d| n>|dkr0|  j}tCdd tA|dg D }|r/| j>|drd4d n|dv rK| j>|ddrdld| | jd4ksKJ | dtjDE rYd| _Ftd tG }|ogd|v ogd|v}| jHs|dv rt1 syt& r| js| jIdkr|s| jd+krd| _Hd S d S d S d S d S d S d S d S )Nr   )is_deepseek_nsaMistralLarge3ForCausalLMPixtralForConditionalGenerationr   )DeepseekV3ForCausalLMKimiK25ForConditionalGenerationr  r  GlmMoeDsaForCausalLMr  Tz_Force NSA prefill to use MLA (i.e. disable MHA_ONE_SHOT) for GlmMoeDsaForCausalLM on Blackwell.r]   z0Use nsa attention backend for DeepSeek with DSA.zaContext parallel feature is still under experiment. It has only been verified on Hopper platform.rt   r   r   r  zFor in-seq split mode, we have the following restrictions: moe_dense_tp_size == 1, moe_a2a_backend == deepep, ep_size == tp_size, kv_cache_dtype == bf16, batch_size == 1z:For round-robin split mode, dp attention is not supported.rO  z}Current multi-machine CP support suffers from precision issues. So context parallel only support Single machine(tp_size == 8)zEEnable Context Parallel opt for deeeseekv3.2-DSA, Setting dp_size == z and moe_dense_tp_size == z, ep_size == z, tp_size == z, kv_cache_dtype == z, moe_a2a_backend  z$DSA with TP mode is active, dp_size=z
, tp_size=z, attn_tp_size=z+, attention weights will be sharded across z ranks.z0Setting page size to 1 for DeepSeek DSA on ROCm.r  z)Setting page size to 64 for DeepSeek DSA.rU  zmCP is only supported for prefill when PD disaggregation, please remove --enable-nsa-prefill-context-parallel.z5Piecewise CUDA graph is enabled, use MLA for prefill.rb   zFUse trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM)r  rF   z@Quantization not specified, default to fp8 for DeepSeek on sm100rs   r7   )rF   rM   rN   r   zNUse flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLMrN   EAGLEr~   z`Use deep_gemm moe runner and deepep a2a backend for bf16 nextn layer in deepseek fp4 checkpoint.rZ   zPUse triton fused moe by default for bf16 nextn layer in deepseek fp4 checkpoint.)GptOssForCausalLMrc   r_   )rZ   rc   r_   r`   r4   z"GptOssForCausalLM requires one of z> attention backend, but got the following backends
- Prefill: z
- Decode: 
zNDisable hybrid SWA memory for GPT-OSS model with trtllm_mha attention backend.rU   r   ziDetected Blackwell and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel.SGLANG_USE_AITERz_Detected ROCm and MXFP4 quantization format for GPT-OSS model, enabling aiter MXFP4 MOE kernel.zTDetected ROCm with SGLANG_USE_AITER for GPT-OSS bf16 model, using triton MOE kernel.r   z;Detected GPT-OSS model, enabling triton_kernels MOE kernel.z5Triton kernel MoE is only supported when ep_size == 1MiMoV2FlashForCausalLMzOEnable multi-layer EAGLE speculative decoding for MiMoV2FlashForCausalLM model.z>Spec v2 is enabled for multi-layer EAGLE speculative decoding.r   z[Reset swa_full_tokens_ratio to 1.0 for MiMoV2FlashForCausalLM model with hierarchical cachezRDisable hybrid SWA memory for MiMoV2FlashForCausalLM model with hierarchical cacheStep3p5ForCausalLMzKEnable multi-layer EAGLE speculative decoding for Step3p5ForCausalLM model.zWReset swa_full_tokens_ratio to 1.0 for Step3p5ForCausalLM model with hierarchical cachezNDisable hybrid SWA memory for Step3p5ForCausalLM model with hierarchical cacheLlama4r  )rc   sm100)r_   sm90)re   hipxpu)rh   r  )rZ   zother platformsUse z as attention backend on z for Llama4 model>   r_   re   rZ   rh   rc   zQfa3, aiter, triton, trtllm_mha or intel_xpu is required for Llama4 model but got >   rF   rM   z?Use flashinfer_trtllm as MoE runner backend on SM100 for Llama4)Gemma2ForCausalLMGemma3ForCausalLMGemma3ForConditionalGenerationGemma3nForCausalLMGemma3nForConditionalGenerationzDisable hybrid SWA memory for  as it is not yet supported.)Exaone4ForCausalLMExaoneMoEForCausalLMz Disabling hybrid SWA memory for )r_   rZ   rc   z!One of the attention backends in z is required for 
, but got )Olmo2ForCausalLMrx  r6   zMFlashInfer backend can significantly degrade the performance of Olmo3 models.zUsing z as attention backend for r  )KimiLinearForCausalLMBailingMoeV2_5ForCausalLMF)
model_archsupport_mamba_cache)NemotronHForCausalLM)rL   rM   rN   relu2rL   
quant_algoNVFP4rM   r   )r  r   support_mamba_cache_extra_buffersm100_default_attention_backendzqNemotronHForCausalLM does not support triton attention backend,as the first layer might not be an attention layer)Qwen3MoeForCausalLM"Qwen3VLMoeForConditionalGeneration)rF   rN   z9Use flashinfer_trtllm as MoE runner backend on sm100 for )Qwen3NextForCausalLM"Qwen3_5MoeForConditionalGenerationQwen3_5ForConditionalGeneration)Glm4MoeForCausalLMquantization_configquant_methodzflashinfer-pythonz0.6.3zKUse flashinfer_trtllm as MoE runner backend on sm100 for Glm4MoeForCausalLM)FalconH1ForCausalLMJetNemotronForCausalLMJetVLMForConditionalGenerationGraniteMoeHybridForCausalLMc                 s  s    | ]}|d kV  qdS )mambaNr3   )r  
layer_typer3   r3   r   	<genexpr>k  s
    
z@ServerArgs._handle_model_specific_adjustments.<locals>.<genexpr>layer_types)r  r  r  )Lfm2ForCausalLMz^ does not support triton attention backend, as the first layer might not be an attention layerzIOverlap scheduler is disabled when using sparse head for embedding model.H20H200)r  r  r  Glm4MoeLiteForCausalLMr  r  )Jsglang.srt.configs.model_configr  r-   r   r
   INSTANCEr  	hf_configarchitecturesr   r   r   SGLANG_NSA_FORCE_MLAr  rY  rZ  is_attention_backend_not_setrU  r
  r$   r  r  r  r  r}  r   r|  r   r@  r!   r   torchcudaget_device_capabilityr  r  r  r  r'   rW  rV  r   r   r~  ra  rn  ro  SGLANG_NVFP4_CKPT_FP8_NEXTN_MOErb  r&   get_attention_backendsr   r   r)   r{  SGLANG_ENABLE_SPEC_V2r  r   r   sliding_window_patternr   r   _handle_mamba_radix_cachemlp_hidden_actr  getattrr   anySGLANG_EMBEDDINGS_SPARSE_HEADr  r  r   r  rE  )rQ  r  r  r  r  r  _r  supported_backendsprefill_attn_backenddecode_attn_backendis_mxfp4_quant_formatplatformaccepted_backendsr  r  	has_mambadevice_nameis_h20_devicer3   r3   r   r5    s*  




4












	



















		

z-ServerArgs._handle_model_specific_adjustmentsTr  r  r  r  c                 C  s  t  r| jd u r|d ur|| _td| d|  |s*td| d d| _d S |s8|  r8J d| d|  rt sCJ d| jd urZ| j	| jksZJ d	| j	 d
| j | j
d ur| j	| j
 dkssJ d	| j	 d| j
 tt| j
tt| j
 dksJ dtd| j
d S d S | js| jd u rtd d| _| jdkrtd d| _d| _d S d S td| d d| _d S d S )Nr  z# as attention backend on sm100 for zDisabling Radix Cache for r  Tz(mamba extra_buffer is not supported for z modelzEMamba extra_buffer is only supported on CUDA devices with FLA backendzmamba_track_interval z? must be greater than or equal to speculative_num_draft_tokens r   z  must be divisible by page_size zxFor SSM models with extra buffer, either FLA_CHUNK_SIZE or page_size must be divisible by the other, got FLA_CHUNK_SIZE=z, self.page_size=zDisabling overlap schedule since mamba no_buffer is not compatible with overlap schedule, try to use --disable-radix-cache if overlap schedule is necessaryrc   zDisabling radix cache since trtllm_mha does not support page_size = 1, which is required by MambaRadixCache. Try to use --attention-backend triton if radix cache is necessary.Fz5Disabling radix cache since speculative decoding for z' is not supported with radix cache yet.)r'   rU  rY  r
  rZ  r  enable_mamba_extra_bufferr   rg  r  r   r  FLA_CHUNK_SIZEminra  r  )rQ  r  r  r  r  r3   r3   r   r    sv   











z$ServerArgs._handle_mamba_radix_cachec                 C  s"   | j d u rt r
dnd| _ d S d S )Nr6   r5   )rX  r    r[  r3   r3   r   r6    s   
z#ServerArgs._handle_sampling_backendc                 C  s  |   }|  }| jd ur| j| jkr| j| _| jd u r	 |sOt r*t| r*d| _nOt r?t| r?| jd u s;| j	d ur?d| _n:t
 rFd| _n3t rKdnd| _n*t rVd| _n#t r]d| _nt
 rv|| j}|dksn|dkrrd| _nd| _nd| _td| j d	 | jd
krtd d| _| jdkrtd d| _| jd u sJ d| jdks| jdkrtd d| _| jdks| jdkrtd d| _| jdks| jdkrt std| jdvrtd| j d d| _| jdvrtd| jdks| jdks| jdkr#t std| jdvr#td| j d d| _| jdkr7| jdkr7td  d| _| jd!krO|  sOtd"| j d# d| _| jdkrb|jd$krb|  jd%9  _| jd&krz| jd'krzt sztd( d
| _| jd)kr| jd*krt std+ d| _| jd)kr| jd,vrtd-| j d# d| _t|jd.d d ur| jd u rd/| _td0 n| jd/krtd1| j d2| jd/krtd3 d4| _d| _d S d S )5Nr_   rc   re   r6   rZ      rS  z%Attention backend not specified. Use z backend by default.r[   zFCuda graph is disabled because of using torch native attention backendTr\   zDCuda graph is disabled because of using torch Flex Attention backendzKSpeculative decoding is currently not supported with Flex Attention backendra   zAFlashMLA only supports a page_size of 64, change page_size to 64.r  r^   zFCutlass MLA only supports a page_size of 128, change page_size to 128.rb   zeTRTLLM MLA backend is only supported on Blackwell GPUs (SM100/SM12x). Please use a different backend.)r  r  zNTensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from z to 64.)r  fp4_e2m1r  r7   z[TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3, fp4_e2m1, bf16, or auto.z_TRTLLM MHA backend is only supported on Blackwell GPUs (SM100). Please use a different backend.)rS  r  r  zRTensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from fp8_e5m2zYFlashAttention3 only supports fp8_e4m3 if using FP8; Setting attention backend to triton.r`   zaFA4 backend only supports page size 128 for non-MLA model architectures, changing page_size from z to 128.rz  g333333?rg   r  zWThe current platform does not support Intel AMX, will fallback to torch_native backend.rh   r  zQThe current platform does not support Intel XMX, will fallback to triton backend.)r  r  r   z^Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from dual_chunk_attention_configrd   z-Dual chunk attention is turned on by default.zADual chunk attention is enabled, but attention backend is set to z+. Please set it to 'dual_chunk_flash_attn'.zVMixed chunk and radix cache are disabled when using dual-chunk flash attention backendF)r  r  rW  rV  rU  r"   r#   r'   ra  rf  r!   r    get_num_kv_headsr   rY  r
  rZ  r  r   r   rX  r   context_lenr   r   r   r0   r  r  r  r  )rQ  r  r  head_numr3   r3   r   r7    s  














z2ServerArgs._handle_attention_backend_compatibilityc                 C  s,  | j dkrdS |  }|  \| _| _t r| j| jkr1| jdkr1td| j d| j d dS | jdkrd|rNg d}| j|v sLJ d| d	| j dS g d
}| j|v sbJ d| d	| j dS |r|g d}| j|v szJ d| d	| j dS g d}| j|v sJ d| d	| j dS t	d)z;Check FP4 KV cache compatibility with the attention backendr  Nr`   z$Attention: Using KV4 with PREFILL = z and DECODE = zF. Compatibility issues are unlikely, but may occur in rare edge cases.)r^   r6   rb   z:KV4 FA4 MLA expects decode_attention_backend to be one of r  )rZ   r[   r\   z:KV4 FA4 MHA expects decode_attention_backend to be one of )r^   r6   rb   ra   z/KV4 MLA expects attention_backend to be one of )rZ   r[   r\   rc   z/KV4 MHA expects attention_backend to be one of z(KV4 is not tested on non-CUDA platforms.)
r   r  r  prefill_attention_backend_strdecode_attention_backend_strr   rY  rZ  rU  RuntimeError)rQ  r  KV4_FA4_MLA_BACKEND_CHOICESKV4_FA4_MHA_BACKEND_CHOICES!KV4_ATTENTION_MLA_BACKEND_CHOICES!KV4_ATTENTION_MHA_BACKEND_CHOICESr3   r3   r   r9    sj   




z$ServerArgs._handle_kv4_compatibilityc                 C     | j d u r
d| _ d S d S )Nr   )r   r[  r3   r3   r   r:       

zServerArgs._handle_page_sizec                 C  s   t  rd| _d S d S )NrS  )r!   r  r[  r3   r3   r   r;    s   
z ServerArgs._handle_amd_specificsc                 C  r  )Nrp   )rY  r[  r3   r3   r   r<    r  z"ServerArgs._handle_grammar_backendc              	   C  sP   | j dkr&t r"zdd l}td W d S  ttfy!   tdw tdd S )Nr6   r   z-Successfully imported FlashInfer mamba modulezLFlashInfer mamba module not available, please check flashinfer installation.)r`  r    flashinfer.mambarY  r
  ImportErrorAttributeErrorrX  )rQ  r6   r3   r3   r   r8    s   
z ServerArgs._handle_mamba_backendc                 C  s   | j dkr)| j| j  dksJ d| j| j| j   dks J d| jdks)J d| jdkrb| j| j dks:J d| j| j | jksGJ d| jdksPJ d| jdkrd| j| j | jksfJ dd S d S d S )	Nr   r   z)tp_size must be divisible by attn_cp_sizez3tp_size must be divisible by dp_size * attn_cp_sizez,PP is not supported with context parallelismz(tp_size must be divisible by moe_dp_sizez;ep_size * moe_dp_size must be less than or equal to tp_sizez.ep_size * moe_dp_size must be equal to tp_size)rB  r   r@  r   rC  r|  r[  r3   r3   r   r?  #  s.   



z&ServerArgs._handle_context_parallelismc                 C  sz   | j dkrd| _d| _| jr/| jd | _| j| j  dksJ | j| j  | _td| j d | jr9| js;J dd S d S )Nr   Fg333333?r   zADP attention is enabled. The chunked prefill size is adjusted to z to avoid MoE kernel issues. z;Please enable dp attention when setting enable_dp_lm_head. )r@  r  r  r   r   r   rY  rZ  r[  r3   r3   r   r>  =  s"   
z#ServerArgs._handle_data_parallelismc                 C  s   | j dkr| jdvrtd| jd d| _| jdkr6| j dv s*J d| j  d	| jd
| jfv s6J d| jdkrQ| j dv sIJ d| j  dd| _td tdrftd | j dv scJ dd| _| jdkry| j dv r{| jd
ks}J dd S d S d S )NrG   )r7   r   zCmxfp8 quantization forces --moe-runner-backend=cutlass. Overriding r  r   r   )rN   rM   NzInvalid quantization 'z]'. 
FlashInfer Cutlass MOE supports only: 'modelopt_fp4', 'modelopt_fp8', or bfloat16 (None).r   zJThe expert parallel size must be 1 or the same as the tensor parallel sizer   )rN   rF   rM   rV   Nzy'. 
FlashInfer TRTLLM MOE supports only: 'modelopt_fp4', 'fp8', 'modelopt_fp8', 'compressed-tensors', or bfloat16 (None).TzWFlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set.SGLANG_CUTLASS_MOEzzSGLANG_CUTLASS_MOE is deprecated, use --moe-runner-backend=cutlass and/or --speculative-moe-runner-backend=cutlass instead)rF   rG   z9cutlass MoE is only supported with fp8/mxfp8 quantizationz9FP8/MXFP8 Cutlass MoE is only supported with ep_size == 1)r   r~  rY  rZ  r|  r   r  r   r[  r3   r3   r   r@  O  sP   



z$ServerArgs._handle_moe_kernel_configc                 C  sd  | j dkr | jdkrtd d| _| j| _td| j d | j dkr3| j| _td| j d | j d	krF| j| _td
| j d | j dkr| j| _td| j d d| _td | jdkrktd tj	
dd u r~tjd td | jdv sJ d| j dkr| j| _d| _td td| j d | jtddksJ dd S d S )Nr   normalz3Cuda graph is disabled because deepep_mode=`normal`TzgDeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[z].rj   ziMooncake MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[r   zpAscend fused EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[r6   zoFlashinfer MoE A2A is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[zTFlashinfer MoE A2A is enabled. --disable-shared-experts-fusion is automatically set.r7   z/--deepep-mode is ignored for Flashinfer MoE A2ASGLANG_MOE_NVFP4_DISPATCHz?SGLANG_MOE_NVFP4_DISPATCH is set to True for Flashinfer MoE A2A)r   zOFlashinfer MoE A2A is only supported with flashinfer_cutlass moe runner backendrm   z)auto set deepep_mode=`normal` for MORI EPzeMoRI MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[,SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANKr  zkSGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK (default 4096) must be larger or equal to chunked_prefill_size)r}  r  rY  rZ  r  r   r|  r  rm  environrb  r   r  r  r~  r   r   r[  r3   r3   r   rA    sf   









zServerArgs._handle_a2a_moec                 C  s\   | j r| jd u rd| _td | j s| jdkr | jd u r d| _| j r*| jdks,J d S d S )NstatzLEPLB is enabled. The expert_distribution_recorder_mode is automatically set.r  staticr   )r  r  rY  rZ  r  r  r|  r[  r3   r3   r   rB    s   
z$ServerArgs._handle_eplb_and_dispatchc                 C  sV   | j d ur'| jr| jdkrd| _| jdksJ d| j dkr)| | j| _d S d S d S )Nr7   elasticity_awarezMElastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware'.rj   )r  r  r  _validate_ib_devicesr  r[  r3   r3   r   rD    s   



zServerArgs._handle_elastic_epc                 C  sT   | j r| jd u rd| _| jd u r&| j }d ur|| _d S | jd ur(d| _d S d S d S )Nr  r  )r  r  r  r  re  r3   r3   r   rC    s   




z.ServerArgs._handle_expert_distribution_metricsc                 C  s"   | j dkrd| _td d S d S )Nr   Tz;Pipeline parallelism is incompatible with overlap schedule.)r   r  rY  rZ  r[  r3   r3   r   rE    s   
z'ServerArgs._handle_pipeline_parallelismc                 C  s   | j dkr| jdkrd| _td | js| jrO| jdkrO| jd ur%| jn| j}|dkrO| jd u rG|  s>t	 r:dnd| _nt
 rCdnd| _nd| _td | jd	krz| j d
kr|| jdkrbd| _ n| jdkrjd| _ td| j  d| j d d S d S d S )Npage_first_directr  directz;Kernel io backend does not support page first direct layoutr_   r6   rZ   zFlashAttention3 decode backend is not compatible with hierarchical cache. Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes.rj   r  
page_firstzKMooncake storage backend does not support layer_first layout, switching to z layout for z io backend)r  r  rY  rZ  r  r  rV  rU  r  r    r'   r  )rQ  effective_decode_backendr3   r3   r   r=    sR   








zServerArgs._handle_hicachec                 C  s  | j d ur| jd u rd| _ddlm} | jd u r$| jdkrdn| j| _n|| j r/J d| jdkr7d| _| jd	v r| jd
krI| jrIt	d| j
d u rVd| _
td | jd	v rwtj rwd| _td | jd urv| jdkrvt	dnd| _td | jrd| _td |  jjd }|dv r| j d u r| j| _ | j| _n	|dvrtd | jd u r| jd u r| jd u sJ t| \| _| _| _| jdks| jdks| jdkr| jdkrt	d| jdkr| j| jd krtd | jd | _| jdkr| jdkr| jdvrt	d| jdkrm| j dst	d| j
d u r-d| _
td d| _d| _| j!| _| jd u rA| j"| _td  | jdkre| jdkre| jd!kret	d"| j d#| j d$| jrot	d%d S d S )&Nmainr   )MoeRunnerBackendr   r7   zcCurrently speculative MoE runner backend cannot be flashinfer_trtllm for risk in some draft models.NEXTNr  )r  EAGLE3r}  r}  zHCurrently standalone speculative decoding does not support dp attention.r  zMax running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests.Fz[Spec v2 is enabled for eagle/eagle3 speculative decoding and overlap schedule is turned on.r   zBSpec v2 currently only supports topk = 1 for speculative decoding.TzOverlap scheduler is disabled when spec v2 is off or using unsupported speculative algorithm. You can set env SGLANG_ENABLE_SPEC_V2=True to enable the experimental overlap scheduler. zNMixed chunked prefill is disabled because of using eagle speculative decoding.)
DeepseekV32ForCausalLMr  r  r  r  BailingMoeForCausalLMBailingMoeV2ForCausalLMr  r  r  r  zCDeepSeek MTP does not require setting speculative_draft_model_path.rc   zCtrtllm_mha backend only supports topk = 1 for speculative decoding.zfspeculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1)r6   r_   zspeculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend.r~  r  z5Ngram speculative decoding only supports CUDA device.ziThe overlap scheduler and mixed chunked prefill are disabled because of using ngram speculative decoding.r6   zspeculative_eagle_topk(z) > 1 with page_size(z) > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend.zCCurrently ngram speculative decoding does not support dp attention.)#rb  rc  sglang.srt.layers.moe.utilsr!  rn  r~  is_flashinfer_trtllmra  r  rX  r   rY  rZ  r   r  rb  r  rf  r  r  r  r  r   r   re  rg  auto_choose_speculative_paramsrU  rV  rW  r   r   
startswithru  rs  )rQ  r!  r  r3   r3   r   rF  	  s   


















#z'ServerArgs._handle_speculative_decodingc                 C  s   | j dks
| j dkrt| jrd | _| _ t| jrd| _ | jd u r%g | _| j dkra| jd u s4| jd u r=t	d d| _ n$| j
d u rP| jdkrPt	d d| _ n|  sa| jdkrat	d	 d| _ | jrk|  | _d S d S )
Nr7   r=   rA   rB   zXFallback load_format to 'auto' due to incomplete remote instance weight loader settings.r  ziFallback load_format to 'auto' due to incomplete remote instance weight loader NCCL group ports settings.transfer_enginezQFallback load_format to 'auto' due to 'transfer_engine' backend is not supported.)r   r1   r   r   r%   r  r  r  rY  rZ  r  r  validate_transfer_enginer  r[  r3   r3   r   rG  	  s@   







zServerArgs._handle_load_formatc                 C  s   | j dkr!| jd u sJ d| jd u sJ dd| _td d S | j dkrQ| jd u r/| j| _| jd u r8| j| _| j| _	| 
| j| j | jsSd| _td d S d S d S )NrU  z<Cannot set --disaggregation-decode-tp for the decode engine.z<Cannot set --disaggregation-decode-dp for the decode engine.Tz3KV cache is forced as chunk cache for decode serverrk  zSCuda graph is disabled for prefill server when piecewise cuda graph is not enabled.)r  r  r  r  rY  rZ  r   r@  r   r  validate_disagg_tp_sizer  r  r[  r3   r3   r   rH  	  s.   



	z$ServerArgs._handle_pd_disaggregationc                 C  s   | j r
| js
td| jr| jrtd| jr | jdks td| jr.t| jdkr.td| jdkr8| jdv s=| jdkrF| 	| j
| _
d S d S )	Nz>--enable-prefix-mm-cache requires --encoder-only to be enabledz6Cannot set --encoder-only and --language-only togetherr  zKCannot set --encoder-only and --disaggregation-mode prefill/decode togetherr   z?requires at least one encoder urls to be set via --encoder-urlsrj   )rk  rU  )r%  r  rX  r  r  r  r  r	  r  r  r  r[  r3   r3   r   rI  
  s*   



z)ServerArgs._handle_encoder_disaggregation
device_strc                   s   |du rt d dS dd |dD }t|dkrtdt|tt|kr0td| d	}tj|s@t	d
| dtt
| t dkrTt	d|  fdd|D }t|dkrotd| dt  d|S )a  
        Validate IB devices before passing to mooncake.

        Args:
            device_str: Comma-separated IB device names (e.g., "mlx5_0,mlx5_1")

        Returns:
            Normalized comma-separated string of validated device names, or None if input is None.
        NzMNo IB devices specified for Mooncake backend, falling back to auto discovery.c                 S  s   g | ]
}|  r|  qS r3   )stripr  dr3   r3   r   r  8
  s    z3ServerArgs._validate_ib_devices.<locals>.<listcomp>,r   zNo valid IB devices specifiedz Duplicate IB devices specified: z/sys/class/infinibandz!InfiniBand sysfs path not found: z1. Please ensure InfiniBand drivers are installed.zNo IB devices found in c                   s   g | ]}| vr|qS r3   r3   r0  available_devicesr3   r   r  M
      zInvalid IB devices specified: z. Available devices: )rY  rZ  splitr  rX  r  rm  rn  isdirr	  listdirsortedjoin)rQ  r.  devicesib_sysfs_pathinvalid_devicesr3   r3  r   r  '
  s4   


zServerArgs._validate_ib_devicesc                 C  sv   | j r
| jr
td| jr7| jdkrtd| j d d| _| j r*td d| _ | jr9td d| _d S d S d S )NzCannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. Please choose one tokenizer batching approach.r   z_skip_tokenizer_init=True disables tokenizer workers; forcing tokenizer_worker_num=1 (requested z).zOskip_tokenizer_init=True ignores --enable-tokenizer-batch-encode; disabling it.FzPskip_tokenizer_init=True ignores --enable-dynamic-batch-tokenizer; disabling it.)r  r   rX  r   r   rY  rZ  r[  r3   r3   r   rJ  V
  s0   

z%ServerArgs._handle_tokenizer_batchingc                 C  sd   t j| jrdnd | jd urt j| j t j| jrdnd t j| j	r-d d S d d S )N10)
r   SGLANG_ENABLE_TORCH_COMPILEr  r  r  SGLANG_MAMBA_SSM_DTYPE"SGLANG_DISABLE_OUTLINES_DISK_CACHEr  %SGLANG_ENABLE_DETERMINISTIC_INFERENCEr  r[  r3   r3   r   rK  q
  s   
z(ServerArgs._handle_environment_variablesc                 C  sr   | j r
| jr
td| jr$| jdkrtd| jdkr$tj r$tdd| j  k r2dks7td tdd S )NzThe arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive and cannot be used at the same time. Please use only one of them.rU  z\The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side.zTSpec v2 and decode offload kv cache are incompatible and cannot be enabled together.r   r   z4--swa-full-tokens-ratio should be in range (0, 1.0].)	r  r  rX  r  r  r   r  rb  r   r[  r3   r3   r   rL  |
  s(   

z&ServerArgs._handle_cache_compatibilityc                 C  s  | j d urtd d| _dtjd< dtjd< | jrd| _td d	}t| jt	j
krFz|  j}|jd
 }|dv }W n	 tyE   Y nw | jd u rlt sQt r[|rWd| _nd| _nd| _td| j dt d n| jtvr}tdt d| j d|r| jdvrtdt d| j d| jtvrd| _td| j d | jdkrt rtd d S dtjd< d| _td d S d S d S )Nz>Enable deterministic inference because of rl_on_policy_target.Tr?  SGLANG_VLM_CACHE_SIZE_MBr>  rC  r5   z?Sampling backend is set to pytorch for deterministic inference.Fr   )DeepseekV2ForCausalLMr  r$  r  r  r  rZ   r6   r_   z2Attention backend not specified. Falling back to 'zT' for deterministic inference. You can explicitly set --attention-backend to one of r  zCurrently only z] attention backends are supported for deterministic inference, but you explicitly specified 'z'.)r_   rZ   ze attention backends are supported for deterministic inference with DeepSeek models. But you're using z-Currently radix cache is not compatible with zS attention backend for deterministic inference. It will be supported in the future.r   z9AMD/ROCm: Using 1-stage all-reduce kernel (deterministic)zallreduce:tree	NCCL_ALGOztNCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1.)r  rY  rZ  r  rm  r  rX  r-   r   r
   r  r  r  r  	ExceptionrU  r'   r(   r   rX  r   r  r   r!   r
  r  )rQ  is_deepseek_modelr  r  r3   r3   r   rM  
  s|   










@z*ServerArgs._handle_deterministic_inferencec                 C  s   | j d u rd S t r#| jstd d| _| jdvr"td d| _n| js3| jdkr3td d| _| js>td d| _| jsItd	 d| _| jd
ksVtd d
| _| j	ratd d| _	| j
dkrntd d| _
| jr{td d| _d S d S )Nz>Cuda graph is disabled for diffusion LLM inference on AMD GPUsT)rZ   re   zJAttention backend is set to triton for diffusion LLM inference on AMD GPUsrZ   r6   z`Attention backend is set to flashinfer because of enabling cuda graph in diffusion LLM inferencezEOverlap schedule is disabled because of using diffusion LLM inferencez@Radix cache is disabled because of using diffusion LLM inferencer   zIPipeline parallelism is disabled because of using diffusion LLM inferencez;Currently LoRA is not supported by diffusion LLM inference.Fr  zECurrently disaggregation is not supported by diffusion LLM inference.zKMixed chunked prefill is disabled because of using diffusion LLM inference.)r  r!   r  rY  rZ  rU  r  r  r   rI  r  r  r[  r3   r3   r   rN  
  sf   





z!ServerArgs._handle_dllm_inferencec                 C  s   | j d urtd d| _d| _| jrBt| jtr t	| j| _t| jt
rBh d}| j D ]}||vrAtd| dt| q/| jrTt| jtrVt	| j| _d S d S d S )NzKCuda graph and server warmup are disabled because of using tensor dump modeT>   audioimagevideozInvalid modality 'z9' in --limit-mm-data-per-request.Allowed modalities are: )r  rY  rZ  r  r   r(  
isinstancer   jsonloadsdictkeysrX  r  rH  )rQ  allowed_modalitiesmodalityr3   r3   r   rP  $  s8   

z$ServerArgs._handle_other_validationsc                 C  s,   t  r| jd u rtd d| _d S d S d S )Nz%Set soft_watchdog_timeout since in CIr   )r2   r  rY  r
  r[  r3   r3   r   rO  D  s   

zServerArgs._handle_debug_utilsparserargparse.ArgumentParserc              	   C  s  | j ddtddd | j dttjdd | j d	ttjd
dgdd | j dttjdd | j dddd | j dttjtdd | j dtdtj	d | j dddd | j dttj
dd | j dddd | j dtjdd d! | j d"td d#d | j d$ttjd%d | j d&ttjd'd | j d(ttjd)d | j d*ttjd+d | j d,dd-d | j d.dd/d | j d0td1d2d3 | j d4ttjd5d | j d6dd7d | j d8ttjg d9d:d | j d;ttjtd<d | j d=td d>d | j d?ttjg d@dAd | j dBddCd | j dDttjdEd | j dFttjdGd | j dHttjdId | j dJttjdKd | j dLdtjdMdN | j dOttjdPd | j dQttjdRd | j dSttjdTd | j dUttjdVd | j dWttj dXd | j dYttj!dZd | j d[ttj"d\d | j d]dtj#d^dN | j d_ttj$d`d | j dattj%g dbdcd | j dddtj&dedN | j dfdtj'dgdN | j dhdtj(didN | j djttj)dkd | j dlttj*dmd | j dnttj+dod | j dpt,dqd | j drttj-dsd | j dtddud | j dvtt.tj/dwdx | j dyddzd | j d{ttj0d|d | j d}td d~d | j dtdd dd | j dtdd dd | j dttj1dd | j ddttj2dd | j ddttj3dd | j ddttj4dd | j ddttj5dd | j dttj6dd | j dttj7dd | j dttj8dd | j dddd | j dttj9dd | j dttj:dd | j dddd | j dttj;dd | j dttj<dd | j dttj=dd | j dttj>dd | j dtddd dd | j dttj?dd | j dttj@dd | j dddd | j ddd | j dttjAdd | j dttjBdd | j dddd | j dttjCdg dd | j dttjDddgdd | j dtdtjEdd | j dtdtFtGdd | j dttjHdd | j dddd | j dddd | j dddd | j dttjIdd | j dtdtjJdd | j dtKjLtjMdd | j dtdtjNdd | j dtdtjOdd | j dtdtjPdd | j ddtjQddN d}| j dtdtjRd| d | j dtdtjSd| d | j dttjTdd | j dttjUdd | j ddtjVddN | j dtd dd | j dddd | j dtddd | j dddd | j dttjWdd | j dttjXdd | j dttjYdd | j dttjZdd | j dttj[dd | j dttj\dd | j d ttj]dd | j dttj^dd | j dttj_dd | j dddd | j dttFt`jab tjcd	tFt`jab  d
dx tFtdjeb }| j dt|tjfd| d
dx | j dtd dd | j dtddgtjgddx | j ddttjhdd | j dttjidg dd | j dt,dd | j ddtdd | j dttjjd d | j d!ttjkd"d | j d#td$tjld | j d%tKjLd&d | j d'tjmdd(d! | j d)tjndd*d! | j d+tjotd,d- | j d.ttptqg dd d/d0 | j d1tdd trd2d3 | j d4td5d6d | j d7ttjsd8d | j d9ttjtd:d;gd<d | j d=ttutjvd>dx | j d?ttjwg d@dAd | j dBttxtjydCdx | j dDttxtjzdEdx | j dFttxtj{dGdx | j dHtt|tj}dIdx | j dJtt~tjdKdx | j dLtg dMtjdNdx | j dOtjttdPdQ | j dRtjttdSdQ | j dTtttjdUdVdW | j dXtttjdYdZdW | j d[tjdd\d! | j d]tg d^d_d` | j dadbtdcd | j ddtd ded | j dfttjtdgd | j dhtditjd | j djtdktjd | j dltdmtjd | j dntdotjd | j dptdqtjd | j drtdstjd | j dttdudvgdwtjdx | j dytdztjd | j d{tttjd|dx | j d}tttjd~dx | j dtttjddx | j dttjdd | j dttjdd | j dttjdd | j dttjdd | j dtddgtjddx | j dttjdd | j dttjdd | j dddd | j dddttjdd | j dtttjddx | j dtttjddx | j dtddgtjddx | j dddd | j dtg dd
ddx | j dttjdd | j dttjdd | j dttjdd | j dddd | j dttjdd | j dttjdd | j dttjdd | j dttjdd | j dttjdd | j dttjdd | j dddd | j dttjdd | j dttjdd | j dttjddgdd | j dttjdd | j dttjdd | j dtd tdd | j dttjdd | j dtttjddx | j dttjdd | j dtttjddx | j dddd | j dttjdd | j dttjdd | j dtg dעtjddx | j dtg dڢtjddx | j dtg dݢtjddx | j dddd | j dtg dtjddx | j dtg dtjddx | j dttjdd | j dttjdd | j dddd | j dtdd | j dtddd | j dtdd | j dtddd | j dtdd | j dttjdd | j dttjdd | j dttjÐdd | j ddd d | j dttjĐdd | j dttjŐdd | j dttjƐdd | j dttjǐdd | j d	ttjȐd
d | j dttjɐdd | j dttjʐdd | j dttjːdd | j dttj̐dd | j dttj͐dd | j dttjΐdd | j dddd | j dttjϐdd | j dtddd | j dddd | j d dd!d | j d"dd#d | j d$dd%d | j d&dd'd | j d(dd)d | j d*dd+d | j d,dd-d | j d.dd/d | j d0dd1d | j d2dd3d | j d4dd5d | j d6dd7d | j d8dd9d | j d:dd;d | j d<dd=d | j d>dd?d | j d@ddAd | j dBddCd | j dDddEd | j dFttjАdGd | j dHddId | j dJddKd | j dLddMd | j dNtddOd | j dPttjѐdQdRdSgd | j dTttjҐdUd | j dVttjӐdWd | j dXttjԐdYd | j dZdd[d | j d\dd]d | j d^dd_d | j d`ttjՐdad | j dbttj֐dcd | j ddttjאded | j dfddgd | j dhddid | j djddkd | j dlddmd | j dnddod | j dpddqd | j drddsd | j dtddud | j dvddwd | j dxddyd | j dzdd{d | j d|dd}d | j d~ddd | j dttjؐdd | j dtddd | j dddd | j dttjtڐdd | j dddd | j dddd | j dttjtܐdd | j dddd | j dddd | j dddd | j dttjݐdd | j dttjސdd | j dttjߐdd | j dtddd | j dttjdd | j dttjdd | j dttjg ddd | j dttjtdd | j dttjdd | j dttjdd | j dttjdd | j dttjdd | j dttjdd | j dddd | j dttjdd | j dttjdd | j dddd | j dddd | j dttjtdd | j ddtg dd | j dtdd dd | j dddd | j dttjdd | j dttjdd | j dttjdd | j dtdɐdgtjddx | j dddd | j dddd | j dtd dd | j dttjdd | j dtdՐd | j dttjdd | j dttjdd | j ddtjddN | j dtKjLtjdd | j ddtjddN | j dtKjLtjdd | j dttjdd | j dttjdd | j ddtjddN | j dttjdd d S (  Nz--model-pathz--modelzTThe path of the model weights. This can be a local folder or a Hugging Face repo ID.T)typehelprequiredz--tokenizer-pathzThe path of the tokenizer.)rU  r3  rV  z--tokenizer-moder7   slowzoTokenizer mode. 'auto' will use the fast tokenizer if available, and 'slow' will always use the slow tokenizer.)rU  r3  r   rV  z--tokenizer-worker-numz(The worker num of the tokenizer manager.z--skip-tokenizer-init
store_truezCIf set, skip init tokenizer and pass input_ids in generate request.)actionrV  z--load-formata  The format of the model weights to load. "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available. "pt" will load the weights in the pytorch bin format. "safetensors" will load the weights in the safetensors format. "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading. "dummy" will initialize the weights with random values, which is mainly for profiling."gguf" will load the weights in the gguf format. "bitsandbytes" will load the weights using bitsandbytes quantization."layered" loads weights layer by layer so that one can quantize a layer before loading another to make the peak memory envelope smaller.z--model-loader-extra-configzoExtra config for model loader. This will be passed to the model loader corresponding to the chosen load_format.)rU  rV  r3  z--trust-remote-codezYWhether or not to allow for custom models defined on the Hub in their own modeling files.z--context-lengthzoThe model's maximum context length. Defaults to None (will use the value from the model's config.json instead).z--is-embeddingz0Whether to use a CausalLM as an embedding model.z--enable-multimodalzzEnable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen)r3  rZ  rV  z
--revisionzThe specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.z--model-implag  Which implementation of the model to use.

* "auto" will try to use the SGLang implementation if it exists and fall back to the Transformers implementation if no SGLang implementation is available.
* "sglang" will use the SGLang model implementation.
* "transformers" will use the Transformers model * "mindspore" will use the MindSpore model implementation.
z--hostzThe host of the HTTP server.z--portzThe port of the HTTP server.z--fastapi-root-pathz)App is behind a path based routing proxy.z--grpc-modez/If set, use gRPC server instead of HTTP server.z--skip-server-warmupzIf set, skip warmup.z	--warmupsFzSpecify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests)rU  rW  rV  z--nccl-portzKThe port for NCCL distributed environment setup. Defaults to a random port.z---checkpoint-engine-wait-weights-before-readyzIf set, the server will wait for initial weights to be loaded via checkpoint-engine or other update methods before serving inference requests.z--dtype)r7   halfr   r   r   r   ag  Data type for model weights and activations.

* "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
* "half" for FP16. Recommended for AWQ quantization.
* "float16" is the same as "half".
* "bfloat16" for a balance between precision and range.
* "float" is shorthand for FP32 precision.
* "float32" for FP32 precision.z--quantizationzThe quantization method.z--quantization-param-pathzPath to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. z--kv-cache-dtype)r7   r  r  r  r   r  zData type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+. "fp4_e2m1" (only mxfp4) is supported for CUDA 12.8+ and PyTorch 2.8.0+z--enable-fp32-lm-headz1If set, the LM head outputs (logits) are in FP32.z--modelopt-quantzThe ModelOpt quantization configuration. Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modeloptz"--modelopt-checkpoint-restore-pathzPath to restore a previously saved ModelOpt quantized checkpoint. If provided, the quantization process will be skipped and the model will be loaded from this checkpoint.z--modelopt-checkpoint-save-pathzzPath to save the ModelOpt quantized checkpoint after quantization. This allows reusing the quantized model in future runs.z--modelopt-export-pathzPath to export the quantized model in HuggingFace format after ModelOpt quantization. The exported model can then be used directly with SGLang for inference. If not provided, the model will not be exported.z--quantize-and-servezQuantize the model with ModelOpt and immediately serve it without exporting. This is useful for development and prototyping. For production, it's recommended to use separate quantization and deployment steps.)rZ  r3  rV  z--rl-quant-profilezUPath to the FlashRL quantization profile. Required when using --load-format flash_rl.z--mem-fraction-staticzThe fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.z--max-running-requestsz'The maximum number of running requests.z--max-queued-requestsz]The maximum number of queued requests. This option is ignored when using disaggregation-mode.z--max-total-tokenszThe maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.z--chunked-prefill-sizeztThe maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill.z--prefill-max-requestszWThe maximum number of requests in a prefill batch. If not specified, there is no limit.z--enable-dynamic-chunkingzEnable dynamic chunk size adjustment for pipeline parallelism. When enabled, chunk sizes are dynamically calculated based on fitted function to maintain consistent execution time across chunks.z--max-prefill-tokenszThe maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.z--schedule-policy)lpmrk  r   z
dfs-weightlofpriorityzrouting-keyz&The scheduling policy of the requests.z--enable-priority-schedulingzlEnable priority scheduling. Requests with higher priority integer values will be scheduled first by default.z!--abort-on-priority-when-disabledzTIf set, abort requests that specify a priority when priority scheduling is disabled.z$--schedule-low-priority-values-firstz~If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.z*--priority-scheduling-preemption-thresholdz_Minimum difference in priorities for an incoming request to have to preempt running request(s).z--schedule-conservativenesszHow conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.z--page-sizezThe number of tokens in a page.z--hybrid-kvcache-ratioz[Note: --hybrid-kvcache-ratio is deprecated now. Please use --swa-full-tokens-ratio instead.z--swa-full-tokens-ratiozThe ratio of SWA layer KV tokens / full layer KV tokens, regardless of the number of swa:full layers. It should be between 0 and 1. E.g. 0.5 means if each swa layer has 50 tokens, then each full layer has 100 tokens.z--disable-hybrid-swa-memoryz#Disable the hybrid SWA memory pool.z--radix-eviction-policyzqThe eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.)rU  r   r3  rV  z--enable-prefill-delayerz<Enable prefill delayer for DP attention to reduce idle time.z"--prefill-delayer-max-delay-passesz(Maximum forward passes to delay prefill.z+--prefill-delayer-token-usage-low-watermarkz.Token usage low watermark for prefill delayer.z(--prefill-delayer-forward-passes-buckets+ziCustom buckets for prefill delayer forward passes histogram. 0 and max_delay_passes-1 will be auto-added.)rU  nargsr3  rV  z&--prefill-delayer-wait-seconds-bucketszPCustom buckets for prefill delayer wait seconds histogram. 0 will be auto-added.z--devicezdThe device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.z--tensor-parallel-sizez	--tp-sizezThe tensor parallelism size.z!--attention-context-parallel-sizez--attn-cp-sizez'The attention context parallelism size.z--moe-data-parallel-sizez--moe-dp-sizezThe moe data parallelism size.z--pipeline-parallel-sizez	--pp-sizezThe pipeline parallelism size.z--pp-max-micro-batch-sizez5The maximum micro batch size in pipeline parallelism.z--pp-async-batch-depthz.The async batch depth of pipeline parallelism.z--stream-intervalzThe interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higherz--stream-outputz5Whether to output as a sequence of disjoint segments.z--random-seedzThe random seed.z%--constrained-json-whitespace-patternz(outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [
	 ]*z)--constrained-json-disable-any-whitespacezb(xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output.z--watchdog-timeoutzuSet watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.z--soft-watchdog-timeoutzSet soft watchdog timeout in seconds. If a forward batch takes longer than this, the server will dump information for debugging.z--dist-timeoutz1Set timeout for torch.distributed initialization.z--download-dirz)Model download directory for huggingface.z--model-checksum?r   zModel file integrity verification. If provided without value, uses model-path as HF repo ID. Otherwise, provide checksums JSON file path or HuggingFace repo ID.)rU  r`  constr3  rV  z--base-gpu-idzjThe base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.z--gpu-id-stepzhThe delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...z--sleep-on-idlez%Reduce CPU usage when sglang is idle.z--custom-sigquit-handlerzRegister a custom sigquit handler so you can do additional cleanup after the server is shutdown. This is only available for Engine, not for CLI.)rV  z--log-levelz!The logging level of all loggers.z--log-level-httpzKThe logging level of HTTP server. If not set, reuse --log-level by default.z--log-requestsz_Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-levelz--log-requests-levelz0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.)r   r   r  r|  )rU  r3  rV  r   z--log-requests-formatr  rM  zJFormat for request logging: 'text' (human-readable) or 'json' (structured)z--log-requests-targetzTarget(s) for request logging: 'stdout' and/or directory path(s) for file output. Can specify multiple targets, e.g., '--log-requests-target stdout /my/path'. z%--uvicorn-access-log-exclude-prefixes*zExclude uvicorn access logs whose request path starts with any of these prefixes. Defaults to empty (disabled). Example: --uvicorn-access-log-exclude-prefixes /metrics /healthz--crash-dump-folderzvFolder path to dump requests from the last 5 min before a crash (if any). If not specified, crash dumping is disabled.z--show-time-costzShow time cost of custom marks.z--enable-metricszEnable log prometheus metrics.z#--enable-metrics-for-all-schedulerszEnable --enable-metrics-for-all-schedulers when you want schedulers on all TP ranks (not just TP 0) to record request metrics separately. This is especially useful when dp_attention is enabled, as otherwise all metrics appear to come from TP 0.z(--tokenizer-metrics-custom-labels-headerzHSpecify the HTTP header for passing custom labels for tokenizer metrics.z)--tokenizer-metrics-allowed-custom-labelsa  The custom labels allowed for tokenizer metrics. The labels are specified via a dict in '--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': 'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.z--extra-metric-labelszNThe custom labels for metrics. e.g. '{"label1": "value1", "label2": "value2"}'z--bucket-time-to-first-tokenzBThe buckets of time to first token, specified as a list of floats.z--bucket-inter-token-latencyzBThe buckets of inter-token latency, specified as a list of floats.z--bucket-e2e-request-latencyzIThe buckets of end-to-end request latency, specified as a list of floats.z--collect-tokens-histogramz+Collect prompt/generation tokens histogram.aU  Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'custom <value1> <value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500').--prompt-tokens-bucketsz#The buckets rule of prompt tokens. --generation-tokens-bucketsz2The buckets rule for generation tokens histogram. z--gc-warning-threshold-secszqThe threshold for long GC warning. If a GC takes longer than this, a warning will be logged. Set to 0 to disable.z--decode-log-intervalz!The log interval of decode batch.z#--enable-request-time-stats-loggingz%Enable per request time stats loggingz--kv-events-configzmConfig in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.z--enable-tracezEnable opentelemetry tracez--otlp-traces-endpointr,  zUConfig opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>z--export-metrics-to-filezdExport performance metrics for each request to local file (e.g. for forwarding to external systems).z--export-metrics-to-file-dirziDirectory path for writing performance metrics files (required when --export-metrics-to-file is enabled).z	--api-keyzOSet API key of the server. It is also used in the OpenAI API compatible server.z--admin-api-keyzSet admin API key for sensitive management endpoints (e.g. /clear_hicache_storage_backend). When set, admin endpoints require this key and do NOT accept --api-key.z--served-model-namezPOverride the model name returned by the v1/models endpoint in OpenAI API server.z--weight-versionzQVersion identifier for the model weights. Defaults to 'default' if not specified.z--chat-templatezzThe buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.z--hf-chat-template-namezWhen the HuggingFace tokenizer has multiple chat templates (e.g., 'default', 'tool_use', 'rag'), specify which named template to use. If not set, the first available template is used.z--completion-templatezThe buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently.z--file-storage-pathz(The path of the file storage in backend.z--enable-cache-reportzVReturn number of cached tokens in usage.prompt_tokens_details for each openai request.z--reasoning-parserz@Specify the parser for reasoning models, supported parsers are: r  z--tool-call-parserzISpecify the parser for handling tool-call interactions. Options include: z--tool-serverzEither 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.z--sampling-defaultsopenair>  zWhere to get default sampling parameters. 'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). 'model' uses the model's generation_config.json to get the recommended sampling parameters if available. Default is 'model'.z--data-parallel-sizez	--dp-sizezThe data parallelism size.z--load-balance-methodz1The load balancing strategy for data parallelism.)r7   rW  rV  total_requeststotal_tokensz--prefill-round-robin-balancez6Note: --prefill-round-robin-balance is deprecated now.z--dist-init-addrz--nccl-init-addrzRThe host address for initializing distributed backend (e.g., `192.168.0.2:25000`).)rU  rV  z--nnodeszThe number of nodes.z--node-rankzThe node rank.z--json-model-override-argszQA dictionary in JSON string format used to override default model configurations.z--preferred-sampling-paramszIjson-formatted sampling settings that will be returned in /get_model_infoz--enable-lorazEnable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility.z--enable-lora-overlap-loadingzEnable asynchronous LoRA weight loading in order to overlap H2D transfers with GPU compute. This should be enabled if you find that your LoRA workloads are bottlenecked by adapter weight loading, for example when frequently loading large LoRA adapters.z--max-lora-rankzThe maximum rank of LoRA adapters. If not specified, it will be automatically inferred from the adapters provided in --lora-paths.)r3  rU  rV  z--lora-target-moduleszThe union set of all target modules where LoRA should be applied. If not specified, it will be automatically inferred from the adapters provided in --lora-paths. If 'all' is specified, all supported modules will be targeted.)rU  r   r`  r3  rV  z--lora-pathszThe list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {"lora_name":str,"lora_path":str,"pinned":bool})rU  r`  r3  rZ  rV  z--max-loras-per-batchrO  zJMaximum number of adapters for a running batch, include base-only request.z--max-loaded-loraszIf specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.z--lora-eviction-policyrv   fifozLoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.z--lora-backendz1Choose the kernel backend for multi-LoRA serving.z--max-lora-chunk-size)rS  r  r  r   zMaximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.z--attention-backendz(Choose the kernels for attention layers.z--prefill-attention-backendzYChoose the kernels for prefill attention layers (have priority over --attention-backend).z--decode-attention-backendzXChoose the kernels for decode attention layers (have priority over --attention-backend).z--sampling-backendz'Choose the kernels for sampling layers.z--grammar-backendz/Choose the backend for grammar-guided decoding.z--mm-attention-backend)sdpar_   r`   triton_attnascend_attn
aiter_attnz!Set multimodal attention backend.z--nsa-prefill-backendzYNSA prefill backend. If not specified, auto-detects based on hardware and kv_cache_dtype.)r3  rU  r   rV  z--nsa-decode-backendzXNSA decode backend. If not specified, auto-detects based on hardware and kv_cache_dtype.z--fp8-gemm-backendr[  a  Choose the runner backend for Blockwise FP8 GEMM operations. Options: 'auto' (default, auto-selects based on hardware), 'deep_gemm' (JIT-compiled; enabled by default on NVIDIA Hopper (SM90) and Blackwell (SM100) when DeepGEMM is installed), 'flashinfer_trtllm' (optimal for Blackwell and low-latency), 'flashinfer_deepgemm' (Hopper SM90 only; uses swapAB optimization for small M dimensions in decoding), 'cutlass' (optimal for Hopper/Blackwell GPUs and high-throughput), 'triton' (fallback, widely compatible), 'aiter' (ROCm only). NOTE: This replaces the deprecated environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.)rU  r   r3  destrV  z--fp4-gemm-backendr\  a  Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). NOTE: This replaces the deprecated environment variable SGLANG_FLASHINFER_FP4_GEMM_BACKEND.z--disable-flashinfer-autotunezDisable FlashInfer autotuning.z--speculative-algorithm)r  r#  r"  r}  r~  zSpeculative algorithm.)rU  r   rV  z--speculative-draft-model-pathz--speculative-draft-modelzZThe path of the draft model weights. This can be a local folder or a Hugging Face repo ID.z"--speculative-draft-model-revisionzThe specific draft model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.z--speculative-draft-load-formatzThe format of the draft model weights to load. If not specified, will use the same format as --load-format. Use 'dummy' to initialize draft model weights with random values for profiling.z--speculative-num-stepszEThe number of steps sampled from draft model in Speculative Decoding.z--speculative-eagle-topkzFThe number of tokens sampled from the draft model in eagle2 each step.z--speculative-num-draft-tokenszJThe number of tokens sampled from the draft model in Speculative Decoding.z%--speculative-accept-threshold-singlez[Accept a draft token if its probability in the target model is greater than this threshold.z"--speculative-accept-threshold-acczmThe accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc).z--speculative-token-mapz0The path of the draft model's small vocab table.z--speculative-attention-moderk  rU  zAttention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.)rU  r   rV  r3  z%--speculative-draft-attention-backendz4Attention backend for speculative decoding drafting.z --speculative-moe-runner-backendz:Choose the runner backend for MoE in speculative decoding.z--speculative-moe-a2a-backendz6Choose the backend for MoE A2A in speculative decodingz&--speculative-draft-model-quantizationz.The quantization method for speculative model.z)--speculative-ngram-min-match-window-sizezKThe minimum window size for pattern matching in ngram speculative decoding.z)--speculative-ngram-max-match-window-sizezKThe maximum window size for pattern matching in ngram speculative decoding.z#--speculative-ngram-min-bfs-breadthzQThe minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.z#--speculative-ngram-max-bfs-breadthzQThe maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.z--speculative-ngram-match-typerv  PROBzThe match type for cache tree.z!--speculative-ngram-branch-lengthz1The branch length for ngram speculative decoding.z--speculative-ngram-capacityz2The cache capacity for ngram speculative decoding.z--enable-multi-layer-eaglez.Enable multi-layer Eagle speculative decoding.z--expert-parallel-sizez	--ep-sizez--epzThe expert parallelism size.z--moe-a2a-backendzChoose the backend for MoE A2A.z--moe-runner-backendz"Choose the runner backend for MoE.z --flashinfer-mxfp4-moe-precisionr3  r  z8Choose the computation precision of flashinfer mxfp4 moez$--enable-flashinfer-allreduce-fusionz9Enable FlashInfer allreduce fusion with Residual RMSNorm.z--deepep-mode)r  low_latencyr7   zSelect the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.z--ep-num-redundant-expertsz=Allocate this number of redundant experts in expert parallel.z--ep-dispatch-algorithmzGThe algorithm to choose ranks for redundant experts in expert parallel.z--init-expert-locationzInitial location of EP experts.z--enable-eplbzEnable EPLB algorithmz--eplb-algorithmzChosen EPLB algorithmz--eplb-rebalance-num-iterationsz@Number of iterations to automatically trigger a EPLB re-balance.z!--eplb-rebalance-layers-per-chunkz/Number of layers to rebalance per forward pass.z,--eplb-min-rebalancing-utilization-thresholdzkMinimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].z#--expert-distribution-recorder-modez%Mode of expert distribution recorder.z*--expert-distribution-recorder-buffer-sizezZCircular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.z$--enable-expert-distribution-metricsz.Enable logging metrics for expert balancednessz--deepep-configznTuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.z--moe-dense-tp-sizezTP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.z--elastic-ep-backendrs   rj   z[Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.z--mooncake-ib-devicezThe InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices (e.g., --mooncake-ib-device mlx5_0,mlx5_1). Default is None, which triggers automatic device detection when Mooncake Backend is enabled.z--max-mamba-cache-sizez$The maximum size of the mamba cache.z--mamba-ssm-dtypezmThe data type of the SSM states in mamba cache. If not set, will be read from model config (mamba_ssm_dtype).z--mamba-full-memory-ratioz8The ratio of mamba state memory to full kv cache memory.z--mamba-scheduler-strategyz*The strategy to use for mamba radix cache.z--mamba-track-intervalz4The interval to track the mamba state during decode.z--mamba-backendzChoose the kernel backend for Mamba SSM operations. Default is 'triton'. Options: 'triton' (default), 'flashinfer' (requires FlashInfer with Mamba support).z--enable-hierarchical-cachezEnable hierarchical cachez--hicache-ratiozNThe ratio of the size of host KV cache memory pool to the size of device pool.z--hicache-sizezaThe size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.z--hicache-write-policy)
write_backr  write_through_selectivez'The write policy of hierarchical cache.z--hicache-io-backend)r  r  kernel_ascendz8The IO backend for KV cache transfer between CPU and GPUz--hicache-mem-layout)r  r  r  page_first_kv_split	page_headz6The layout of host memory pool for hierarchical cache.z--disable-hicache-numa-detectzrDisable binding the process to the NUMA node closest to the active CUDA device when hierarchical cache is enabled.z--hicache-storage-backend)filerj   hf3fsrk   aibrixdynamiceica  The storage backend for hierarchical KV cache. Built-in backends: file, mooncake, hf3fs, nixl, aibrix. For dynamic backend, use --hicache-storage-backend-extra-config to specify: backend_name (custom name), module_path (Python module path), class_name (backend class name).z!--hicache-storage-prefetch-policy)r  wait_completetimeoutz>Control when prefetching from the storage backend should stop.z&--hicache-storage-backend-extra-configzA dictionary in JSON string format, or a string starting with a leading '@' and a config file in JSON/YAML/TOML format, containing extra configuration for the storage backend.z,--hierarchical-sparse-attention-extra-configaJ  A dictionary in JSON string format for hierarchical sparse attention configuration. Required fields: algorithm (str), backend (str). All other fields are algorithm-specific and passed to the algorithm constructor. Example: '{"algorithm": "quest", "backend": "flashattention", "sparsity_ratio": 0.7, "min_sparse_prompt_len": 2048}'z--enable-lmcachez;Using LMCache as an alternative hierarchical cache solutionz--kt-weight-pathzb[ktransformers parameter] The path of the quantized expert weights for amx kernel. A local folder.z--kt-methodAMXINT4zA[ktransformers parameter] Quantization formats for CPU execution.z--kt-cpuinferz9[ktransformers parameter] The number of CPUInfer threads.z--kt-threadpool-countr  z^[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA).z--kt-num-gpu-expertsz4[ktransformers parameter] The number of GPU experts.z#--kt-max-deferred-experts-per-tokenz[ktransformers parameter] Maximum number of experts deferred to CPU per token. All MoE layers except the final one use this value; the final layer always uses 0.z--dllm-algorithmz3The diffusion LLM algorithm, such as LowConfidence.z--dllm-algorithm-configz@The diffusion LLM algorithm configurations. Must be a YAML file.z--enable-double-sparsityz Enable double sparsity attentionz--ds-channel-config-pathz.The path of the double sparsity channel configz--ds-heavy-channel-numz9The number of heavy channels in double sparsity attentionz--ds-heavy-token-numz7The number of heavy tokens in double sparsity attentionz--ds-heavy-channel-typez7The type of heavy channels in double sparsity attentionz--ds-sparse-decode-thresholdzThe minimum decode sequence length required before the double-sparsity backend switches from the dense fallback to the sparse decode kernel.z--cpu-offload-gbz2How many GBs of RAM to reserve for CPU offloading.z--offload-group-sizez)Number of layers per group in offloading.z--offload-num-in-groupz0Number of layers to be offloaded within a group.z--offload-prefetch-stepz Steps to prefetch in offloading.z--offload-modezMode of offloading.z--multi-item-scoring-delimiterzDelimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.z--disable-radix-cachez*Disable RadixAttention for prefix caching.z--cuda-graph-max-bszjSet the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.z--cuda-graph-bsz+Set the list of batch sizes for cuda graph.)rU  r`  rV  z--disable-cuda-graphzDisable cuda graph.z--disable-cuda-graph-paddingz\Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.z--enable-profile-cuda-graphz'Enable profiling of cuda graph capture.z--enable-cudagraph-gczEnable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.z--enable-layerwise-nvtx-markerz:Enable layerwise NVTX profiling annotations for the model.z--enable-nccl-nvlsz;Enable NCCL NVLS for prefill heavy requests when available.z--enable-symm-memz2Enable NCCL symmetric memory for fast collectives.z.--disable-flashinfer-cutlass-moe-fp4-allgatherz?Disables quantize before all-gather for flashinfer cutlass moe.z--enable-tokenizer-batch-encodezEnable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.z --disable-tokenizer-batch-decodez:Disable batch decoding when decoding multiple completions.z--disable-outlines-disk-cachezdDisable disk cache of outlines to avoid possible crashes related to file system or high concurrency.z--disable-custom-all-reducez;Disable the custom all-reduce kernel and fall back to NCCL.z--enable-mscclppzTEnable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.z--enable-torch-symm-memzEnable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA device SM90 and above. SM90 supports world size 4, 6, 8. SM100 supports world size 6, 8.z--disable-overlap-schedulezVDisable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.z--enable-mixed-chunkzIEnabling mixing prefill and decode in a batch when using chunked prefill.z--enable-dp-attentionzEnabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 and Qwen 2/3 MoE models are supported.z--enable-dp-lm-headzEnable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.z--enable-two-batch-overlapz&Enabling two micro batches to overlap.z--enable-single-batch-overlapzALet computation and communication overlap within one micro batch.z"--tbo-token-distribution-thresholdzThe threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap.z--enable-torch-compilez<Optimize the model with torch.compile. Experimental feature.z!--enable-torch-compile-debug-modez#Enable debug mode for torch compilez--enable-piecewise-cuda-graphz[Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature.z--piecewise-cuda-graph-tokensz?Set the list of token lengths for piecewise cuda graph capture.z--piecewise-cuda-graph-compilerzHSet the compiler for piecewise cuda graph. Choices are: eager, inductor.r  inductorz--torch-compile-max-bsz4Set the maximum batch size when using torch compile.z!--piecewise-cuda-graph-max-tokensz7Set the maximum tokens when using piecewise cuda graph.z--torchao-configzOptimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo, fp8dq-per_tensor, fp8dq-per_rowz--enable-nan-detectionz0Enable the NaN detection for debugging purposes.z--enable-p2p-checkzPEnable P2P check for GPU access, otherwise the p2p access is allowed by default.z!--triton-attention-reduce-in-fp32zCast the intermediate attention results to fp32 to avoid possible crashes related to fp16.This only affects Triton attention kernels.z --triton-attention-num-kv-splitszThe number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.z"--triton-attention-split-tile-sizez\The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.z--num-continuous-decode-stepszRun multiple continuous decoding steps to reduce scheduling overhead. This can potentially increase throughput but may also increase time-to-first-token latency. The default value is 1, meaning only run one decoding step at a time.z--delete-ckpt-after-loadingz4Delete the model checkpoint after loading the model.z--enable-memory-saverzPAllow saving memory using release_memory_occupation and resume_memory_occupationz--enable-weights-cpu-backupzSave model weights (both main model and draft model, if any) to CPU memory during release_weights_occupation and resume_weights_occupationz!--enable-draft-weights-cpu-backupzfSave draft model weights to CPU memory during release_weights_occupation and resume_weights_occupationz--allow-auto-truncatezkAllow automatically truncating requests that exceed the maximum input length instead of returning an error.z--enable-custom-logit-processorz]Enable users to pass custom logit processors to the server (disabled by default for security)z--flashinfer-mla-disable-raggedz<Not using ragged prefill wrapper when running flashinfer mlaz--disable-shared-experts-fusionz>Disable shared experts fusion optimization for deepseek v3/r1.z--disable-chunked-prefix-cachezbDisable chunked prefix cache feature for deepseek, which should save overhead for short sequences.z--disable-fast-image-processorz;Adopt base image processor instead of fast image processor.z--keep-mm-feature-on-devicezLKeep multimodal feature tensors on device after processing to save D2H copy.z--enable-return-hidden-statesz.Enable returning hidden states with responses.z--enable-return-routed-expertsz=Enable returning routed experts of each layer with responses.z--scheduler-recv-intervalz\The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.z--numa-nodezUSets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.z --enable-deterministic-inferencez=Enable deterministic inference mode with batch invariant ops.z--rl-on-policy-targetzBThe training system that SGLang needs to match for true on-policy.z --enable-attn-tp-input-scatteredzAllow input of attention to be scattered when only using tensor parallelism, to reduce the computational load of operations such as qkv latent.z%--enable-nsa-prefill-context-parallelzTEnable context parallelism used in the long sequence prefill phase of DeepSeek v3.2.z--nsa-prefill-cp-modea&  Token splitting mode for the prefill phase of DeepSeek v3.2 under context parallelism. Optional values: 'round-robin-split'(default), 'in-seq-split'  'round-robin-split' distributes tokens across ranks based on token_idx %% cp_size. It supports multi-batch prefill, fused MoE, and FP8 KV cache.z--enable-fused-qk-norm-ropez8Enable fused qk normalization and rope rotary embedding.z(--enable-precise-embedding-interpolationzEnable corner alignment for resize of embeddings grid to ensure more accurate(but slower) evaluation of interpolated embedding values.z --enable-dynamic-batch-tokenizerziEnable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.z$--dynamic-batch-tokenizer-batch-sizezf[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.z'--dynamic-batch-tokenizer-batch-timeoutzm[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.z!--debug-tensor-dump-output-folderz&The output folder for dumping tensors.z--debug-tensor-dump-layersz8The layer ids to dump. Dump all layers if not specified.z--debug-tensor-dump-input-filez&The input filename for dumping tensorsz--debug-tensor-dump-injectz8Inject the outputs from jax as the input of every layer.z--disaggregation-moderT  zOnly used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregatedz!--disaggregation-transfer-backendz=The backend for disaggregation transfer. Default is mooncake.z--disaggregation-bootstrap-portz=Bootstrap server port on the prefill server. Default is 8998.z--disaggregation-decode-tpzqDecode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.z--disaggregation-decode-dpzqDecode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.z--disaggregation-prefill-ppzWPrefill pp size. If not set, it is default to 1. This is only set on the decode server.z--disaggregation-ib-devicea&  The InfiniBand devices for disaggregation transfer, accepts single device (e.g., --disaggregation-ib-device mlx5_0) or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). Default is None, which triggers automatic device detection when mooncake backend is enabled.z.--disaggregation-decode-enable-offload-kvcachez<Enable async KV cache offloading on decode server (PD mode).z--num-reserved-decode-tokenszdNumber of decode tokens that will have memory reserved when adding new request to the running batch.z(--disaggregation-decode-polling-intervalz`The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.z--encoder-onlyz7For MLLM with an encoder, launch an encoder-only serverz--language-onlyz2For VLM, load weights for the language model only.z--encoder-transfer-backendzMThe backend for encoder disaggregation transfer. Default is zmq_to_scheduler.z--encoder-urlszList of encoder server urls.)r`  rU  r3  rV  z--custom-weight-loaderzThe custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_funcz--weight-loader-disable-mmapz4Disable mmap while loading weight using safetensors.z0--remote-instance-weight-loader-seed-instance-ipzEThe ip of the seed instance for loading weights from remote instance.z:--remote-instance-weight-loader-seed-instance-service-portzOThe service port of the seed instance for loading weights from remote instance.z8--remote-instance-weight-loader-send-weights-group-portszGThe communication group ports for loading weights from remote instance.z'--remote-instance-weight-loader-backendr+  r  zlThe backend for loading weights from remote instance. Can be 'transfer_engine' or 'nccl'. Default is 'nccl'.z>--remote-instance-weight-loader-start-seed-via-transfer-enginezPStart seed server via transfer engine backend for remote instance weight loader.z--enable-pdmuxz6Enable PD-Multiplexing, PD running on greenctx stream.z--pdmux-config-pathz,The path of the PD-Multiplexing config file.z--sm-group-numzNumber of sm partition groups.--configzTRead CLI options from a config file. Must be a YAML file with configuration options.z--mm-max-concurrent-callsz6The max concurrent calls for async mm data processing.z--mm-per-request-timeoutz4The timeout for each multi-modal request in seconds.z$--enable-broadcast-mm-inputs-processz0Enable broadcast mm-inputs process in scheduler.z--mm-process-configzWMultimodal preprocessing config, a json config contains keys: `image`, `video`, `audio`z--mm-enable-dp-encoderz_Enabling data parallelism for mm encoder. The dp size will be set to the tp size automatically.z--limit-mm-data-per-requestz^Limit the number of multimodal inputs per request. e.g. '{"image": 1, "video": 1, "audio": 1}'z--decrypted-config-filez&The path of the decrypted config file.z--decrypted-draft-config-filez,The path of the decrypted draft config file.z--enable-prefix-mm-cachez@Enable prefix multimodal cache. Currently only supports mm-only.z--forward-hookszBJSON-formatted forward hook specifications to attach to the model.)add_argumentr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r,   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   DeprecatedActionr   r   r   r   r   r   rB  rC  r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rM  rN  r  r   r!  r"  r#  r$  r%  r&  r(  r)  r/  r0  r1  r2  r4  r5  r6  r7  r9  r   DetectorMaprP  r;  r   ToolCallParserEnumr<  r?  r@  rA  rE  rF  rG  rI  rJ  rK  r   r   LoRAPathActionrN  rQ  LORA_BACKEND_CHOICESrR  rT  r   rU  rW  rV  SAMPLING_BACKEND_CHOICESrX  r   rY  rZ  r]  NSA_CHOICESr^  r   r[  r   r\  r_  rd  re  rf  rg  rh  ri  rj  rl  rm  r   rn  MOE_A2A_BACKEND_CHOICESro  ,SPECULATIVE_DRAFT_MODEL_QUANTIZATION_CHOICESrp  rq  rs  rt  ru  rw  ry  rz  r|  r}  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   MAMBA_SCHEDULER_STRATEGY_CHOICESr  r  MAMBA_BACKEND_CHOICESr`  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  NSA_PREFILL_CP_SPLIT_CHOICESr  r  r  r  r  r  r	  r   r
  r  r  r  r  r  r  r   ENCODER_TRANSFER_BACKEND_CHOICESr  r  r+   r  r  r!  r"  r#  r$  r'  r&  r(  r)  r*  r%  r+  )rS  bucket_ruletool_call_parser_choicesr3   r3   r   add_cli_argsI  sn  							


		
	






	
	
zServerArgs.add_cli_argsargsargparse.Namespacec                   s`    j  _ j _ j _ j _ j _	 j
 _dd t| D }| di  fdd|D S )Nc                 S  s   g | ]}|j qS r3   )namer  attrr3   r3   r   r    s    z,ServerArgs.from_cli_args.<locals>.<listcomp>c                   s   i | ]}|t  |qS r3   )r  r  r  r3   r   
<dictcomp>  r5  z,ServerArgs.from_cli_args.<locals>.<dictcomp>r3   )tensor_parallel_sizer   pipeline_parallel_sizer   attention_context_parallel_sizerB  moe_data_parallel_sizerC  data_parallel_sizer@  expert_parallel_sizer|  dataclassesfields)clsr  attrsr3   r  r   from_cli_args  s   zServerArgs.from_cli_argsc                 C  s2   t | jrd| j d| j S d| j d| j S )Nzhttp://[z]:zhttp://:)r*   r   r   r[  r3   r3   r   url  s   
zServerArgs.urlc                 C  s.   ddl m} t| dr| jS || | _| jS )Nr   )ModelConfigr  )r  r  hasattrr  from_server_args)rQ  r  r3   r3   r   r    s
   
zServerArgs.get_model_configc                 C  s,   | j r| j n| j}| jr| jn| j}||fS r   )rW  rU  rV  )rQ  r  r  r3   r3   r   r    s   z!ServerArgs.get_attention_backendsc                 C  s    ddl m} |  }|j|jkS )Nr   )AttentionArch)r  r  r  attention_archMLA)rQ  r  r  r3   r3   r   r    s   zServerArgs.use_mla_backendc                 C  s   | j d u o| jd u o| jd u S r   )rU  rW  rV  r[  r3   r3   r   r    s
   
z'ServerArgs.is_attention_backend_not_setc                 C  s
   | j dkS )Nr   )r  r[  r3   r3   r   r    s   
z$ServerArgs.enable_mamba_extra_bufferc                 C  s   t t| jS r   )r  r  r   r[  r3   r3   r   mamba_cache_chunk_size  s   z!ServerArgs.mamba_cache_chunk_sizec                 C  s~  | j | j | j dksJ d| jdkr#| jr| jd u r| jr#J d| jdkr4| jdkr4| js4J d| jdks=J d| j	dksFJ d| j
dv sOJ d	d
| jvs]J d| j d|   |   | jd urq| jrqJ d| jdkr| jdkr| j| j dksJ d| jr| jdksJ d| jdksJ d| jdksJ d| jsJ ddd l}tdkrtd|j d | jdksJ d| d| j | d| j | jr| jdv sJ d| j d| jd ur| jsJ d| jdksJ d | jdksJ d!| j d"krt! sJ d#| j"s| j#rt$d$| j%r-| j&d u r-t$d%| j'r;| j(d&kr=t$d'd S d S )(Nr   z,tp_size must be divisible by number of nodesr   zjPipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill.z>multi-node data parallel is not supported unless dp attention!z base_gpu_id must be non-negativezgpu_id_step must be positive>   Nr   z3moe_dense_tp_size only support 1 and None currentlyr  zserved_model_name cannot contain a colon (':') character. The colon is reserved for the 'model:adapter' syntax used in LoRA adapter specification. Invalid value: ''z7enable_mixed_chunk is required for speculative decodingrU  z3chunked_prefill_size must be divisible by page_sizezQPD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1).r  z7PD-Multiplexing is not compatible with chunked prefill.r  z;PD-Multiplexing is not compatible with disaggregation mode.z8PD-Multiplexing is not compatible with overlap schedule.)r     zxWARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.
  Current torch version is z(.
  Please manually install torch 2.6.x.zTokenizer worker num must >= 1rd  re  )r   r]  zFTo use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. 'z' is not supported.zMulti-item scoring requires radix cache to be disabled. Please set --disable-radix-cache when using --multi-item-scoring-delimiter.zMulti-item scoring requires chunked prefill to be disabled. Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter.z.schedule_conservativeness must be non-negative	mindsporez5MindSpore model impl is only supported on Ascend npu.zkPlease set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels.zQ--export-metrics-to-file-dir is required when --export-metrics-to-file is enabledrs   zBWhen enabling two batch overlap, moe_a2a_backend cannot be 'none'.))r   r   rE  r  ra  r  r@  r  r  r  r  r2  check_lora_server_args%check_torch_2_9_1_cudnn_compatibilityr   r  r   r  r  r.   rY  rZ  __version__r   validate_buckets_ruler$  r%  r   r   r  r  r   r   r$   r  r  rX  r.  r/  r  r}  )rQ  r  r3   r3   r   check_server_args
  s   




zServerArgs.check_server_argsc                 C  s   t drd S |  jrndd l}td d dkrpd }z|jj }W n ty,   d }Y nw |d ur^t	t
|d d d }|dk r\d}d}d	}| | d
|j d|dd| }t|d S d}d	}t| d|  d S d S d S )NSGLANG_DISABLE_CUDNN_CHECKr   r|  )r  r  r   d   gL"@z[91mz[1m[0mzCRITICAL WARNING: PyTorch 2.9.1 & CuDNN Compatibility Issue Detected
--------------------------------------------------------------------------------
Current Environment: PyTorch z	 | CuDNN z.2fac  

Issue:     There is a KNOWN BUG in PyTorch 2.9.1's `nn.Conv3d` implementation
           when used with CuDNN versions older than 9.15. This can cause
           SEVERE PERFORMANCE DEGRADATION and EXCESSIVE MEMORY USAGE.

Reference: https://github.com/pytorch/pytorch/issues/168167

Solution:  You MUST upgrade CuDNN to version 9.15+ to ensure correctness.

Run the following command immediately to fix:
    pip install nvidia-cudnn-cu12==9.16.0.29

Or you can disable this check by setting env var SGLANG_DISABLE_CUDNN_CHECK=1
--------------------------------------------------------------------------------
zqWARNING: Could not determine CuDNN version for torch==2.9.1. Please ensure CuDNN >= 9.15 to avoid nn.Conv3d bugs.)r   r  r  r  r.   backendscudnnversionrG  r   r   r  r	  rY  rZ  )rQ  r  cudnn_versionversion_floatREDBOLDRESETmsgr3   r3   r   r    sD   

z0ServerArgs.check_torch_2_9_1_cudnn_compatibilityc                 C  s  | j dks	J d| jr$| jd u rd| _td n
| jdu r$td | jr}| jd u r0d| _| jrI| j d }| jd urB| j|ksIJ d| | jd	vrRtd
t	| jt
r| j}g | _|D ]W}t	|trd|v r{|dd\}}t||dd}n6t||dd}n.t	|trd|v rd|v sJ d| t|d |d |ddd}n
tdt| d| j| q`n&t	| jtrdd | j D | _n| jd u rg | _ntdt| j d| jrt| j| _d| jv rt| jdksJ dtt| _| jdkrtd | jd | jd | js&| jr"| js&J d| jd urV| j| j ks?J d| j d | j  t| j| jksVJ d!| j d"t| j | jd urd#| j  krid$kryn J d%| j| jd @ dksJ d%d S d S d S )&Nr   z$max_loras_per_batch must be positiveTz>--enable-lora is set to True because --lora-paths is provided.FzG--enable-lora is set to False, any provided lora_paths will be ignored.r  zEnabling LoRA overlap loading requires pinning LoRA adapter weights in CPU memory, so --max-loaded-loras must be less than or equal to double --max-loras-per-batch: )r~  NzBCurrently LoRA is only compatible with NGRAM speculative decoding.=r   	lora_name	lora_pathpinnedr  r  zmWhen providing LoRA paths as a list of dict, each dict should contain 'lora_name' and 'lora_path' keys. Got: r  z,Invalid type for item in --lora-paths list: z$. Expected a string or a dictionary.c                 S  s   g | ]\}}t ||d dqS )Fr  r   )r  kvr3   r3   r   r    s    z5ServerArgs.check_lora_server_args.<locals>.<listcomp>zInvalid type for --lora-paths: z". Expected a list or a dictionary.allzWIf 'all' is specified in --lora-target-modules, it should be the only module specified.ri   zLoRA backend 'csgmv' does not yet support embedding or lm_head layers; dropping 'embed_tokens' and 'lm_head' from --lora-target-modules=all. To apply LoRA to these, use --lora-backend triton.embed_tokenslm_headzWhen no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization.zZmax_loaded_loras should be greater than or equal to max_loras_per_batch. max_loaded_loras=z, max_loras_per_batch=zNThe number of LoRA paths should not exceed max_loaded_loras. max_loaded_loras=z, lora_paths=rS  r   z>--max-lora-chunk-size must be a power of 2 between 16 and 128.)rP  rM  rI  rY  rZ  rJ  rN  ra  rX  rL  r  r   r6  r   rO  rb  rU  appenditemsrL  r  r  r   rR  discardrK  rT  )rQ  max_loaded_loras_limitrM  r  r  rn  lora_refr3   r3   r   r    s   











dz!ServerArgs.check_lora_server_args
prefill_tp	decode_tpc                 C  s8   t ||}t||}|| dksJ d| d| d S )Nr   zTDifferent tp size is supported only when one tp is multiple of the other. decode_tp=z, prefill_tp=)r  r  )rQ  r  r  	larger_tp
smaller_tpr3   r3   r   r-  3  s   

z"ServerArgs.validate_disagg_tp_sizearg_namebuckets_rulec              	   C  s  |sd S t |dksJ | d|d }|dv s$J d| d| d|dkrt |dks9J | d	t | zt|d
 }t|d }t|d }W n ttfy^   	J | dw |d
kslJ | d| |dksyJ | d| |dksJ | d| d S |dkrt |d
ksJ | dt | d S |dkrt |dksJ | dzdd |d
d  D }W n ty   J | dw t t|t |ksJ | dtdd |D sJ | dd S d S )Nr   z cannot be empty list)tser3  customzUnsupported z rule type: 'z-'. Must be one of: 'tse', 'default', 'custom'r  rv  zK TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got r   r  r|  FzP TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]z& TSE base must be larger than 1, got: z" TSE count must be positive, got: z# TSE middle must be positive, got: r3  z? default rule should only have one parameter: ['default'], got r  zH custom rule requires at least one bucket value: ['custom', value1, ...]c                 S  s   g | ]}t |qS r3   )r   )r  rf  r3   r3   r   r  a  s    z4ServerArgs.validate_buckets_rule.<locals>.<listcomp>z* custom rule bucket values must be numericz8 custom rule bucket values should not contain duplicatesc                 s  s    | ]}|d kV  qdS )r   Nr3   )r  valr3   r3   r   r  g  s    
z3ServerArgs.validate_buckets_rule.<locals>.<genexpr>z1 custom rule bucket values should be non-negative)r  r   r   rX  
IndexErrorr  r  )rQ  r  r  rulemiddlebasecountbucket_valuesr3   r3   r   r  ;  sb   

z ServerArgs.validate_buckets_rulec                 C  s   t |jdd }|d u rd S | j}d}t |dd}t |dd}d}d}||d  }	||d  }
|
dkr6|	|
 nd	}d
}d	||d	   }tdtd|}|| }|| | _d S )Nvision_configgffffff?num_hidden_layersrw  hidden_sizer{  r  r   r   g?r   g?)r  r  r   r  r  )rQ  r  r   original_server_arg_mem_fraction!base_mem_fraction_reduction_ratiovit_num_layersvit_hidden_sizebaseline_vit_layersbaseline_vit_hidden_sizecurrent_complexity_scorebaseline_complexity_scorecomplexity_ratiosensitivity_scaledynamic_adjustment_factorfinal_overall_factorr3   r3   r   r  k  s,   z&ServerArgs.adjust_mem_fraction_for_vlmc                 C  s6   t jdd u rtd dS | jrtd dS dS )Nzmooncake.enginezqFailed to import mooncake.engine. Does not support using TransferEngine as remote instance weight loader backend.FzMemory saver is enabled, which is not compatible with TransferEngine. Does not support using TransferEngine as remote instance weight loader backend.T)	importlibutil	find_specrY  rZ  r  r[  r3   r3   r   r,    s   z#ServerArgs.validate_transfer_enginec                 C  s&   | j rdS | jdkr| jdkrdS dS )NTrB   r+  F)r  r   r  r[  r3   r3   r   1remote_instance_weight_loader_use_transfer_engine  s   

z<ServerArgs.remote_instance_weight_loader_use_transfer_engine)r  r   r  r   )r   r   r  r   r  r   )TTN)r  r   r  r   r  r   r  r   )r.  r   r  r   )rS  rT  )r  r  )r  r   )r  r   )r  r   r  r   )r  r   r  r  (  __name__
__module____qualname____doc____annotations__r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r  r  r  r  r  r  r  fieldr  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r(  r)  r*  r+  r-  r.  r/  r0  r1  r2  r4  r5  r6  r7  r9  r:  r;  r<  r=  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  rM  rN  rP  rQ  rR  rT  rU  rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  rf  rg  rh  ri  rj  rl  rm  rn  ro  rp  rq  rs  rt  ru  rw  ry  rz  r{  r|  r}  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  r+  rS  r,  r.  r/  r0  r1  r2  r3  r4  r  r  r  r  r5  r  r6  r7  r9  r:  r;  r<  r8  r?  r>  r@  rA  rB  rD  rC  rE  r=  rF  rG  rH  rI  r  rJ  rK  rL  rM  rN  rP  rO  staticmethodr  classmethodr  r  r  r  r  r  r  propertyr  r  r  r  r-  r  r  r,  r  r3   r3   r3   r   r     sX  
 





`	) )    C MO254 -,/[7                  	 +y0&r   zOptional[ServerArgs]_global_server_argsserver_argsc                 C  s   | a d S r   )r  )r  r3   r3   r   $set_global_server_args_for_scheduler  s   r  r  c                   C  s   t d u rtdt S )Nz"Global server args is not set yet!)r  rX  r3   r3   r3   r   get_global_server_args  s   r  argvr  c                 C  sL   t  }t| d| v rddlm} ||}|| } || }t|S )a#  
    Prepare the server arguments from the command line arguments.

    Args:
        args: The command line arguments. Typically, it should be `sys.argv[1:]`
            to ensure compatibility with `parse_args` when no arguments are passed.

    Returns:
        The server arguments.
    r  r   )ConfigArgumentMerger)	argparseArgumentParserr   r  $sglang.srt.server_args_config_parserr  merge_config_with_args
parse_argsr  )r  rS  r  config_mergerraw_argsr3   r3   r   prepare_server_args  s   



r        c                   @  sZ   e Zd ZU ded< ded< ded< ded< ded< ded< d	ed
< e		ddddZdS )PortArgsr   tokenizer_ipc_namescheduler_input_ipc_namedetokenizer_ipc_namer   r   rpc_ipc_namemetrics_ipc_namer   tokenizer_worker_ipc_nameNr  r   dp_rankr   worker_portsr  r  c                 C  sZ  | j d u r	t }n| j }| jdkrd }n
dtjddj }| jsTtdtjddj dtjddj dtjddj |dtjddj dtjddj |dS | jdkrf| j	d u rfd| j
t f}n| j	drzt| j	\}}|t|f}n| j	d}t|d	ksJ d
|\}}	t|	}	|	d }
|
d }|
d	 }|
d }|d u r|
d }n
|d usJ || }z1|d u rt|	d t|
d t|d t|d t|d t|d |d u s|d u rt|d W n ty   td|	d|
d|d|d|
  w td| d|
 d| d| d| d| |d| d| d| d| |dS )Nr   zipc://F)delete)r  r  r  r   r  r  r  r   [r  r  z9please provide --dist-init-addr as host:port of head noder|  rv  dist_init_port	port_basedetokenizer_portr   rpc_portr  scheduler_input_portz'Port is already in use. dist_init_port=z port_base=z detokenizer_port=z nccl_port=z scheduler_input_port=ztcp://)r   r   r   tempfileNamedTemporaryFiler  r  r
  rE  rD  r   ZMQ_TCP_PORT_DELTAr*  r   r   r6  r  r   r/   rX  rY  	exception)r  r  r  r   r  rD  port_numr   dist_init_hostr  r  r  r  r  r  r3   r3   r   init_new  s|   









 zPortArgs.init_new)NN)r  r   r  r   r  r  r  r
  )r  r  r  r  r  r   r3   r3   r3   r   r
    s   
 r
  c                   @  s   e Zd ZdddZdS )r  Nc                 C  s   g }|rAt |tsJ d|D ]1}| }|dr;|dr;t|}d|v r,d|v s5J t| d|| q|| qt	|| j
| d S )NzExpected a list of LoRA paths.{}r  r  zQ looks like a JSON str, but it does not contain 'lora_name' and 'lora_path' keys.)rL  r  r/  r*  endswithrM  rN  reprr  setattrrn  )rQ  rS  	namespacevaluesoption_stringrM  r  objr3   r3   r   __call__J  s   
zLoRAPathAction.__call__r   )r  r  r  r*  r3   r3   r3   r   r  I  s    r  messager   c                 C  s   t d|  d d S )Nz[1;33mr  )rY  rZ  )r+  r3   r3   r   print_deprecated_warning]  s   r,  c                      s(   e Zd Zd fdd	ZdddZ  ZS )	r  r   c                   s"   t t| j||fd|i| d S )Nr`  )superr  __init__)rQ  option_stringsrn  r`  kwargs	__class__r3   r   r.  b  s   

zDeprecatedAction.__init__Nc                 C  s   t d| d d S )NzThe command line argument 'z7' is deprecated and will be removed in future versions.)r,  )rQ  rS  r&  r'  r(  r3   r3   r   r*  g  s   
zDeprecatedAction.__call__)r   r   )r  r  r  r.  r*  __classcell__r3   r3   r1  r   r  a  s    r  rQ  c                 C  sJ   |   j}|jd }| jdkrdS |dv rdS |dv rdS |dv r#dS dS )z
    Automatically choose the parameters for speculative decoding.

    You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
    r   r}  )r|  r   rv  )LlamaForCausalLM)   rv  rO  )r$  r  rE  r  r  r  r  r%  r&  r  r  r  r  )Grok1ForCausalLMGrok1VForCausalLM)r  r  r  ra  )rQ  r  archr3   r3   r   r)  m  s   


r)  )r  r   )r  r   )r  r  r  r   )r+  r   )rQ  r   )~r  
__future__r   r   r  r  importlib.utilrM  loggingrm  rk  r  typingr   r   r   r   r   r   r	   sglang.srt.connectorr
   sglang.srt.environr   -sglang.srt.function_call.function_call_parserr   -sglang.srt.layers.attention.fla.chunk_delta_hr   r  sglang.srt.lora.lora_registryr   "sglang.srt.parser.reasoning_parserr   sglang.srt.utils.commonr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   &sglang.srt.utils.hf_transformers_utilsr1   sglang.utilsr2   	getLoggerr  rY  r  r  r   r   r  r   r  r   r  r   r   r   r  DEFAULT_LORA_EVICTION_POLICYr  r   r   r   r  r   r   r   r  r  r   r   r   r   r   r   r   r   r   r   r   r   r   	dataclassr   r  r  r  $set_global_server_args_for_tokenizerr  r  r  !DP_ATTENTION_HANDSHAKE_PORT_DELTAr
  Actionr  r,  r  r)  r3   r3   r3   r   <module>   s   $"

	
                                         D


i
