o
    
۾ia                 	   @   s0  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d dl&m'Z' d d
l(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZBmCZCmDZDmEZEmFZFmGZG d dlHmIZI d dlJmKZKmLZLmMZMmNZNmOZOmPZP d dlQmRZRmSZS d dlTmUZU d dlVmWZWmXZX d dlYmZZZ d dl[m\Z\ d dl]m^Z^ d dl_m`Z`maZa d dlbmcZcmdZd d dlemfZf d dlgmhZhmiZi d dljmkZkmlZl d dlmmnZn d dlompZp d dlqmrZr d dlsmtZt d dlumvZv d dlwmxZx d dlymzZz d d l{m|Z| d d!l}m~Z~ erod d"lmZ d d#lmZ d d$lmZ d d%lmZ neZeZeZeZe`eZed&Zee eB Zeed'< ee eB Zeed(< d)eegef d*eegef fd+d,Zd)eegef d*eegedB f fd-d.Zd/ed*eeeef B dB fd0d1Zd2ed3ed*e%e fd4d5Zd6ee d3ed*efd7d8Zd6ee d3ed*efd9d:Zd6ee d*eeef fd;d<Zd6ee d3ed*eeef fd=d>Zd2ed*efd?d@Zd2ed*ee fdAdBZedCdD ejD pAejd   ZdEpAedFZejdGdHdIe,d*eeeeef f fdJdKZdIe,d*eeeeef f fdLdMZe	G dNdO dOZe	G dPdQ dQeZdRefdSdTZdUed*efdVdWZdUed*efdXdYZdS )Z    N)Callable)MISSING	dataclassfieldsis_dataclass)permutations)	UnionType)
TYPE_CHECKING	AnnotatedAnyLiteral	TypeAliasTypeVarUnioncastget_args
get_origin)TypeAdapterValidationError)	FieldInfo)TypeIs)AttentionConfigCacheConfigCompilationConfig
ConfigTypeDeviceConfigECTransferConfig
EPLBConfigKernelConfigKVEventsConfigKVTransferConfig
LoadConfig
LoRAConfigModelConfigMultiModalConfigObservabilityConfigParallelConfigPoolerConfigProfilerConfigSchedulerConfigSpeculativeConfigStructuredOutputsConfig
VllmConfigWeightTransferConfigget_attr_docs)	BlockSize
CacheDTypeKVOffloadingBackendMambaCacheMode
MambaDTypePrefixCachingHashAlgo)Device)ConvertOptionHfOverridesLogprobsMode
ModelDTypeRunnerOptionTokenizerMode)MMCacheTypeMMEncoderTPMode)DetailedTraceModules)DistributedExecutorBackendExpertPlacementStrategy)SchedulerPolicy)	get_field)OptimizationLevel)init_loggersuppress_logging)CpuArchEnumcurrent_platformload_general_plugins)is_in_ray_actoris_ray_initialized)is_interleavedmaybe_override_with_speculators)is_gguf)get_model_path)is_cloud_storage)FlexibleArgumentParser)	GiB_bytes)get_ip)resolve_kv_cache_dtype_string)AttentionBackendEnum)LogitsProcessor)QuantizationMethods)LoadFormatsUsageContext)ExecutorTTypeHint	TypeHintTreturn_typereturnc                    s   dt dtf fdd}|S )Nvalr`   c              
      s>   z | W S  t y } ztd|  d  d|d }~ww )NzValue z cannot be converted to .)
ValueErrorargparseArgumentTypeError)ra   er_    I/home/ubuntu/.local/lib/python3.10/site-packages/vllm/engine/arg_utils.py_parse_type{   s   
zparse_type.<locals>._parse_typestrr\   )r_   rj   rh   rg   ri   
parse_typez   s   rm   c                    s   dt dtd B f fdd}|S )Nra   r`   c                    s    | dks| dkr
d S t  | S )N None)rm   ra   rg   rh   ri   _optional_type   s   z%optional_type.<locals>._optional_typerk   )r_   rq   rh   rg   ri   optional_type   s   rr   ra   c                 C   s"   t d| s
t| S ttj| S )Nz(?s)^\s*{.*}\s*$)rematchrl   rr   jsonloadsrp   rh   rh   ri   union_dict_and_str   s   rw   	type_hinttypec                 C   s   | |u p	t | |u S )z*Check if the type hint is a specific type.)r   )rx   ry   rh   rh   ri   is_type   s   rz   
type_hintsc                    s   t  fdd| D S )z0Check if the type hints contain a specific type.c                 3       | ]}t | V  qd S Nrz   ).0rx   ry   rh   ri   	<genexpr>       z contains_type.<locals>.<genexpr>)anyr{   ry   rh   r   ri   contains_type   s   r   c                    s   t  fdd| D dS )z*Get the specific type from the type hints.c                 3   s    | ]
}t | r|V  qd S r}   r~   r   thr   rh   ri   r      s    zget_type.<locals>.<genexpr>N)nextr   rh   r   ri   get_type   s   r   c                    st   t | t}t|}t|d  t fdd|D s)td| ddd |D  t| tr0dnd	}d
 |t|iS )zGet the `type` and `choices` from a `Literal` type hint in `type_hints`.

    If `type_hints` also contains `str`, we use `metavar` instead of `choices`.
    r   c                 3   r|   r}   )
isinstance)r   optionoption_typerh   ri   r      r   z$literal_to_kwargs.<locals>.<genexpr>z*All options must be of the same type. Got z with types c                 S   s   g | ]}t |qS rh   r   )r   crh   rh   ri   
<listcomp>   s    z%literal_to_kwargs.<locals>.<listcomp>metavarchoicesry   )	r   r   r   ry   allrc   r   rl   sorted)r{   rx   optionskwargrh   r   ri   literal_to_kwargs   s   
r   c                    s   t | |}t|}|d  t fdd|D s J d| dt tthv r8tt v s6J d  dt  |tusAt|v rEddS t	|dS )	Nr   c                 3   s     | ]}|t ur| u V  qd S r}   )Ellipsis)r   t	elem_typerh   ri   r      s    z'collection_to_kwargs.<locals>.<genexpr>z8All non-Ellipsis elements must be of the same type. Got rb   zTIf element can have multiple types, one must be 'str' (i.e. 'list[int | str]'). Got +)ry   nargs)
r   r   r   r   r   r   rl   tupler   len)r{   ry   rx   typesrh   r   ri   collection_to_kwargs   s$   

r   c                 C   s
   | j dkS )z*Check if the class is not a built-in type.builtins)
__module__)rx   rh   rh   ri   is_not_builtin   s   
r   c                 C   sj   t  }t| }t| }|tu r|t|d  |S |tthv r.|D ]	}|t| q"|S ||  |S )z6Extract type hints from Annotated or Union type hints.r   )	setr   r   r
   updateget_type_hintsr   r   add)rx   r{   originargsargrh   rh   ri   r      s   
r   c                 c   s    | ]}d |v V  qdS )--helpNrh   )r   r   rh   rh   ri   r          r   mkdocszmkdocs/__main__.py   )maxsizeclsc              	   C   s  t rt| ni }i }t| D ]}t|j}dd |D }t|d }|jturP|j}t|t	rO|j
d u r8|j}n!t  |
 }W d    n1 sJw   Y  n	|j
turY|
 }|j}||d }	|	dd}	||	d||< d}
|d ur|fdtd	tfd
d}||| d< || d  d|
 7  < nt|trtj|| d< nt|tr|| t| nt|tr|| t|t nt|tr|| t|t nt|tr|| t|t nt|tr#|dkrt|| d< || d  dtj 7  < n|dv rt || d< || d  dt j 7  < npt|| d< nit|t!r0t!|| d< n\t|t"rMt|tsFt#dd |D rMt$|| d< n?t|t"rjt%t&j'|| d< || d  d|
 7  < n"t|tszt#dd |D rt|| d< nt(d| d| dt)|| dtu r|| t|| d h td |v rt|tst*|| d || d< || dr|| d +d q|S )Nc                 s   s    | ]	}t |r|V  qd S r}   )r   r   rh   rh   ri   r      s    z"_compute_kwargs.<locals>.<genexpr>rn   %z%%)defaulthelpzFShould either be a valid JSON string or JSON keys passed individually.ra   r`   c              
   S   s:   zt || W S  ty } ztt||d }~ww r}   )r   validate_jsonr   rd   re   repr)ra   r   rf   rh   rh   ri   parse_dataclass  s   z(_compute_kwargs.<locals>.parse_dataclassry   r   z

actionmax_model_len)max_num_batched_tokenskv_cache_memory_bytesc                 s       | ]}t |V  qd S r}   r   r   rh   rh   ri   r   7  r   c                 s   r   r}   r   r   rh   rh   ri   r   =  s    
zUnsupported type z for argument rb   r   ro   ),
NEEDS_HELPr.   r   r   ry   r   r   r   r   r   default_factoryrE   namegetstripreplacerl   r   r   boolrd   BooleanOptionalActionr   r   r   r   r   listr   inthuman_readable_int_or_auto__doc__human_readable_intfloatdictr   rw   rm   ru   rv   rc   r   rr   append)r   cls_docskwargsfieldr{   	generatordataclass_clsr   r   r   json_tipr   rh   rh   ri   _compute_kwargs   s   














r   c                 C   s   t t| S )a{  Return argparse kwargs for the given Config dataclass.

    If `--help` or `mkdocs` are not present in the command line command, the
    attribute documentation will not be included in the help output.

    The heavy computation is cached via functools.lru_cache, and a deep copy
    is returned so callers can mutate the dictionary without affecting the
    cached version.
    )copydeepcopyr   )r   rh   rh   ri   
get_kwargsR  s   
r   c                	   @   s  e Zd ZU dZejZeed< ejZe	ed< ej
Z
eed< ejZeee B dB ed< ejZedB ed< ejZedB ed< ejZeed	< ejZeed
< ejZe	ed< ejZe	ed< ejZeeB ed< ejZe	ed< ejZeed< ejZee dB ed< ejZedB ed< ejZeed< ejZeeB ed< ejZeed< ej Z e!ed< e"j#Z$e%ed< ej&Z&e'ed< ej(Z(e'dB ed< e)j*Z*ee' dB ed< e+e)dZ,e'dB ed< e-j.Z.ee/B e0e1 B dB ed< e-j2Z2e'ed< e-j3Z3eed< e-j4Z4e'ed< e-j5Z5e'ed< e-j6Z6e'ed < e-j7Z7e'ed!< e-j8Z8e'ed"< e-j9Z9e'ed#< e-j:Z:e'ed$< e-j;Z;e'ed%< e-j<Z<e'ed&< dZ=e'dB ed'< dZ>e'dB ed(< dZ?e'dB ed)< dZ@edB ed*< dZAe'dB ed+< d,ZBe	ed-< d,ZCe	ed.< e-jDZDeed/< e-jEZEe	ed0< e-jFZFeed1< e-jGZGe	ed2< e-jHZHe'ed3< e-jIZIe'ed4< e-jJZJe'ed5< e-jKZKe	dB ed6< e+e-d7ZLeMed7< e-jNZNe	ed8< e-jOZOePed9< e-jQZQe'ed:< e-jRZRe'ed;< e-jSZSe'dB ed<< e"jTZTeUdB ed=< dZVe	dB ed>< e"jWZWeXed?< ejYZYe	ed@< ejZZZe	edA< e"j[Z[e\edB< e"j]Z]e\edC< e"j^Z^e\edD< e"j_Z_e'dB edE< dZ`e'dB edF< eajbZbe'edG< eajcZce'edH< eajdZde'edI< dZee'dB edJ< ejfZfe'edK< ejgZgehedL< d,Zie	edM< d,Zje	edN< ejkZkedB edO< ejlZledB edP< ejmZme	eB dB edQ< e+edRZneoedR< ejpZpedB edS< ejqZqerdB edT< ejsZse	edU< ejtZte	edV< e-juZue	edW< e+evdXZwexee'exee'f B f edY< evjyZye	edZ< evjzZze	ed[< e+evd\Z{exeexee|f f ed\< evj}Z}exee|f dB ed]< evj~Z~e\ed^< evjZedB ed_< evjZe'ed`< evjZe	eda< evjZeedb< evjZeeB dB edc< dZedB edd< evjZe	ede< evjZe\edf< d,Ze	edg< ejZe'edh< ejZe'edi< ejZexeef dB edj< ejZe	edk< ejZe'dB edl< ejZeej B dB edm< ejZe	edn< ejZe	edo< e-jZe	edp< e"jZe'dB edq< e+edrZexedr< e+edsZeee B eds< dZe	dB edt< eajZe	edu< eajZe	dB edv< e+edwZeedw< ejZeedx< dZedB edy< ejZedB edz< dZexee|f dB ed{< ejZedB ed|< ejZedB ed}< ejZee dB ed~< ejZe	ed< e+edZe\ed< ejZe	ed< ejZe	ed< ejZe	ed< ejZe	ed< ejZe	ed< eajZeed< eajZee0e B dB ed< ejZedB ed< e+edZe)ed< e+edZeed< e+edZeed< e+edZe	ed< e-jZeed< e-jZeed< e+edZeed< dZedB ed< dZedB ed< dZedB ed< ejZeed< ejZe	ed< e+edZexee|f ed< ejZeed< ejZeed< ejZedB ed< e"jZe	ed< e"jZeed< e"jZeed< e+e"dZe'dB ed< e"jZeed< e+edZexee|f ed< ejZe	ed< ejZeed< ejZeee0e B  dB ed< 	 eajZe	dB ed< eajZe'ed< e"jZe	ed< ejZeed< e"jZe\dB ed< e"jZeed< d,Ze	ed< e+edZedB ed< dd ZededefddZedejfddZdefddZdd ZdefddZdede-dedB fddZ		,ddedB de	defddZdefddÄZede'deexedB e'f exedB e'f f fddƄZdeddfddȄZdedB defddʄZdS )
EngineArgszArguments for vLLM engine.modelenable_return_routed_expertsmodel_weightsNserved_model_name	tokenizerhf_config_pathrunnerconvertskip_tokenizer_initenable_prompt_embedstokenizer_modetrust_remote_codeallowed_local_media_pathallowed_media_domainsdownload_dirsafetensors_load_strategyload_formatconfig_formatdtypekv_cache_dtypeseedr   cudagraph_capture_sizesmax_cudagraph_capture_sizedistributed_executor_backendpipeline_parallel_sizemaster_addrmaster_portnnodes	node_ranktensor_parallel_sizeprefill_context_parallel_sizedecode_context_parallel_sizedcp_kv_cache_interleave_sizecp_kv_cache_interleave_sizedata_parallel_sizedata_parallel_rankdata_parallel_start_rankdata_parallel_size_localdata_parallel_addressdata_parallel_rpc_portFdata_parallel_hybrid_lbdata_parallel_external_lbdata_parallel_backendenable_expert_parallelall2all_backend
enable_dboubatch_sizedbo_decode_token_thresholddbo_prefill_token_threshold#disable_nccl_for_dp_synchronizationeplb_configenable_eplbexpert_placement_strategy_api_process_count_api_process_rankmax_parallel_loading_workers
block_sizeenable_prefix_cachingprefix_caching_hash_algodisable_sliding_windowdisable_cascade_attn
swap_spacecpu_offload_gbgpu_memory_utilizationr   r   max_num_partial_prefillsmax_long_partial_prefillslong_prefill_token_thresholdmax_num_seqsmax_logprobslogprobs_modedisable_log_statsaggregate_engine_loggingrevisioncode_revisionhf_tokenhf_overridestokenizer_revisionquantizationallow_deprecated_quantizationenforce_eagerdisable_custom_all_reducelimit_per_promptlimit_mm_per_promptenable_mm_embedsinterleave_mm_stringsmedia_io_kwargsmm_processor_kwargsmm_processor_cache_gbmm_processor_cache_typemm_shm_cache_max_object_size_mbmm_encoder_onlymm_encoder_tp_modemm_encoder_attn_backendio_processor_pluginskip_mm_profilingvideo_pruning_rateenable_lora	max_lorasmax_lora_rankdefault_mm_lorasfully_sharded_lorasmax_cpu_loras
lora_dtypeenable_tower_connector_loraspecialize_active_loraray_workers_use_nsightnum_gpu_blocks_overridemodel_loader_extra_configignore_patternsenable_chunked_prefilldisable_chunked_mm_inputdisable_hybrid_kv_cache_managerstructured_outputs_configreasoning_parserreasoning_parser_pluginlogits_processor_patternspeculative_configshow_hidden_metrics_for_versionotlp_traces_endpointcollect_detailed_traceskv_cache_metricskv_cache_metrics_samplecudagraph_metricsenable_layerwise_nvtx_tracingenable_mfu_metrics enable_logging_iteration_detailsenable_mm_processor_statsscheduling_policyscheduler_clspooler_configcompilation_configattention_configkernel_configenable_flashinfer_autotune
worker_clsworker_extension_clsprofiler_configkv_transfer_configkv_events_configec_transfer_configgeneration_configenable_sleep_modeoverride_generation_config
model_imploverride_attention_dtypeattention_backendcalculate_kv_scalesmamba_cache_dtypemamba_ssm_cache_dtypemamba_block_sizemamba_cache_modeadditional_configuse_tqdm_on_loadpt_load_map_locationlogits_processorsasync_schedulingstream_intervalkv_sharing_fast_prefilloptimization_levelkv_offloading_sizekv_offloading_backendtokens_onlyweight_transfer_configc                 C   s*  t | jtrtdi | j| _t | jtrtdi | j| _t | jtr-tdi | j| _t | jtr<t	di | j| _t | j
trKtdi | j
| _
ddlm} |  tjjr| j}t| j| j| _|| jurptd|| j | jd ur| j}t| j| j| _|| jurtd|| j d S d S d S d S )Nr   rH   z@HF_HUB_OFFLINE is True, replace model_id [%s] to model_path [%s]zHHF_HUB_OFFLINE is True, replace tokenizer_id [%s] to tokenizer_path [%s]rh   )r   rZ  r   r   r[  r   r\  r   r
  r   rz  r-   vllm.pluginsrI   huggingface_hub	constantsHF_HUB_OFFLINEr   rO   r   loggerinfor   r$  )selfrI   model_idtokenizer_idrh   rh   ri   __post_init__S  sF   


	zEngineArgs.__post_init__parserr`   c                 C   sv  t t}| jdtjd}dtjdd v rdtjdd v s)|jdzi |d  |jd{i |d
  |jd|i |d  |jd}i |d  |jd~i |d  |j	di |d  |jdi |d  |jdi |d  |jdi |d  |j	di |d  |j	di |d  |jdi |d  |jdi |d   |j	!di |d"  |jdi |d$  |jdi |d'  |j	(di |d)  |jdi |d+  |j	,di |d-  |jdi |d/  |jdi |d1  |j	2di |d3  |j	4di |d5  |j	6di |d7  |j	8di |d9  |j	:di |d;  |jdi |d=  |jd>td?d@|dA dB |dA dC dD |jdi |dF  |jdi |dH  |j	Idi |dJ  |j	Kdi |dL  |j	Mdi |dN  |j	Odi |dP  |jdi |dR  |j	Sdi |dT  |j	Udi |dV  |j	Wdi |dX  t t}| jdYtjd}|jdi |d[  |jdi |d]  |j	^di |d_  |j	`di |da  |jdi |dc  |jdi |de  |j	fdi |dg  t t	}| jdht	jd}|j	idi |dj  t t
}| jdkt
jd}|j	ldi |dm  |j	ndi |do  t t}	| jdptjd}
|
j	qdi |	dr  |
j	s	tdi |	du  |
jdi |	dw  |
jdi |	dy  |
jdi |	d|  |
jdi |	d  |
j	di |	d  |
j		di |	d  |
j	di |	d  |
j	di |	d  |
j		di |	d  |
j	di |	d  |
jddtdd |
jddtdd |
jddtdd |
jddtdd |
jddtdd |
jddtddd |
j		di |	d  |
j		di |	d  |
j		di |	d  |
j	di |	d  |
jdi |	d  |
j	di |	d  |
j	di |	d  |
j	di |	d  |
j	di |	d  |
jdi |	d  |
jdi |	d  |
j	di |	d  |
j	di |	d  |
j	di |	d  |
j	Đdi |	d  |
jdi |	d  |
j	Ȑdi |	d  t t}| jdtjd}|jdi |d  |j	͐di |d  |j	ϐdi |d  |jdi |d  |jdi |d  |j	Րdi |d  |j	אdi i |d dBdi |j	ِdi |d  |jdi |d  |j	ݐdi |d  |j	ߐdi |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  t t}| jdtjd}|j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	di |d  |j	 di |d  |j	di |d  |j	di |d  |j	di |d  t t}| jdtjd}|jd	tjd
d |jdi |d  |jdi |d  |j	di |d  |j	di |d  |jdi |d  |j	di |d  |jdi |d  |j	di |d  t t}| jdtjd}|j	di |d  |j	di |d   |d! d" }d#d$| d%}||d! d&< |d! d"  d'd( tttd)d*D 7  < |j	+di |d!  |j	,di |d-  |j	.di |d/  |j	0di |d1  |j	2di |d3  |j	4di |d5  |j	6di |d7  t t}| jd8tjd}|j	9di i |d: dBdi |j	;di i |d< dBdi |j	=di |d>  |j	?di |d@  |j	Adi |dB  |j	Cdi |dD  |j	Edi i |dF dBdi |j	Gdi |dH  |j	Idi |dJ  |j	Kdi |dL  |j	Mdi |dN  |j	Od i |dP  t t}| jdQtjd}|j	Rdi |dS  |j	Tdi |dU  t t}| jdVtjd}|j	Wdi |dX  t t}| jdYtjd}ttj|dZ d[< |j	\di |dZ  |j	]di |d^  |jdi |d`  |j	adi |db  |j	ddi |de  |j	gd	i |dh  |jd
i |dj  |j	kdi |dl  |j	mdi |dn  |jdi |dp  |j	qdi |dr  |j	sdi |dt  | jdudvdwd | jdxdvdyd | S (  z%Shared CLI arguments for vLLM engine.r#   )titledescriptionserve   Nr   --modelr   --runnerr   	--convertr   --tokenizerr   --tokenizer-moder   --trust-remote-coder   --dtyper   --seedr   --hf-config-pathr   --allowed-local-media-pathr   --allowed-media-domainsr   
--revisionr   --code-revisionr!  --tokenizer-revisionr$  --max-model-lenr   --quantization-qr%  --allow-deprecated-quantizationr&  --enforce-eagerr'  --enable-return-routed-expertsr   --max-logprobsr  --logprobs-moder  --disable-sliding-windowr  --disable-cascade-attnr  --skip-tokenizer-initr   --enable-prompt-embedsr   --served-model-namer   --config-formatr   z
--hf-token?Tr"  r   r   )ry   r   constr   r   --hf-overridesr#  --pooler-configrY  --logits-processor-patternrK  --generation-configrd  --override-generation-configrf  --enable-sleep-modere  --model-implrg  --override-attention-dtyperh  --logits-processorsrr  --io-processor-pluginr5  r!   --load-formatr   --download-dirr   --safetensors-load-strategyr   --model-loader-extra-configrC  --ignore-patternsrD  --use-tqdm-on-loadrp  --pt-load-map-locationrq  r   --attention-backendbackendr+   --reasoning-parserrI  --reasoning-parser-pluginrJ  r&   --distributed-executor-backendr   --pipeline-parallel-size-ppr   --master-addrr   --master-portr   --nnodes-nr   --node-rank-rr   --tensor-parallel-size-tpr   --decode-context-parallel-size-dcpr   --dcp-kv-cache-interleave-sizer   --cp-kv-cache-interleave-sizer   --prefill-context-parallel-size-pcpr   --data-parallel-size-dpr   z--data-parallel-rankz-dpnzSData parallel rank of this instance. When set, enables external load balancer mode.)ry   r   z--data-parallel-start-rankz-dprz0Starting data parallel rank for secondary nodes.z--data-parallel-size-localz-dplz5Number of data parallel replicas to run on this node.z--data-parallel-addressz-dpaz+Address of data parallel cluster head-node.z--data-parallel-rpc-portz-dppz)Port for data parallel RPC communication.z--data-parallel-backendz-dpbmpz0Backend for data parallel, either "mp" or "ray".)ry   r   r   --data-parallel-hybrid-lb-dphr   --data-parallel-external-lb-dper  --enable-expert-parallel-epr  --all2all-backendr  --enable-dbor  --ubatch-sizer  --dbo-decode-token-thresholdr  --dbo-prefill-token-thresholdr  %--disable-nccl-for-dp-synchronizationr	  --enable-eplbr  --eplb-configr
  --expert-placement-strategyr  --max-parallel-loading-workersr  --ray-workers-use-nsightrA  --disable-custom-all-reducer(  --worker-clsr^  --worker-extension-clsr_  r   --block-sizer  --gpu-memory-utilizationr  --kv-cache-memory-bytesr   --swap-spacer  --kv-cache-dtypecache_dtype--num-gpu-blocks-overriderB  --enable-prefix-cachingr  --prefix-caching-hash-algor  --cpu-offload-gbr  --calculate-kv-scalesrj  --kv-sharing-fast-prefillru  --mamba-cache-dtyperk  --mamba-ssm-cache-dtyperl  --mamba-block-sizerm  --mamba-cache-modern  --kv-offloading-sizerw  --kv-offloading-backendrx  r$   --limit-mm-per-promptr)  --enable-mm-embedsr+  --media-io-kwargsr-  --mm-processor-kwargsr.  --mm-processor-cache-gbr/  --mm-processor-cache-typer0  !--mm-shm-cache-max-object-size-mbr1  --mm-encoder-onlyr2  --mm-encoder-tp-moder3  --mm-encoder-attn-backendr4  --interleave-mm-stringsr,  --skip-mm-profilingr6  --video-pruning-rater7  r"   z--enable-loraz*If True, enable handling of LoRA adapters.)r   r   --max-lorasr9  --max-lora-rankr:  --lora-dtyper>  --enable-tower-connector-lorar?  --max-cpu-lorasr=  --fully-sharded-lorasr<  --default-mm-lorasr;  --specialize-active-lorar@  r%   !--show-hidden-metrics-for-versionrM  --otlp-traces-endpointrN  rO  r   {,}r   c                 S   s   g | ]}d  |qS )r  )join)r   prh   rh   ri   r   ,  s    
z+EngineArgs.add_cli_args.<locals>.<listcomp>   )r--collect-detailed-traces--kv-cache-metricsrP  --kv-cache-metrics-samplerQ  --cudagraph-metricsrR  --enable-layerwise-nvtx-tracingrS  --enable-mfu-metricsrT  "--enable-logging-iteration-detailsrU  r)   --max-num-batched-tokensr   --max-num-seqsr  --max-num-partial-prefillsr  --max-long-partial-prefillsr  --long-prefill-token-thresholdr  --scheduling-policypolicy--enable-chunked-prefillrE  --disable-chunked-mm-inputrF  --scheduler-clsrX  !--disable-hybrid-kv-cache-managerrG  --async-schedulingrs  --stream-intervalrt  r   --cudagraph-capture-sizesr   --max-cudagraph-capture-sizer   r   --enable-flashinfer-autotuner]  r,   rL  ry   --speculative-config--kv-transfer-configra  --kv-events-configrb  --ec-transfer-configrc  --compilation-config-ccrZ  --attention-config-acr[  --kernel-configr\  --additional-configro  --structured-outputs-configrH  --profiler-configr`  --optimization-levelrv  --weight-transfer-configrz  z--disable-log-stats
store_truezDisable logging statistics.z--aggregate-engine-loggingzLLog aggregate rather than per-engine statistics when using data parallelism.)r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  r  )r  )r  )r  r  )r  r  )r  r  )r  r  )r  )r  )r  r  )r  r  )r  r  )r  r  )r  r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r   )r  )r  )r  )r  )r  )r  )r  )r  )r	  )r
  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r  )r   )r!  )r#  )r$  )r%  )r&  )r'  )r(  )r)  )r*  )r+  )r,  )r-  )r.  )r/  )r0  r1  )r2  r3  )r4  )r5  )r6  )r7  )r8  )r9  )r   r#   add_argument_groupr   sysargvadd_argumentrl   r!   r   r+   r&   r   r   r$   r"   rd   r   r%   r  r   r   r>   r)   r   r   r,   rr   ru   rv   )r  model_kwargsmodel_groupload_kwargs
load_groupattention_kwargsattention_groupstructured_outputs_kwargsstructured_outputs_groupparallel_kwargsparallel_groupcache_kwargscache_groupmultimodal_kwargsmultimodal_grouplora_kwargs
lora_groupobservability_kwargsobservability_groupr   r   scheduler_kwargsscheduler_groupcompilation_kwargscompilation_groupkernel_kwargskernel_groupvllm_kwargs
vllm_grouprh   rh   ri   add_cli_args|  s  $


zEngineArgs.add_cli_argsr   c                    s4   dd t | D }| di  fdd|D }|S )Nc                 S   s   g | ]}|j qS rh   )r   r   attrrh   rh   ri   r     s    z,EngineArgs.from_cli_args.<locals>.<listcomp>c                    s"   i | ]}t  |r|t |qS rh   )hasattrgetattrrZ  r   rh   ri   
<dictcomp>  s   " z,EngineArgs.from_cli_args.<locals>.<dictcomp>rh   )dataclassesr   )r   r   attrsengine_argsrh   r^  ri   from_cli_args  s
   zEngineArgs.from_cli_argsc                 C   s  t | jrd | _| _tjstd| j t	d7i d| jd| j
d| jd| jd| jd| jd	| jd
| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| j d| j!d| j"d| j#d | j$d!| j%d"| j&d#| j'd$| j(d%| j)d&| j*d'| j+d(| j,d)| j-d*| j.d+| j/d,| j0d-| j1d.| j2d/| j3d0| j4d1| j5d2| j6d3| j7d4| j8d5| j9d6| j:S )8NggufzThe global random seed is set to %d. Since VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may affect the random state of the Python process that launched vLLM.r   r   r   r   r   r   r   r   r   r   r   r   r   r!  r"  r#  r$  r   r%  r&  r'  r   r  r  r  r  r   r   r   r*  r+  r,  r-  r6  r   r.  r/  r0  r1  r2  r3  r4  rY  rK  rd  rf  re  rg  rh  rr  r7  r5  rh   );rN   r   r%  r   envsVLLM_ENABLE_V1_MULTIPROCESSINGr  warningr   r#   r   r   r   r   r   r   r   r   r   r   r   r!  r"  r#  r$  r   r&  r'  r   r  r  r  r  r   r   r   r*  r+  r,  r-  r6  r   r.  r/  r0  r1  r2  r3  r4  rY  rK  rd  rf  re  rg  rh  rr  r7  r5  r  rh   rh   ri   create_model_config  s   
	
 !"#$%&'()*+,-./01234zEngineArgs.create_model_configc                 C   s:   ddl m} | jD ]}||jv r| j| | jd |< q	d S )Nr   )TensorizerConfigtensorizer_config)+vllm.model_executor.model_loader.tensorizerrj  rC  _fields)r  rj  keyrh   rh   ri   validate_tensorizer_args'  s   

z#EngineArgs.validate_tensorizer_argsc              	   C   sx   | j dkrd| _| jdkr*t| jdr| j | _i | jd< | j| jd d< |   t| j| j| j	| j| j
| j| jdS )Nbitsandbytes
tensorizerto_serializablerk  tensorizer_dir)r   r   r   rC  rD  rp  rq  )r%  r   r\  rC  rr  r   ro  r!   r   r   rD  rp  rq  rh  rh   rh   ri   create_load_config0  s&   


zEngineArgs.create_load_configtarget_model_configtarget_parallel_configc                 C   s0   | j du rdS | j ||d tdi | j S )a[  Initializes and returns a SpeculativeConfig object based on
        `speculative_config`.

        This function utilizes `speculative_config` to create a
        SpeculativeConfig object. The `speculative_config` can either be
        provided as a JSON string input via CLI arguments or directly as a
        dictionary from the engine.
        Nru  rv  rh   )rL  r   r*   )r  ru  rv  rh   rh   ri   create_speculative_configI  s   
z$EngineArgs.create_speculative_configusage_contextheadlessc                 C   s  t   tttt jd}t| js&t| j| j	| j
| j| jd\| _| _	| _|  }|j| _|j| _|j	| _	| | | | | || d}t|jsQ| }t| j|}tdi d| jd| jd| jd| jd|d	|jd
| jd|d| jd| jd| j d| j!d| j"d| j#d| j$d| j%d| j&d| j'd| j(}d}t) rddl*}	|	+ j,}|r|- ni }
d|
v rdd |
d D |
d< t./d|
 d}t0 rddl*}	|	j12 }|r| j3rJ d| j3r| j4rJ d| j5dks| j6dksJ d d}| j6dkrr| j7| j8 | j9 }| j8| j9 }|| j6 }|| j6 dks2J d!| d"| j6 d#| j:| j6k sFJ d$| j: d%| j6 d#| j:| | }| j7dkrd| j4rd|| _;t./d&| j;| j: n| j<du rrt=|| d| _<| j4pz| j;du}|r| j;dusJ d'| j<d(v sJ d)d}d*| _3n`| j<dur| j<}| j>r|sd+| _3| j3r|dkrt.?d, d+}d*| _3|| j7krd*| _3| j>p|| _;| j6dkrt./d-| j;| j: n| j3rJ d.| j5d/krt@jAd0krd}n| j7}| jBdu r%| j5d/krtC }t./d1| |}n| j5dksJ d2| j5f| jDp#tEjF}n| jB}| jGdur1| jGntEjG}| jHrD|jIsDd+|_It./d3 tEdi d4| j8d5| j9d6| jJd7| j7d8| j;p]dd9|d:|d;| jDd<| jKd=| j6d>| j:d?|d@|dA| j5dB| j3dC|jLdD| jMdE| jNdF| jOdG| jPdH| jQdI| jRdJ| jSdK| jTdL| jUdM| jVdN| jWdO| jXdP| jYdQ|dR|dS| jZdT| j[dU| j\dV| j]dW| j^dX| j_dY| j`dZ| ja}| jb||d[}tcdi d\|jdd]| jed^| jfd_|jgd`| jhda| jidb|jjdc|jkdd| jlde| jmdf| jndg| jodh| jpdi| jqdj| jrdk| js}|jjs:| jtr:tudl| jvr^tw| jx| jy| jt| jz| j{| j|| j}| j~rZ| j~dkrZ| j~nddmnd}|durz|durz|je|jf|jd  k rztudn|jdokrdo | _| _t| j}| jdur|jdurtudpt| jtrt| j  |_n| j|_t| j}| jdur|jdurtudq| j|_|  }| jr| j| j_| jr| j| j_t| j| j| j| j| j| j| j| j| j| jdr
}t| j}| jdur|jdurtuds| j|_| jdur&|jdur"tudt| j|_tdi du|dv|dw|dx|dy|dz|d{|d||d}|d~|d| jd|d|d| jd| jd| jd| jd| jd| jd| j}|S )zi
        Create the VllmConfig.

        NOTE: If VllmConfig is incompatible, we raise an error.
        )device)r   r   r   r   vllm_speculative_configNr  r  r   r  r  is_attention_freerB  sliding_windowr  r  r  rj  ru  rk  rl  rm  rn  rw  rx  r   env_varsc                 S   s   i | ]}|d qS )z***rh   )r   krh   rh   ri   r_    s    z3EngineArgs.create_engine_config.<locals>.<dictcomp>z-Using ray runtime env (env vars redacted): %sz:data_parallel_hybrid_lb is not applicable in headless modezJdata_parallel_hybrid_lb and data_parallel_external_lb cannot both be True.r  r  z:nnodes > 1 is only supported with data_parallel_backend=mpzworld_size=z must be divisible by nnodes=rb   z
node_rank=z must be less than nnodes=z@Inferred data_parallel_rank %d from node_rank %d for external lbzYdata_parallel_rank or node_rank must be specified if data_parallel_external_lb is enable.)r  NzIdata_parallel_size_local must be 1 or None when data_parallel_rank is setFTzsdata_parallel_hybrid_lb is not eligible when data_parallel_size_local = 1, autoswitch to data_parallel_external_lb.z0Inferred data_parallel_rank %d from node_rank %dzDdata_parallel_size_local must be set to use data_parallel_hybrid_lb.rayspanz3Using host IP %s as ray-based data parallel addressz3data_parallel_backend can only be ray or mp, got %sz7Skipping tokenizer initialization for tokens-only mode.r   r   r   r   r   r  r   r   r   r   r   data_parallel_master_ipr   r  r   is_moe_modelr  r  r  r  r  r  r	  r  r
  r  r  r(  rA  ray_runtime_envplacement_groupr   r^  r_  r   r   r   r  r  rw  runner_typer   r  r   rE  rF  is_multimodal_modelis_encoder_decoderr"  rX  r  r  r  rG  rs  rt  zJDefault modality-specific LoRA(s) were provided for a non multimodal model)r:  r9  r;  r<  r>  r?  r@  r=  zOConsider increasing max_num_batched_tokens or decreasing num_speculative_tokensrp  zEattention_backend and attention_config.backend are mutually exclusivez^enable_flashinfer_autotune and kernel_config.enable_flashinfer_autotune are mutually exclusive)
rM  rN  rO  rP  rQ  rR  rS  rT  rV  rU  z]cudagraph_capture_sizes and compilation_config.cudagraph_capture_sizes are mutually exclusivezcmax_cudagraph_capture_size and compilation_config.max_cudagraph_capture_size are mutually exclusivemodel_configcache_configparallel_configscheduler_configdevice_configload_configr[  r\  lora_configrL  rH  observability_configrZ  ra  rb  rc  r`  ro  rv  rz  rh   )rG   pre_register_and_updater   r   r5   device_typerP   r   rM   r   r   r   rL  ri  r   _check_feature_supported4_set_default_chunked_prefill_and_prefix_caching_args1_set_default_max_num_seqs_and_batched_tokens_argsrL   hf_text_configget_sliding_windowrT   r   r   r  r  r   r  r}  rB  r  r  r  rj  ru  rk  rl  rm  rn  rw  rx  rK   r  get_runtime_contextruntime_envto_dictr  r  rJ   utilget_current_placement_groupr   r  r  r   r   r   r   r   r   r   maxr   rg  re  VLLM_RAY_DP_PACK_STRATEGYr   rS   r   r&   r  r   ry  r   r   r   is_moer  r  r  r  r  r  r	  r  r
  r  r  r(  rA  r   r^  r_  r   r   r   r  r  rx  r)   r  r   r  r   rE  rF  r  r  rW  rX  r  r  r  rG  rs  rt  r;  rc   r8  r"   r:  r9  r<  r>  r?  r@  r=  num_speculative_tokensr%  r   r   r   r[  ri  r  r   rl   rU   upperr\  r]  rt  rI  rH  rJ  r%   rM  rN  rO  rP  rQ  rR  rS  rT  rV  rU  rZ  r   r   r,   ra  rb  rc  r`  ro  rv  rz  )r  ry  rz  r  r  r~  resolved_cache_dtyper  r  r  sanitized_envr  inferred_data_parallel_rank
world_sizeworld_size_within_dplocal_world_sizer  r   host_ipr   r   r  rL  r  r  r[  r\  r  r  rZ  configrh   rh   ri   create_engine_configd  sJ  





	









	
 !"#$%&'*	



	
zEngineArgs.create_engine_configr  c                 C   s   | j tj krtdd | jtjks| jtjkrtdd | jdkr=t| jdd}|s?| jt	jddd	fvrAd
}t|d dS dS dS dS )z/Raise an error if the feature is not supported.r  )feature_namezConcurrent Partial Prefillr  supports_ppFr  r  external_launcherzfPipeline Parallelism without Ray distributed executor or multiprocessing executor or external launcherN)
rK  r   _raise_unsupported_errorr  r)   r  r   r]  r   r&   )r  r  r  r   rh   rh   ri   r    s,   


z#EngineArgs._check_feature_supportedr  c                 C   s2  ddl m} zt }t  }W n ty   d}d}Y nw |dt kr;d|vr;|jd|j	di}|jd|j	di}n|jd|j	d	i}|jd
|j	d
i}t
 ryt }|dkr`|jd	|j	di}n|dkrm|jd|j	di}n|dkry|jd|j	d
i}t r|jd| |j	d	| i}|jd
| |j	d| i}||fS )Nr   rY   rn   F   a100i @  i       i      V6EV5Ei   V5Pi      )vllm.usage.usage_librZ   rG   get_device_total_memoryget_device_namelower	ExceptionrR   	LLM_CLASSOPENAI_API_SERVERis_tpuis_cpu)r   r  rZ   device_memorydevice_namedefault_max_num_batched_tokensdefault_max_num_seqs	chip_namerh   rh   ri   get_batch_defaults7  sV   



zEngineArgs.get_batch_defaultsc                 C   s  |j }|j}| jd u r|| _td|rdnd n#|jdkr+| js+|r+tjddd n|jdkr<| jr<|s<tjd	dd | jd u rO|| _td
|rKdnd n|jdkr`| jr`|s`tjddd t	 rt
 tjtjtjfv rtd d| _td d| _d S d S d S )Nz%s chunked prefill by defaultEnabling	DisablinggeneratezThis model does not officially support disabling chunked prefill. Disabling this manually may cause the engine to crash or produce incorrect outputs.local)scopepoolingzThis model does not officially support chunked prefill. Enabling this manually may cause the engine to crash or produce incorrect outputs.z%s prefix caching by defaultzThis model does not officially support prefix caching. Enabling this manually may cause the engine to crash or produce incorrect outputs.z_Chunked prefill is not supported for POWER, S390X and RISC-V CPUs; disabling it for V1 backend.Fz^Prefix caching is not supported for POWER, S390X and RISC-V CPUs; disabling it for V1 backend.)is_chunked_prefill_supportedis_prefix_caching_supportedrE  r  debugr  warning_oncer  rG   r  get_cpu_architecturerF   POWERPCS390XRISCVr  )r  r  default_chunked_prefilldefault_prefix_cachingrh   rh   ri   r    sn   






	
z?EngineArgs._set_default_chunked_prefill_and_prefix_caching_argsc                 C   s   | j | j }| |\}}| j}| j}| jd u r ||tj| _| jd u r-||tj| _|d u rT| j	s<t
|j| j| _t| j|j | j| _td| j|rQ|jnd  |d u rv| jd us_J t| j| j| _td| j|rq|jnd  d S d S )Nz=Defaulting max_num_batched_tokens to %d for %s usage context.z3Defaulting max_num_seqs to %d for %s usage context.)r   r   r  r   r  r   r)   DEFAULT_MAX_NUM_BATCHED_TOKENSDEFAULT_MAX_NUM_SEQSrE  r  r   minr  r  value)r  ry  r  r  r  r  orig_max_num_batched_tokensorig_max_num_seqsrh   rh   ri   r    sR   


z<EngineArgs._set_default_max_num_seqs_and_batched_tokens_args)NF)__name__r   __qualname__r   r#   r   rl   __annotations__r   r   r   r   r   r   r   r   r:   r   r6   r   r   r   r;   r   r   r   r!   r   r   r   rX   r   r   r9   r   r  r   r0   r   r   r   r   r   rB   r   r&   r   r?   ry   r[   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r   r  r  r@   r  r  r  r  r/   r  r  r4   r  r  r  r   r  r  r   r   r)   r  r  r  r  r  r  r8   r  r  r   r!  r"  r#  r7   r$  r%  rW   r&  r'  r(  r$   r*  r   r+  r,  r-  r   r.  r/  r0  r<   r1  r2  r3  r=   r4  rU   r5  r6  r7  r8  r"   r9  r:  r;  r<  r=  r>  torchr?  r@  rA  rB  rC  rD  rE  rF  rG  r,   rH  r+   rI  rJ  rK  rL  r%   rM  rN  rO  r>   rP  rQ  rR  rS  rT  rU  rV  r"  rW  rA   rX  objectrY  r'   rZ  r[  r   r\  r   r]  r^  r_  r`  r(   ra  r    rb  r   rc  r   rd  re  rf  rg  rh  r  ri  rj  rk  r3   rl  rm  rn  r2   ro  rp  rq  rr  rV   rs  rt  ru  rv  rC   rw  rx  r1   ry  rz  r-   r  staticmethodrQ   rY  classmethodrd   	Namespacerc  ri  ro  rt  r*   rx  rZ   r  r  r   r  r  r  rh   rh   rh   ri   r   _  s  
 

 


)    _	E	

   8"S
Kr   c                   @   s<   e Zd ZU dZdZeed< e	d
dededefddZ	d	S )AsyncEngineArgsz'Arguments for asynchronous vLLM engine.Fenable_log_requestsr  async_args_onlyr`   c                 C   sR   t   |s
t| } | jdtjtjdd | jdtjtj ddd t	|  | S )Nz--enable-log-requestszEnable logging requests.)r   r   r   z--disable-log-requestsz&[DEPRECATED] Disable logging requests.T)r   r   r   
deprecated)
rI   r   rY  r>  rd   r   r  r  rG   r  )r  r  rh   rh   ri   rY    s$   

zAsyncEngineArgs.add_cli_argsN)F)
r  r   r  r   r  r   r  r  rQ   rY  rh   rh   rh   ri   r    s   
 r  r  c                 C   s   |  d|  d}t |)Nz* is not supported. We recommend to remove z from your config.)NotImplementedError)r  msgrh   rh   ri   r  2  s
   r  r  c                 C   s   |   } td| }|r]ddddd}ddd	d
d}| \}}||v r0|| }tt|| S ||v r]|| }zt|| W S  ty\ } ztd| d| |	  d|d}~ww t| S )zParse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    z(\d+(?:\.\d+)?)([kKmMgGtT])i  i@B i ʚ;l    J))r  mgr   r  i   i   @l        )KMGr\   z3Decimals are not allowed with binary suffixes like z. Did you mean to use z	 instead?N)
r   rs   	fullmatchgroupsr   r   rc   rd   re   r  )r  rt   decimal_multiplierbinary_multipliernumbersuffixmultrf   rh   rh   ri   r   :  sH   	r   c                 C   s(   |   } | dks|  dkrdS t| S )aA  Parse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.
    Also accepts -1 or 'auto' as a special value for auto-detection.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    - '-1' or 'auto' -> -1 (special value for auto-detection)
    z-1auto)r   r  r   )r  rh   rh   ri   r   h  s   r   )rd   r   r`  	functoolsru   r<  collections.abcr   r   r   r   r   	itertoolsr   r   r   typingr	   r
   r   r   r   r   r   r   r   r   r|  regexrs   r  pydanticr   r   pydantic.fieldsr   typing_extensionsr   	vllm.envsre  vllm.configr   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   vllm.config.cacher/   r0   r1   r2   r3   r4   vllm.config.devicer5   vllm.config.modelr6   r7   r8   r9   r:   r;   vllm.config.multimodalr<   r=   vllm.config.observabilityr>   vllm.config.parallelr?   r@   vllm.config.schedulerrA   vllm.config.utilsrB   vllm.config.vllmrC   vllm.loggerrD   rE   vllm.platformsrF   rG   r{  rI   vllm.ray.lazy_utilsrJ   rK   vllm.transformers_utils.configrL   rM   "vllm.transformers_utils.gguf_utilsrN   "vllm.transformers_utils.repo_utilsrO   vllm.transformers_utils.utilsrP   vllm.utils.argparse_utilsrQ   vllm.utils.mem_constantsrR   vllm.utils.network_utilsrS   vllm.utils.torch_utilsrT   #vllm.v1.attention.backends.registryrU   vllm.v1.sample.logits_processorrV   'vllm.model_executor.layers.quantizationrW    vllm.model_executor.model_loaderrX   r  rZ   vllm.v1.executorr[   r  r  r\   ry   r  r]   r  r^   rl   rm   rr   r   rw   rz   r   r   r   r   r   r   r   r   r   r=  argv0endswithr   	lru_cacher   r   r   r  r  r   r   r   rh   rh   rh   ri   <module>   s   
0h  &*"	"
$"e             > .