o
    پi                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddlm
Z
 ddlmZ ddlmZmZ ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' e$e(Z)dede*e+ef fddZ,d(dede-dB defddZ.G dd de-eZ/G dd de-eZ0G dd de-eZ1ej2G dd dZ3ej2G dd dZ4da5d e6e- de3fd!d"Z7d#e3fd$d%Z8de3fd&d'Z9dS ))z,The arguments of sglang-diffusion Inference.    N)field)Enum)AnyOptional)envs)PipelineConfig)NunchakuSVDQuantArgs)NunchakuConfig)BYTES_PER_GB)AttentionBackendEnumcurrent_platform)is_port_availableis_valid_ipv6_address)configure_loggerinit_logger)FlexibleArgumentParserStoreBooleanobjreturnc                 C   s0   zddl }t| |j|fW S  ty   Y dS w )zWReturn (is_tensor, torch_module_or_None) without importing torch at module import time.r   N)FN)torch
isinstanceTensor	Exception)r   r    r   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/server_args.py_is_torch_tensor/   s   r   key_hintc              	   C   s  | du st | ttttfr| S t | tr| jS t| \}}|rzm|  	 }|dkrt
|jt|jd}zt|  |d< W n	 tyJ   Y nw zt|  |d< W n	 ty`   Y nw zt|   |d< W n	 tyx   Y nw ddi|W S dt
|jt|jd	W S  ty   Y d
S w t| ri }t| D ]'}|jsq|j}d|v rqzt| |}	W n	 ty   Y qw t|	|d||< q|S t | tri }
|  D ]#\}}zt|}W n ty   d}Y nw |dkrqt||d|
|< q|
S t | t
ttfrdd | D S z,t| st| r9t| dd}t| dt| dd}|r6| d| W S |W S W n
 tyD   Y nw zt| W S  tyU   Y dS w )a  Recursively convert objects to JSON-serializable forms for concise logging.

    Rules:
    - Drop any field/dict key named 'param_names_mapping'.
    - Render Enums using their value.
    - Render torch.Tensor as a compact summary; if key name is 'scaling_factor', include stats.
    - Dataclasses are expanded to dicts and sanitized recursively.
    - Callables/functions are rendered as their qualified name.
    - Fallback to str(...) for unknown types.
    Nscaling_factor)shapedtypeminmaxmeantensorT)r#   r   r   z<tensor>names_mappingr   z<key>param_names_mappingc                 S   s   g | ]}t |qS r   )_sanitize_for_logging).0xr   r   r   
<listcomp>   s    z)_sanitize_for_logging.<locals>.<listcomp>
__module__ __qualname____name__z
<callable>.z<unserializable>) r   strintfloatboolr   valuer   detachcpulistr   r   r    itemr   r!   r"   dataclassesis_dataclassfieldsreprnamegetattrr'   dictitemstuplesetinspect	isroutineisclass)r   r   	is_tensor	torch_modtenstatsresultfr=   r4   result_dictkvkey_strmoduleqnr   r   r   r'   9   s   



r'   c                   @   s@   e Zd ZdZdZededd fddZedee fddZ	d	S )
ExecutionModez
    Enumeration for different pipeline modes.

    Inherits from str to allow string comparison for backward compatibility.
    	inferencer4   r   c                 C   D   z| |  W S  ty!   td| dddd | D  dw )z%Convert string to ExecutionMode enum.zInvalid mode: . Must be one of: , c                 S      g | ]}|j qS r   r4   r(   mr   r   r   r*          z-ExecutionMode.from_string.<locals>.<listcomp>Nlower
ValueErrorjoinclsr4   r   r   r   from_string      zExecutionMode.from_stringc                 C      dd | D S )2Get all available choices as strings for argparse.c                 S   rW   r   rX   )r(   moder   r   r   r*      r[   z)ExecutionMode.choices.<locals>.<listcomp>r   ra   r   r   r   choices      zExecutionMode.choicesN)
r.   r+   r-   __doc__	INFERENCEclassmethodr0   rb   r7   rh   r   r   r   r   rR      s    	rR   c                   @   sL   e Zd ZdZdZdZdZdZede	dd fdd	Z
edee	 fd
dZdS )WorkloadTypez
    Enumeration for different workload types.

    Inherits from str to allow string comparison for backward compatibility.
    i2vt2vt2ii2ir4   r   c                 C   rT   )z$Convert string to WorkloadType enum.zInvalid workload type: rU   rV   c                 S   rW   r   rX   rY   r   r   r   r*      r[   z,WorkloadType.from_string.<locals>.<listcomp>Nr\   r`   r   r   r   rb      rc   zWorkloadType.from_stringc                 C   rd   )re   c                 S   rW   r   rX   )r(   workloadr   r   r   r*      r[   z(WorkloadType.choices.<locals>.<listcomp>r   rg   r   r   r   rh      ri   zWorkloadType.choicesN)r.   r+   r-   rj   I2VT2VT2II2Irl   r0   rb   r7   rh   r   r   r   r   rm      s    	rm   c                   @   sH   e Zd ZdZdZdZdZededd fddZ	ede
e fd	d
ZdS )Backenda  
    Enumeration for different model backends.
    - AUTO: Automatically select backend (prefer sglang native, fallback to diffusers)
    - SGLANG: Use sglang's native optimized implementation
    - DIFFUSERS: Use vanilla diffusers pipeline (supports all diffusers models)
    autosglang	diffusersr4   r   c                 C   rT   )zConvert string to Backend enum.zInvalid backend: rU   rV   c                 S   rW   r   rX   rY   r   r   r   r*      r[   z'Backend.from_string.<locals>.<listcomp>Nr\   r`   r   r   r   rb      rc   zBackend.from_stringc                 C   rd   )re   c                 S   rW   r   rX   )r(   backendr   r   r   r*      r[   z#Backend.choices.<locals>.<listcomp>r   rg   r   r   r   rh      ri   zBackend.choicesN)r.   r+   r-   rj   AUTOSGLANG	DIFFUSERSrl   r0   rb   r7   rh   r   r   r   r   rw      s    	rw   c                	   @   s  e Zd ZU eed< ejZeed< dZeed< dZ	e
jdB ed< dZeeeef B dB ed< dZee ed< dZeed	< dZedB ed
< dZeed< dZee ed< dZee ed< dZee ed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZedB ed< e e!ddZ"e!ed< dZ#edB ed< dZ$edB ed< dZ%eed< dZ&e'ed< dZ(edB ed < dZ)e*e dB ed!< dZ+edB ed"< dZ,edB ed#< d$Z-e'ed%< dZ.edB ed&< dZ/edB ed'< dZ0edB ed(< dZ1eed)< d*Z2eed+< dZ3eed,< dZ4eed-< dZ5eed.< dZ6e*e ed/< dZ7edB ed0< e e8ddZ9e8e:B dB ed1< dZ;edB ed2< d3Z<edB ed4< d5Z=edB ed6< dZ>eed7< d8Z?edB ed9< d:Z@eed;< d<ZAedB ed=< dZBedB ed>< e ed?ZCeeef ed@< e dAdB d?ZDeeef edC< dZEe'dB edD< dEZFeedF< eGdGefdHdIZHeGdGefdJdKZIdLdM ZJdNdO ZKdPdQ ZLdRdS ZMdTdU ZNdVdW ZOdXdY ZPdZd[ ZQd\d] ZRd^d_ ZSd`edGeeef fdadbZTdcdd ZUeVdeeWdGeWfdfdgZXdhdi ZYeGdjdk ZZ	mdd6ednedoedGefdpdqZ[e\	ddre]j^dse*e dB dGd fdtduZ_e\dveeef dGd fdwdxZ`eVdyedGeeef fdzd{Zae\dvedGd fd|d}ZbeVdre]j^dse*e dGeeef fd~dZcdd Zddd Zedd Zfdd ZgdddZhdS )
ServerArgs
model_pathr{   Nattention_backendattention_backend_configcache_dit_config	nccl_portFtrust_remote_coderevision   num_gpustp_size	sp_degreeulysses_degreering_degreedp_size	dp_degreeenable_cfg_parallelhsdp_replicate_dimhsdp_shard_dimi  dist_timeout)default_factoryr<   pipeline_configpipeline_class_name	lora_pathdefaultlora_nickname      ?
lora_scalevae_pathlora_target_modulesdit_cpu_offloaddit_layerwise_offload        dit_offload_prefetch_sizetext_encoder_cpu_offloadimage_encoder_cpu_offloadvae_cpu_offloaduse_fsdp_inferenceTpin_cpu_memorycomfyui_modeenable_torch_compilewarmupwarmup_resolutionsdisable_autocastnunchaku_configmaster_port	127.0.0.1hosti0u  portwebuii0  
webui_port  scheduler_portzoutputs/output_pathprompt_file_path)r   model_pathsc                   C   s   ddddddddS )NT)transformervae	video_vae	audio_vae	video_dit	audio_ditdual_tower_bridger   r   r   r   r   <lambda>Y  s   zServerArgs.<lambda>model_loadedboundary_ratioinfo	log_levelr   c                 C   s
   | j d S )Nr   )r   selfr   r   r   broker_portm  s   
zServerArgs.broker_portc                 C   s   | j du p	| jdu S )z
        If no server is running when a generation task begins, 'local_mode' will be enabled: a dedicated server will be launched
        N)r   r   r   r   r   r   is_local_modeq  s   zServerArgs.is_local_modec                 C   sD   |    |   |   |   |   |   |   |   dS )z"set defaults and normalize values.N)_adjust_offload_adjust_quant_config_adjust_warmup_adjust_network_ports_adjust_parallelism_adjust_attention_backend_adjust_platform_specific_adjust_autocastr   r   r   r   _adjust_parametersx  s   zServerArgs._adjust_parametersc                 C   s$   |    |   |   |   dS )z6check consistency and raise errors for invalid configsN)_validate_pipeline_validate_offload_validate_parallelism_validate_cfg_parallelr   r   r   r   _validate_parameters  s   zServerArgs._validate_parametersc                 C   s`   | j }|du st|trdS |  |jr|jsd| _ dS t| j j| j j| j j| j jd| _ dS )zvalidate and adjustN)	precisionrankact_unsignedquantized_model_path)	r   r   r	   validateenable_svdquantr   quantization_precisionquantization_rankquantization_act_unsigned)r   ncfgr   r   r   r     s   
zServerArgs._adjust_quant_configc                 C   s  t  t dk r1td | jd u rd| _| jd u rd| _| jd u r%d| _| jd u r/d| _d S d S | j	j
 r`td | jd u rDd| _| jd u rLd| _| jd u rTd| _| jd u r^d| _d S d S | jd u rhd| _| jd u rpd| _| jd u rxd| _| jd u rd| _d S d S )N   z6Enabling all offloading for GPU with low device memoryTzODisabling some offloading (except dit, text_encoder) for image generation modelF)r   get_device_total_memoryr
   loggerr   r   r   r   r   r   	task_typeis_image_genr   r   r   r   r     sB   















zServerArgs._adjust_offloadc                 C   s   | j dv rd| _ | jd u rt | _nt| jtr#t| | j| _| jdkrC| j d ur6| j dvr6td| j d u rCd| _ t	
d | j d u rT| jtjkrV|   d S d S d S )N)fa3fa4far   )r   	sage_attnzVRing Attention is only supported for flash attention or sage attention backend for nowzRing Attention is currently only supported for flash attention or sage attention; attention_backend has been automatically set to flash attention)r   r   addictDictr   r0   _parse_attention_backend_configr   r^   r   r   r{   rw   r~   _set_default_attention_backendr   r   r   r   r     s*   




z$ServerArgs._adjust_attention_backendc                 C   s(   | j d urd| _| jrtd d S d S )NTzCWarmup enabled, the launch time is expected to be longer than usual)r   r   r   r   r   r   r   r   r     s   
zServerArgs._adjust_warmupc                 C   sl   |  | j| _| j| jdkrtddnd }|  || _| jd ur%| jndtdd }|  |d| _d S )Nr   r   d   i5u  %   )settle_portr   r   randomrandintr   )r   initial_scheduler_portinitial_master_portr   r   r   r     s   
z ServerArgs._adjust_network_portsc                 C   s   | j d u rd| _ | jd u r| j| _| jd u r4| j| j  }| jr#|d9 }| j| dkr1| j| | _nd| _| jd u rQ| jd u rQ| jdkrQ| j| _t	d| j d | jd u rbd| _t
d| j  | jd u rud| _t
d| j  d S d S )Nr      r   z+Automatically set ulysses_degree=sp_degree=z for best performancez,Ulysses degree not set, using default value z)Ring degree not set, using default value )r   r   r   r   r   r   r   r   r   r   debug)r   num_gpus_per_groupr   r   r   r     s6   








zServerArgs._adjust_parallelismc                 C   s|   t  r
d| _d| _tjs6| jjj	 }d|v sd|v r:| jd u r8t 
 r<td| jjj d d| _d S d S d S d S d S )NFwanmovaz/Automatically enable dit_layerwise_offload for z' for low memory and performance balanceT)r   is_mpsr   r   r   SGLANG_CACHE_DIT_ENABLEDr   	__class__r.   r]   /enable_dit_layerwise_offload_for_wan_by_defaultr   r   )r   pipeline_name_lowerr   r   r   r     s"   

z$ServerArgs._adjust_platform_specificc                 C   s   | j d u r| jj | _ d S d S N)r   r   enable_autocastr   r   r   r   r   -  s   
zServerArgs._adjust_autocast
config_strc                 C   sf  |si S t j|rI|dr*t|d}t|W  d   S 1 s$w   Y  n|drIt|d}t|W  d   S 1 sDw   Y  zt	|W S  tj
yY   Y nw zJi }|d}|D ]=}|dd\}}| }| }| dkrd	}n| d
krd}n|ddd rd|v rt|nt|}|||< qd|W S  ty   td| w )z+parse attention backend config from string.z.yamlz.ymlrN.json,=r   trueTfalseFr/   r,   z*Could not parse attention backend config: )ospathexistsendswithopenyaml	safe_loadjsonloadloadsJSONDecodeErrorsplitstripr]   replaceisdigitr2   r1   r   r^   )r   r   rK   configpairspairrM   rN   r   r   r   r   1  sD   
"
 

z*ServerArgs._parse_attention_backend_configc                 C   sh   t | d |   |   zt| dd}tdtj|dd W d S  ty3   td|   Y d S w )Nserver_argsr  r%   zserver_args: %sF)ensure_asciizserver_args: )	r   r   r   r'   r   r   r  dumpsr   )r   	safe_argsr   r   r   __post_init__X  s   
zServerArgs.__post_init__parserc                 C   sj  | j dtdd | j dttjdd | j dtd dd | j d	td d
d | j dtdd tjd | j dttjdd | j dttjdd | j dttj	dd | j dt
tjdd | j dt
d dd | j dt
d dd | j dt
tjdd | j dt
tjdd | j dd tjd!d | j d"d#d$t
tjd%d | j d&t
tjd%d | j d't
d d(d | j d)t
tjd*d | j d+ttjd,d | j d-td.d | j d/ttjd0d | j d1ttjd2d | j d3td4tjd5d6 | j d7td8d9 | j d:ttjd;d | j d<ttjd=d | j d>td?d9 | j d@tdAd9 | j dBtdCd9 | j dDtdEd9 | j dFtdGd9 | j dHtdId9 t|  | j dJt
tjdKd | j dLt
tjdMd | j dNttjdOd | j dPt
tjdQd | j dRttjdSd | j dTt
tj dSd | j dUttj!dVd | j dWttj"dXd | j dYttj#dZd | j d[ttj$d\d t%|  | j d]ttj&d^d | j d_tt'( tj)j*d`da | S )bNz--model-pathzTThe path of the model weights. This can be a local folder or a Hugging Face repo ID.)typehelpz
--vae-pathzzCustom path to VAE model (e.g., for distilled autoencoder). If not specified, VAE will be loaded from the main model path.)r   r   r!  z--attention-backendzThe attention backend to use. For SGLang-native pipelines, use values like fa, torch_sdpa, sage_attn, etc. For diffusers pipelines, use diffusers attention backend names such as flash, _flash_3_hub, sage, or xformers.z--attention-backend-configznConfiguration for the attention backend. Can be a JSON string, a path to a JSON/YAML file, or key=value pairs.z--diffusers-attention-backendr   )r   destr   r!  z--cache-dit-configzNPath to a Cache-DiT YAML/JSON config. Enables cache-dit for diffusers backend.z--trust-remote-codez1Trust remote code when loading HuggingFace models)actionr   r!  z
--revisionzPThe specific model version to use (can be a branch name, tag name, or commit id)z
--num-gpuszThe number of GPUs to use.z	--tp-sizez<The tensor parallelism size. Defaults to 1 if not specified.z--sp-degreezlThe sequence parallelism size. If not specified, will use all remaining GPUs after accounting for TP and DP.z--ulysses-degreez:Ulysses sequence parallel degree. Used in attention layer.z--ring-degreez7Ring sequence parallel degree. Used in attention layer.z--enable-cfg-parallel
store_truezEnable cfg parallel.z--data-parallel-sizez	--dp-sizez--dpzThe data parallelism size.z--hsdp-replicate-dimz--hsdp-shard-dimzCThe data parallelism shards. Defaults to num_gpus if not specified.z--dist-timeoutzTimeout for torch.distributed operations in seconds. Increase this value if you encounter 'Connection closed by peer' errors after the service is idle. z--prompt-file-pathzJPath to a text file containing prompts (one per line) for batch processingz--mask-strategy-file-pathz'Path to mask strategy JSON file for STAz--enable-torch-compilezUse torch.compile to speed up DiT inference.However, will likely cause precision drifts. See (https://github.com/pytorch/pytorch/issues/145213)z--warmupa  Perform some warmup after server starts (if `--warmup-resolutions` is specified) or before processing the first request (if `--warmup-resolutions` is not specified).Recommended to enable when benchmarking to ensure fair comparison and best performance.When enabled with `--warmup-resolutions` unspecified, look for the line ending with `(with warmup excluded)` for actual processing time.z--warmup-resolutions+zWSpecify resolutions for server to warmup. e.g., `--warmup-resolutions 256x256, 720x720`)r   nargsr   r!  z--dit-cpu-offloadzIUse CPU offload for DiT inference. Enable if run out of memory with FSDP.)r#  r!  z--dit-layerwise-offloadzEnable layerwise CPU offload with async H2D prefetch overlap for supported DiT models (e.g., Wan, MOVA). Cannot be used together with cache-dit (SGLANG_CACHE_DIT_ENABLED), dit_cpu_offload, or use_fsdp_inference.z--dit-offload-prefetch-sizeaQ  The size of prefetch for dit-layerwise-offload. If the value is between 0.0 and 1.0, it is treated as a ratio of the total number of layers. If the value is >= 1, it is treated as the absolute number of layers. 0.0 means prefetch 1 layer (lowest memory). Values above 0.5 might have peak memory close to no offload but worse performance.z--use-fsdp-inferencezwUse FSDP for inference by sharding the model weights. Latency is very low due to prefetch--enable if run out of memory.z--text-encoder-cpu-offloadz>Use CPU offload for text encoder. Enable if run out of memory.z--image-encoder-cpu-offloadz?Use CPU offload for image encoder. Enable if run out of memory.z--vae-cpu-offloadz5Use CPU offload for VAE. Enable if run out of memory.z--pin-cpu-memoryzPin memory for CPU offload. Only added as a temp workaround if it throws "CUDA error: invalid argument". Should be enabled in almost all casesz--disable-autocastzIDisable autocast for denoising loop and vae decoding in pipeline samplingz--master-portzSMaster port for distributed inference. If not set, a random free port will be used.z--scheduler-portzPort for the scheduler server.z--hostzHost for the HTTP API server.z--portzPort for the HTTP API server.z--webuiz'Whether to use webui for better displayz--webui-portz--output-pathz.Directory path to save generated images/videosz--lora-pathzYThe path to the LoRA adapter weights (can be local file path or HF hub id) to launch withz--lora-nicknamez0The nickname for the LoRA adapter to launch withz--lora-scalezRLoRA scale for merging (e.g., 0.125 for Hyper-SD). Same as lora_scale in Diffusersz--log-levelz!The logging level of all loggers.z	--backendzThe model backend to use. 'auto' prefers sglang native and falls back to diffusers. 'sglang' uses native optimized implementation. 'diffusers' uses vanilla diffusers pipeline.)r   rh   r   r!  )+add_argumentr0   r   r   argparseSUPPRESSr   r   r   r   r1   r   r   r   r   r   r   r   r   r   r   r   r   r2   r   r   add_cli_argsr   r   r   r   r   r   r   r   r   r   r   r   rw   rh   r{   r4   )r  r   r   r   r*  j  s  			

zServerArgs.add_cli_argsc                 C   s2   t | jrd| j d| j S d| j d| j S )Nzhttp://[z]:zhttp://:)r   r   r   r   r   r   r   url  s   
zServerArgs.urlc                 C   s,   | j }|du s|dkrd}d| d| j S )z
        Internal endpoint for scheduler.
        Prefers the configured host but normalizes localhost -> 127.0.0.1 to avoid ZMQ issues.
        N	localhostr   ztcp://r+  )r   r   )r   scheduler_hostr   r   r   scheduler_endpoint  s   zServerArgs.scheduler_endpoint*   r   port_incmax_attemptsc                 C   s   d}|}||k r7t |r|dkrtd| d| d |S |d7 }|dk r+||7 }ndtdd }||k std	| d
| d)z:
        Find an available port with retry logic.
        r   zPort z was unavailable, using port z insteadr   `  i    z$Failed to find available port after z attempts (started from port ))r   r   r   r   r   RuntimeError)r   r   r1  r2  attemptsoriginal_portr   r   r   r     s&   
zServerArgs.settle_portargsunknown_argsc                 C   sF   |d u rg }|  ||}|d}|r| |}i ||}| |S )Nr  )get_provided_argsgetload_config_file	from_dict)ra   r9  r:  provided_argsconfig_fileconfig_argsr   r   r   from_cli_args  s   


zServerArgs.from_cli_argskwargsc                 C   s   dd t | D }i }|D ]2}|dkr(t|}tdt|  ||d< q|dkr6t|}||d< q||v r@|| ||< q| di |S )z-Create a ServerArgs object from a dictionary.c                 S   rW   r   )r=   )r(   attrr   r   r   r*     r[   z(ServerArgs.from_dict.<locals>.<listcomp>r   zUsing PipelineConfig: r   Nr   )	r9   r;   r   from_kwargsr   r   r   r   r>  )ra   rC  attrsserver_args_kwargsrD  r   r   r   r   r   r>    s   



zServerArgs.from_dictr@  c                 C   s   |  dr!t| d}t|W  d   S 1 sw   Y  dS |  drTzddl}W n ty7   tdw t| d}||W  d   S 1 sMw   Y  dS td|  )zLoad a config file.r  r  Nr   r   zDPlease install PyYAML to use YAML config files. `pip install pyyaml`z Unsupported config file format: )r
  r  r  r  r  ImportErrorr  r^   )r@  rK   r  r   r   r   r=    s    
$
$zServerArgs.load_config_filec                 K   s   d|v rt |d trt|d |d< d|v r(t |d tr(t|d |d< d|v r<t |d tr<t|d |d< t||d< | di |S )Nrf   workload_typer{   r   r   )r   r0   rR   rb   rm   rw   r   rE  )ra   rC  r   r   r   rE    s   zServerArgs.from_kwargsc           	      C   sz   i }t j| }t }|D ]}|dr'|ddd ddd}|| qt| 	 D ]\}}||v r:|||< q.|S )z'Get the arguments provided by the user.z--r  r   r   -_)
sysargvrB   
startswithr  r  lstripaddvarsr@   )	r9  r:  r?  raw_argvprovided_arg_namesargarg_namerM   rN   r   r   r   r;    s   


zServerArgs.get_provided_argsc                 C   s    | j d u r	td| j   d S )Nz(pipeline_config is not set in ServerArgs)r   r^   check_pipeline_configr   r   r   r   r   *  s   
zServerArgs._validate_pipelinec                 C   s   | j dkr"t| j tr"| j  s"tt| j | _ td| j   d| j   kr-dk r4n ntd | j	r_| j dk r@t
d| jrKtd d	| _| jd u rXtd
 d	| _tjrat
dd S d S )Nr   z@Invalid --dit-offload-prefetch-size value passed, truncated to: g      ?r   zIWe do not recommend --dit-offload-prefetch-size to be between 0.5 and 1.0r   z.dit_offload_prefetch_size must be non-negativezMdit_layerwise_offload is enabled, automatically disabling use_fsdp_inference.FzJdit_layerwise_offload is enabled, automatically disabling dit_cpu_offload.a  dit_layerwise_offload cannot be enabled together with cache-dit. cache-dit may reuse skipped blocks whose weights have been released by layerwise offload, causing shape mismatch errors. Please disable either --dit-layerwise-offload or SGLANG_CACHE_DIT_ENABLED.)r   r   r2   
is_integerr1   mathfloorr   r   r   r^   r   warningr   r   r   r   r   r   r   r   0  sD   





zServerArgs._validate_offloadc              
   C   s  | j | jks| j| j  dkrtd| j d| j  d| j| jks)| j| j dkr6td| j d| j d| j| jksD| j| j dkrQtd| j d| j d| j| j dkrftd| j d| j d| jdk rotd	| jdkrxtd
| j| j }| jr|d9 }| j| dkrtd| j d| jrdnd d| | j | j| j	 krtd| j  d| j d| j	 d| j| j	  d	t
dd dkr| j dk}| jdk}|r|rtd d S d S d S d S )Nr   z
num_gpus (z)) must be >= and divisible by sp_degree (r5  z2) must be >= and divisible by hsdp_replicate_dim (z.) must be >= and divisible by hsdp_shard_dim (z ) must be divisible by dp_size (r   z"--dp-size must be a natural numberzDP is not yet supportedr   z)) must be divisible by (dp_size * tp_sizez * 2r,   z) = zsp_degree (z+) must equal ring_degree * ulysses_degree (z * z = r   r  zucache-dit is enabled with hybrid parallelism (SP + TP). Proceeding anyway (SGLang integration may support this mode).)r   r   r^   r   r   r   r   r   r   r   r  getenvr]   r   rZ  )r   r   has_sphas_tpr   r   r   r   [  s`   

 



z ServerArgs._validate_parallelismc                 C   s    | j r| jdkrtdd S d S )Nr   zICFG Parallelism is enabled via `--enable-cfg-parallel`, but num_gpus == 1)r   r   r^   r   r   r   r   r     s
   z!ServerArgs._validate_cfg_parallelc                 C   s.   t  rtjj }|| _td| dS dS )zGConfigure ROCm defaults when users do not specify an attention backend.z\Attention backend not specified. Using '%s' by default on ROCm to match SGLang SRT defaults.N)	r   is_rocmr   AITERr=   r]   r   r   r   )r   default_backendr   r   r   r     s   z)ServerArgs._set_default_attention_backend)r0  r   r   )r   N)ir.   r+   r-   r0   __annotations__rw   r|   r{   r   r   r   r   r   r?   r   r   r   r1   r   r3   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r   r   r7   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   r   r   r   r   r   r   r   r   r   r   propertyr   r   r   r   r   r   r   r   r   r   r   r   r   r  staticmethodr   r*  r,  r/  r   rl   r(  	NamespacerB  r>  r=  rE  r;  r   r   r   r   r   r   r   r   r   r      s   
 "	&'  /



+9r   c                   @   s`   e Zd ZU eed< eed< eed< eed< dZedB ed< e	ddede	e d	d fd
dZ
dS )PortArgsscheduler_input_ipc_namer   rpc_ipc_namemetrics_ipc_nameNr   r  dp_rankr   c                 C   s   | j d u r#| jtdd }	 t|rn|dk r|d7 }n|d8 }qn| j }tdtjdd	j |dtjdd	j dtjdd	j | j	d
S )Nr   r4  Tr3  r0  +   zipc://F)delete)rf  r   rg  rh  r   )
r   r   r   r   r   re  tempfileNamedTemporaryFiler=   r   )r  ri  r   r   r   r   from_server_args  s$   

zPortArgs.from_server_argsr   )r.   r+   r-   r0   ra  r1   r   rc  r   r   rn  r   r   r   r   re    s   
 re  rM  c                 C   s(   t  }t| || }t|}|S )zJ
    Prepare the inference arguments from the command line arguments.
    )r   r   r*  
parse_argsrB  )rM  r  raw_argsr  r   r   r   prepare_server_args  s
   


rq  r  c                 C   s   | a dS )z>
    Set the global sgl_diffusion config for each process
    N)_global_server_argsr  r   r   r   set_global_server_args  s   rs  c                   C   s   t d u rtdt S )Nz%Global sgl_diffusion args is not set.)rr  r^   r   r   r   r   get_global_server_args  s   rt  r   ):rj   r(  r9   rC   r  rX  r  r   rL  rl  r   enumr   typingr   r   r   r  sglang.multimodal_genr   3sglang.multimodal_gen.configs.pipeline_configs.baser   *sglang.multimodal_gen.configs.quantizationr   Isglang.multimodal_gen.runtime.layers.quantization.configs.nunchaku_configr	   *sglang.multimodal_gen.runtime.loader.utilsr
   'sglang.multimodal_gen.runtime.platformsr   r   *sglang.multimodal_gen.runtime.utils.commonr   r   1sglang.multimodal_gen.runtime.utils.logging_utilsr   r   sglang.multimodal_gen.utilsr   r   r.   r   rA   r3   r   r0   r'   rR   rm   rw   	dataclassr   re  rr  r7   rq  rs  rt  r   r   r   r   <module>   sZ   
b       @*