o
    ٷiJC                     @   sf  d Z ddlZddlZddlZddlZddlmZ ddlZddl	Z	ddl
Z
ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ee*Z+dZ,dZ-G dd deZ.dej/de0e1e2ef  fddZ3dej/ddfddZ4de0e fddZ5dS )z
Omni serve command for vLLM-Omni.

Supports both multi-stage LLM models (e.g., Qwen2.5-Omni) and
diffusion models (e.g., Qwen-Image) through the same CLI interface.
    N)Any)CLISubcommand)make_arg_parservalidate_parsed_serve_args)VLLM_SUBCMD_PARSER_EPILOG)init_logger)FlexibleArgumentParser)make_zmq_socket)get_engine_client_zmq_addr)get_connectors_config_for_stageload_omni_transfer_config) resolve_omni_kv_config_for_stage)log_logo)OmniBaseomni_snapshot_download)	OmniStage)omni_run_server)inject_omni_kv_config   ax  Launch a local OpenAI-compatible API server to serve Omni models
via HTTP. Supports both multi-stage LLM models and diffusion models.

The server automatically detects the model type:
- LLM models: Served via /v1/chat/completions endpoint
- Diffusion models: Served via /v1/images/generations endpoint

Examples:
  # Start an Omni LLM server
  vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091

  # Start a diffusion model server
  vllm serve Qwen/Qwen-Image --omni --port 8091

Search by using: `--help=<ConfigGroup>` to explore options by section (e.g.,
--help=OmniConfig)
  Use `--help=all` to show all available flags at once.
c                   @   sT   e Zd ZdZdZedejddfddZdejddfdd	Z	d
ej
defddZdS )OmniServeCommandz(The `serve` subcommand for the vLLM CLI.serveargsreturnNc                 C   s\   t jdsdt jd< t  t| dr| jd ur| j| _| jr%t|  d S t	
t|  d S )NVLLM_DISABLE_LOG_LOGO1	model_tag)osenvirongetr   hasattrr   modelheadlessrun_headlessuvlooprunr   )r    r%   S/home/ubuntu/.local/lib/python3.10/site-packages/vllm_omni/entrypoints/cli/serve.pycmdC   s   
zOmniServeCommand.cmdc                 C   sr   |j d ur|jd u s|jd u rtdddlm} t|dd p$t|dd }|r3||r3td| d S t	| d S )NzO--stage-id requires both --omni-master-address and --omni-master-port to be setr   )is_diffusion_modelr   r    zDetected diffusion model: %s)
stage_idomni_master_addressomni_master_port
ValueError"vllm_omni.diffusion.utils.hf_utilsr(   getattrloggerinfor   )selfr   r(   r    r%   r%   r&   validateR   s   zOmniServeCommand.validate
subparsersc                 C   s  |j | jtdd}t|}tj| jd|_|jddd}|jddd	d
 |jdt	d dd |jdt
d dd |jdt
ddd |jdt
ddd |jdt
ddd |jdddd
 |jdt	d dd |jdt
ddd |jd t	d!d!d"gd#d$ |jd%t	d d&d |jd'd(t	d)d* |jd+d,t
d-d* |jd.t
d d/d |jd0d1d2t
d d3d4 |jd5d6t
d d7d4 |jd8tjd d9d |jd:d;dd<d= |jd>t
d?d@d |jdAt
dBdCd |jdDt	dEdFd |jdGt	d dHd |jdIddJd
 |jdKddLd
 |jdMddNd
 |jdOddPd
 |jdQddRd
 |jdStd dTd |jdUtd dVd |jdWt
dBdBdXgdYd$ |jdZt	d[d* |jd\t
d]d* |jd^t
d d_d |S )`Nz'vllm serve [model_tag] --omni [options])descriptionusage)subcmd
OmniConfigz=Configuration for vLLM-Omni multi-stage and diffusion models.)titler4   z--omni
store_truez:Enable vLLM-Omni mode for multi-modal and diffusion models)actionhelpz--stage-configs-pathzbPath to the stage configs file. If not specified, the stage configs will be loaded from the model.)typedefaultr;   z
--stage-idz-Select and launch a single stage by stage_id.z--stage-init-timeouti,  zEThe timeout for initializing a single stage in seconds (default: 300)z--init-timeoutiX  z(The timeout for initializing the stages.z--shm-threshold-bytesi   z)The threshold for the shared memory size.z--log-statszEnable logging the stats.z
--log-filezThe path to the log file.z--batch-timeout
   zThe timeout for the batch.z--worker-backendmulti_processrayz%The backend to use for stage workers.)r<   r=   choicesr;   z--ray-addressz-The address of the Ray cluster to connect to.z--omni-master-addressz-omaz9Hostname or IP address of the Omni orchestrator (master).)r<   r;   z--omni-master-portz-ompz'Port of the Omni orchestrator (master).z
--num-gpusz4Number of GPUs to use for diffusion model inference.z--uspz--ulysses-degreeulysses_degreezwUlysses Sequence Parallelism degree for diffusion models. Equivalent to setting DiffusionParallelConfig.ulysses_degree.)destr<   r=   r;   z--ringring_degreezqRing Sequence Parallelism degree for diffusion models. Equivalent to setting DiffusionParallelConfig.ring_degree.z--quantization-configzoJSON string for diffusion quantization_config. Example: '{"method":"gguf","gguf_model":"/path/to/model.gguf"}'.z
--use-hsdpuse_hsdpzEnable HSDP (Hybrid Sharded Data Parallel) for diffusion models. Shards model weights across GPUs to reduce per-GPU memory usage.)rC   r:   r;   z--hsdp-shard-sizezPNumber of GPUs to shard weights across. -1 = auto (world_size / replicate_size).z--hsdp-replicate-size   zHNumber of replica groups for HSDP. Each group holds a full sharded copy.z--cache-backendnonezECache backend for diffusion models, options: 'tea_cache', 'cache_dit'z--cache-configzDJSON string of cache configuration (e.g., '{"rel_l1_thresh": 0.2}').z--enable-cache-dit-summaryz@Enable cache-dit summary logging after diffusion forward passes.z--vae-use-slicingzNEnable VAE slicing for memory optimization (useful for mitigating OOM issues).z--vae-use-tilingzMEnable VAE tiling for memory optimization (useful for mitigating OOM issues).z--enable-cpu-offloadz+Enable CPU offloading for diffusion models.z--enable-layerwise-offloadz7Enable layerwise (blockwise) offloading on DiT modules.z--boundary-ratiozOBoundary split ratio for low/high DiT in video models (e.g., 0.875 for Wan2.2).z--flow-shiftzJScheduler flow_shift for video models (e.g., 5.0 for 720p, 12.0 for 480p).z--cfg-parallel-size   zNumber of devices for CFG parallel computation for diffusion models. Equivalent to setting DiffusionParallelConfig.cfg_parallel_size.z--default-sampling-paramszJson str for Default sampling parameters, 
Structure: {"<stage_id>": {<sampling_param>: value, ...}, ...}
e.g., '{"0": {"num_inference_steps":50, "guidance_scale":1}}'. Currently only supports diffusion models.z--max-generated-image-sizez0The max size of generate image (height * width).z--tts-max-instructions-lengthzWMaximum length for TTS voice style instructions (overrides stage config, default: 500).)
add_parsernameDESCRIPTIONr   r   formatepilogadd_argument_groupadd_argumentstrintjsonloadsfloat)r1   r3   serve_parseromni_config_groupr%   r%   r&   subparser_init_   s  	
	zOmniServeCommand.subparser_init)__name__
__module____qualname____doc__rK   staticmethodargparse	Namespacer'   r2   _SubParsersActionr   rX   r%   r%   r%   r&   r   >   s    r   r   r   c                 C   s   t t }|t| S N)r   __new__#_create_default_diffusion_stage_cfgvars)r   	omni_baser%   r%   r&   rc   O  s   
rc   c                    s  | j d ur| j dkrtd| jdkrtdt| j}tt}t|  }||d< |	||\}}| j
}|d u rLt|dkrDtdt|d dd}d }|D ]}t|dd |kr^|} nqP|d u rktd	| d
t|| jd}	t|	|}
| j}| j}t }td||d}t||tjddd}d|d}|tj| |jtd dstdt d| d| z
tj| }W n4 tj y } ztd| d| d| |d }~w t!y } ztd| d| d| |d }~ww |d s	|d }td| d| |d |d }}t"#d| d | d!|  W d    n	1 s+w   Y  W d    n	1 s;w   Y  d  fd"d#}t$$t$j%| t$$t$j&| t'|| j(d$}|)|| zt*|	|\}}}|rvt+|||| W n t!y } zt",d%|| W Y d }~nd }~ww t-j./d&}d'| d(|d u rd)n| t-j.d&< z+|j0|d*t1| jt1| j2|
dd*d+ |j3d ur|j34  W |5  d S W |5  d S |5  w ),NrG   z.api_server_count can't be set in headless moder?   z3headless mode requires worker_backend=multi_processr    z?--stage-id is required in headless mode for multi-stage configsr   r)   zNo stage matches stage_id=.)default_shm_thresholdF)
local_onlyhostporti  )bindlinger	handshake)r<   r)   i`  )timeoutzHandshake timeout (z minutes) for stage-z at z"Handshake decode failed for stage-z: z.Unexpected error decoding handshake for stage-okerrorzHandshake failed for stage-in_endpointout_endpointz[Headless] Stage-z& received endpoints via handshake: in=z, out=c                    s    rd S d t )NT)
SystemExit)signumframeshutdown_requestedr%   r&   signal_handler  s   z$run_headless.<locals>.signal_handler)stage_init_timeoutzC[Headless] Failed to inject omni connector config into stage-%s: %sVLLM_LOGGING_PREFIXz[Stage-z]  T)is_asyncshm_threshold_bytesbatch_timeoutconnectors_configworker_backendignore_runtime_config)6api_server_countr,   r   r   r    r   rb   rd   copy_resolve_stage_configsr)   lenr.   r   r}   r   r*   r+   zmqContextr
   r	   REQsendmsgspecmsgpackencodepollHANDSHAKE_TIMEOUT_MINSRuntimeErrordecoderecvDecodeError	Exceptionr/   r0   signalSIGTERMSIGINTr   ry   attach_queuesr   r   debugr   r   r   init_stage_workerrR   r~   _procjoinstop_stage_worker)r   r    re   	args_dictconfig_pathstage_configssingle_stage_idstage_configcfgtransfer_configr   r*   r+   zmq_ctxhandshake_endpointhandshake_sockethandshake_msgresponseexc	error_msgrq   rr   rx   stageomni_conn_cfg	omni_fromomni_toeold_envr%   rv   r&   r"   T  s   






'$	r"   c                   C   s   t  gS ra   )r   r%   r%   r%   r&   cmd_init  s   r   )6r\   r^   rS   r   r   typingr   msgspec.msgpackr   r#   r   vllm.entrypoints.cli.typesr    vllm.entrypoints.openai.cli_argsr   r   vllm.entrypoints.utilsr   vllm.loggerr   vllm.utils.argparse_utilsr   vllm.utils.network_utilsr	   vllm.v1.utilsr
   %vllm_omni.distributed.omni_connectorsr   r   :vllm_omni.distributed.omni_connectors.utils.initializationr   vllm_omni.entrypoints.cli.logor   vllm_omni.entrypoints.omnir   r    vllm_omni.entrypoints.omni_stager   'vllm_omni.entrypoints.openai.api_serverr   vllm_omni.entrypoints.utilsr   rY   r/   r   rL   r   r_   listdictrQ   rc   r"   r   r%   r%   r%   r&   <module>   s@       s