++ date
+ echo 'STARTING at Mon Mar 16 01:23:13 PM UTC 2026'
STARTING at Mon Mar 16 01:23:13 PM UTC 2026
+ nvidia-smi --query-compute-apps=pid --format=csv,noheader
+ xargs -r kill -9
+ pkill -9 -f vllm
/home/ubuntu/run_all.sh: line 9:+ sleep 3
+ echo '=== GPU AFTER CLEANUP ==='
=== GPU AFTER CLEANUP ===
+ nvidia-smi
Mon Mar 16 13:23:16 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 1Mon Mar 16 13:23:16 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | G|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:01:00.0 Off |                    0 |
| N/A   30C    P0             51W /  400W |       1MiB /  81920MiB |      0%      Default |
|                     |   0  NVIDIA A100-SXM4-80GB          On  |   00000000:01:00.0 Off |                    0 |
| N/A   30C    P0             51W /  400W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=====================================================================+ echo '=== LAUNCHING SERVER ==='
=== LAUNCHING SERVER ===
+ rm -f /home/ubuntu/vllm_server_log.txt
+ export VLLM_ATTENTION_BACKEND=TORCH_SDPA
+ VLLM_ATTENTION_BACKEND=TORCH_SDPA
+ SERVER_PID=2103038
+ echo+ echo '=== LAUNCHING SERVER ==='
=== LAUNCHING SERVER ===
+ rm -f /home/ubuntu/vllm_server_log.txt
+ export VLLM_ATTENTION_BACKEND=TORCH_SDPA
+ VLLM_ATTENTION_BACKEND=TORCH_SDPA
+ SERVER_PID=2103043
+ ech+ kill -0 2103038
+ echo 'Alive at 1x5s = 5s'
Alive at 1x5s = 5s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 2x5s = 10s'
Alive at 2x5s = 10s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 3x5s = 15s'
Alive at 3x5s = 15s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 4x5s = 20s'
Alive at 4x5s = 20s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 5x5s = 25s'
Alive at 5x5s = 25s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 6x5s = 30s'
Alive at 6x5s = 30s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 7x5s = 35s'
Alive at 7x5s = 35s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 8x5s = 40s'
Alive at 8x5s = 40s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 9x5s = 45s'
Alive at 9x5s = 45s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 10x5s = 50s'
Alive at 10x5s = 50s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 11x5s = 55s'
Alive at 11x5s = 55s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 12x5s = 60s'
Alive at 12x5s = 60s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 13x5s = 65s'
Alive at 13x5s = 65s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 14x5s = 70s'
Alive at 14x5s = 70s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 15x5s = 75s'
Alive at 15x5s = 75s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 16x5s = 80s'
Alive at 16x5s = 80s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 17x5s = 85s'
Alive at 17x5s = 85s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 18x5s = 90s'
Alive at 18x5s = 90s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 19x5s = 95s'
Alive at 19x5s = 95s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 20x5s = 100s'
Alive at 20x5s = 100s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 21x5s = 105s'
Alive at 21x5s = 105s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 22x5s = 110s'
Alive at 22x5s = 110s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 23x5s = 115s'
Alive at 23x5s = 115s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 24x5s = 120s'
Alive at 24x5s = 120s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 25x5s = 125s'
Alive at 25x5s = 125s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 26x5s = 130s'
Alive at 26x5s = 130s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 27x5s = 135s'
Alive at 27x5s = 135s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 28x5s = 140s'
Alive at 28x5s = 140s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 29x5s = 145s'
Alive at 29x5s = 145s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 30x5s = 150s'
Alive at 30x5s = 150s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 31x5s = 155s'
Alive at 31x5s = 155s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 32x5s = 160s'
Alive at 32x5s = 160s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 33x5s = 165s'
Alive at 33x5s = 165s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 34x5s = 170s'
Alive at 34x5s = 170s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 35x5s = 175s'
Alive at 35x5s = 175s
+ for i in $(seq 1 36)
+ sleep 5
+ kill -0 2103038
+ echo 'Alive at 36x5s = 180s'
Alive at 36x5s = 180s
+ echo '=== SERVER LOG ==='
=== SERVER LOG ===
+ cat /home/ubuntu/vllm_server_log.txt
Using stage config: /home/ubuntu/vllm_env/lib/pyt+ kill -0 2103043
+ echo 'Alive at 36x5s = 180s'
Alive at 36x5s = 180s
+ echo '=== SERVER LOG ==='
=== SERVER LOG ===
+ cat /home/ubuntu/vllm_server_log.txt
Using stage config: /home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
Running: /home/ubuntu/vllm_env/bin/vllm-omni serve Qwen/Qwen3-TTS-12Hz-1.7B-Base --omni --stage-configs-path /home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml --host 0.0.0.0 --port 8091 --trust-remote-code --enforce-eager
WARNING 03-16 13:23:24 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation
INFO 03-16 13:23:25 [logo.py:45]        █     █     █▄   ▄█       ▄▀▀▀▀▄ █▄   ▄█ █▄    █ ▀█▀ 
INFO 03-16 13:23:25 [logo.py:45]  ▄▄ ▄█ █     █     █ ▀▄▀ █  ▄▄▄  █    █ █ ▀▄▀ █ █ ▀▄  █  █  
INFO 03-16 13:23:25 [logo.py:45]   █▄█▀ █     █     █     █       █    █ █     █ █   ▀▄█  █  
INFO 03-16 13:23:25 [logo.py:45]    ▀▀  ▀▀▀▀▀ ▀▀▀▀▀ ▀     ▀        ▀▀▀▀  ▀     ▀ ▀     ▀ ▀▀▀ 
INFO 03-16 13:23:25 [logo.py:45] 
(APIServer pid=2103043) INFO 03-16 13:23:25 [utils.py:287] vLLM server version 0.16.0, serving model Qwen/Qwen3-TTS-12Hz-1.7B-Base
(APIServer pid=2103043) INFO 03-16 13:23:25 [utils.py:223] non-default args: {'model_tag': 'Qwen/Qwen3-TTS-12Hz-1.7B-Base', 'host': '0.0.0.0', 'port': 8091, 'model': 'Qwen/Qwen3-TTS-12Hz-1.7B-Base', 'trust_remote_code': True, 'enforce_eager': True}
(APIServer pid=2103043) INFO 03-16 13:23:25 [weight_utils.py:50] Using model weights format ['*']
(APIServer pid=2103043) INFO 03-16 13:23:25 [omni.py:181] Initializing stages for model: Qwen/Qwen3-TTS-12Hz-1.7B-Base
(APIServer pid=2103043) INFO 03-16 13:23:25 [omni.py:313] No omni_master_address provided, defaulting to localhost (127.0.0.1)
(APIServer pid=2103043) WARNING 03-16 13:23:25 [utils.py:111] Filtered out 1 callable object(s) from base_engine_args that are not compatible with OmegaConf: ['dispatch_function']. 
(APIServer pid=2103043) INFO 03-16 13:23:26 [initialization.py:270] Loaded OmniTransferConfig with 1 connector configurations
(APIServer pid=2103043) INFO 03-16 13:23:26 [factory.py:46] Created connector: SharedMemoryConnector
(APIServer pid=2103043) INFO 03-16 13:23:26 [initialization.py:60] Created connector for 0 -> 1: SharedMemoryConnector
(APIServer pid=2103043) INFO 03-16 13:23:26 [omni.py:347] [AsyncOrchestrator] Loaded 2 stages
[Stage-0] WARNING 03-16 13:23:34 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation
[Stage-1] WARNING 03-16 13:23:34 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation
[Stage-0] INFO 03-16 13:23:34 [omni_stage.py:1132] [Stage-0] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize)
[Stage-0] INFO 03-16 13:23:34 [initialization.py:324] [Stage-0] Initializing OmniConnectors with config keys: ['to_stage_1']
[Stage-0] INFO 03-16 13:23:34 [omni_stage.py:79] NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[2026-03-16 13:23:34] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[2026-03-16 13:23:35] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
(APIServer pid=2103043) INFO 03-16 13:23:35 [omni.py:458] [AsyncOrchestrator] Waiting for 2 stages to initialize (timeout: 600s)
[Stage-1] INFO 03-16 13:23:35 [omni_stage.py:1132] [Stage-1] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize)
[Stage-1] INFO 03-16 13:23:35 [initialization.py:324] [Stage-1] Initializing OmniConnectors with config keys: ['from_stage_0']
[Stage-1] INFO 03-16 13:23:35 [factory.py:46] Created connector: SharedMemoryConnector
[Stage-1] INFO 03-16 13:23:35 [initialization.py:60] Created connector for 0 -> 1: SharedMemoryConnector
[Stage-1] INFO 03-16 13:23:35 [omni_stage.py:79] NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks
[Stage-0] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values
[Stage-0] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values
[Stage-0] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[2026-03-16 13:23:35] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[Stage-0] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[2026-03-16 13:23:35] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[Stage-1] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values
[Stage-1] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values
[Stage-1] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
[Stage-1] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
[Stage-1] INFO 03-16 13:23:45 [model.py:529] Resolved architecture: Qwen3TTSCode2Wav
[Stage-1] INFO 03-16 13:23:45 [model.py:1549] Using max model len 32768
[Stage-1] INFO 03-16 13:23:45 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=8192.
[Stage-1] INFO 03-16 13:23:45 [vllm.py:689] Asynchronous scheduling is disabled.
[Stage-1] WARNING 03-16 13:23:45 [vllm.py:727] Enforce eager set, overriding optimization level to -O0
[Stage-1] INFO 03-16 13:23:45 [vllm.py:845] Cudagraph is disabled under eager mode
[Stage-0] INFO 03-16 13:23:45 [model.py:529] Resolved architecture: Qwen3TTSTalkerForConditionalGeneration
[Stage-0] INFO 03-16 13:23:46 [model.py:1549] Using max model len 4096
[Stage-0] INFO 03-16 13:23:46 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=512.
[Stage-0] INFO 03-16 13:23:46 [vllm.py:689] Asynchronous scheduling is disabled.
[Stage-1] WARNING 03-16 13:23:55 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation
[Stage-0] WARNING 03-16 13:23:55 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation
(EngineCore_DP0 pid=2104145) [Stage-1] INFO 03-16 13:23:56 [core.py:97] Initializing a V1 LLM engine (v0.16.0) with config: model='Qwen/Qwen3-TTS-12Hz-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-TTS-12Hz-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-TTS-12Hz-1.7B-Base, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.NONE: 0>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.NONE: 0>, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore_DP0 pid=2104145) [Stage-1] WARNING 03-16 13:23:56 [multiproc_executor.py:921] Reducing Torch parallelism from 16 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
(EngineCore_DP0 pid=2104154) [Stage-0] INFO 03-16 13:23:56 [core.py:97] Initializing a V1 LLM engine (v0.16.0) with config: model='Qwen/Qwen3-TTS-12Hz-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-TTS-12Hz-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-TTS-12Hz-1.7B-Base, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [512], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 2, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore_DP0 pid=2104154) [Stage-0] WARNING 03-16 13:23:56 [multiproc_executor.py:921] Reducing Torch parallelism from 16 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
[Stage-1] WARNING 03-16 13:24:03 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation
[Stage-0] WARNING 03-16 13:24:03 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation
[Stage-1] INFO 03-16 13:24:04 [parallel_state.py:1234] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44833 backend=nccl
[Stage-1] INFO 03-16 13:24:04 [parallel_state.py:1445] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A
/bin/sh: 1: sox: not found
[2026-03-16 13:24:04] WARNING __init__.py:10: SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    
(Worker pid=2104567) [Stage-1] INFO 03-16 13:24:05 [gpu_model_runner.py:4124] Starting to load model Qwen/Qwen3-TTS-12Hz-1.7B-Base...
[Stage-0] INFO 03-16 13:24:05 [parallel_state.py:1234] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:34577 backend=nccl
[Stage-0] INFO 03-16 13:24:05 [parallel_state.py:1445] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A
(Worker pid=2104567) [Stage-1] INFO 03-16 13:24:05 [default_loader.py:293] Loading weights took 8321162.85 seconds
/bin/sh: 1: sox: not found
[2026-03-16 13:24:05] WARNING __init__.py:10: SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:05 [gpu_model_runner.py:4124] Starting to load model Qwen/Qwen3-TTS-12Hz-1.7B-Base...
(Worker pid=2104567) [Stage-1] INFO 03-16 13:24:06 [gpu_model_runner.py:4221] Model loading took 0.0 GiB memory and 0.001688 seconds
(Worker pid=2104567) [Stage-1] INFO 03-16 13:24:06 [kernel_warmup.py:44] Skipping FlashInfer autotune because it is disabled.
(Worker pid=2104567) `torch_dtype` is deprecated! Use `dtype` instead!
(Worker pid=2104567) [2026-03-16 13:24:06] WARNING logging.py:328: `torch_dtype` is deprecated! Use `dtype` instead!
(Worker pid=2104567) [Stage-1] INFO 03-16 13:24:06 [configuration_qwen3_tts_tokenizer_v2.py:156] encoder_config is None. Initializing encoder with default values
(Worker pid=2104567) [Stage-1] INFO 03-16 13:24:06 [configuration_qwen3_tts_tokenizer_v2.py:159] decoder_config is None. Initializing decoder with default values
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:06 [cuda.py:367] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
(Worker pid=2104589) <frozen importlib._bootstrap_external>:1184: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
(Worker pid=2104589) <frozen importlib._bootstrap_external>:1184: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead.
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:06 [vllm.py:689] Asynchronous scheduling is disabled.
(Worker pid=2104567) [Stage-1] WARNING 03-16 13:24:06 [gpu_generation_model_runner.py:451] Dummy sampler run is not implemented for generation model
(EngineCore_DP0 pid=2104145) [Stage-1] INFO 03-16 13:24:06 [core.py:278] init engine (profile, create kv cache, warmup model) took 0.70 seconds
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:07 [weight_utils.py:579] No model.safetensors.index.json found in remote.
(Worker pid=2104589) Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
(Worker pid=2104589) Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.58it/s]
(Worker pid=2104589) Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.57it/s]
(Worker pid=2104589) 
(EngineCore_DP0 pid=2104145) [Stage-1] WARNING 03-16 13:24:07 [scheduler.py:166] Using custom scheduler class vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler. This scheduler interface is not public and compatibility may not be maintained.
(EngineCore_DP0 pid=2104145) [Stage-1] WARNING 03-16 13:24:07 [core.py:130] Disabling chunked prefill for model without KVCache
(EngineCore_DP0 pid=2104145) [Stage-1] INFO 03-16 13:24:07 [factory.py:46] Created connector: SharedMemoryConnector
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:07 [qwen3_tts_talker.py:1534] Loaded 381 weights for Qwen3TTSTalkerForConditionalGeneration
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:07 [default_loader.py:293] Loading weights took 0.80 seconds
(EngineCore_DP0 pid=2104145) [Stage-1] INFO 03-16 13:24:08 [vllm.py:689] Asynchronous scheduling is disabled.
(EngineCore_DP0 pid=2104145) [Stage-1] WARNING 03-16 13:24:08 [vllm.py:734] Inductor compilation was disabled by user settings, optimizations settings that are only active during inductor compilation will be ignored.
(EngineCore_DP0 pid=2104145) [Stage-1] INFO 03-16 13:24:08 [vllm.py:845] Cudagraph is disabled under eager mode
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:08 [gpu_model_runner.py:4221] Model loading took 3.62 GiB memory and 1.854550 seconds
(APIServer pid=2103043) INFO 03-16 13:24:09 [omni.py:448] [AsyncOrchestrator] Stage-1 reported ready
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:15 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:15 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:15 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:15 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:15 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:15 [backends.py:916] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/bbfc3f167b/rank_0_0/backbone for vLLM's torch.compile
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:15 [backends.py:976] Dynamo bytecode transform time: 6.39 s
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:19 [backends.py:351] Cache the graph of compile range (1, 512) for later use
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:21 [backends.py:368] Compiling a graph for compile range (1, 512) takes 1.99 s
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:21 [monitor.py:34] torch.compile takes 8.37 s in total
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:22 [base.py:81] Available KV cache memory: 19.62 GiB (process-scoped)
(EngineCore_DP0 pid=2104154) [Stage-0] INFO 03-16 13:24:22 [kv_cache_utils.py:1307] GPU KV cache size: 183,648 tokens
(EngineCore_DP0 pid=2104154) [Stage-0] INFO 03-16 13:24:22 [kv_cache_utils.py:1312] Maximum concurrency for 4,096 tokens per request: 44.84x
(Worker pid=2104589) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/2 [00:00<?, ?it/s]Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 2/2 [00:00<00:00, 18.51it/s]Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 2/2 [00:00<00:00, 18.47it/s]
(Worker pid=2104589) Capturing CUDA graphs (decode, FULL):   0%|          | 0/1 [00:00<?, ?it/s]Capturing CUDA graphs (decode, FULL): 100%|██████████| 1/1 [00:00<00:00, 19.60it/s]
(Worker pid=2104589) [Stage-0] INFO 03-16 13:24:23 [gpu_model_runner.py:5246] Graph capturing finished in 1 secs, took 0.04 GiB
(EngineCore_DP0 pid=2104154) [Stage-0] INFO 03-16 13:24:23 [core.py:278] init engine (profile, create kv cache, warmup model) took 15.18 seconds
(EngineCore_DP0 pid=2104154) [Stage-0] WARNING 03-16 13:24:24 [scheduler.py:166] Using custom scheduler class vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler. This scheduler interface is not public and compatibility may not be maintained.
(EngineCore_DP0 pid=2104154) [Stage-0] INFO 03-16 13:24:24 [factory.py:46] Created connector: SharedMemoryConnector
(EngineCore_DP0 pid=2104154) [Stage-0] INFO 03-16 13:24:25 [vllm.py:689] Asynchronous scheduling is disabled.
(APIServer pid=2103043) INFO 03-16 13:24:26 [omni.py:448] [AsyncOrchestrator] Stage-0 reported ready
(APIServer pid=2103043) INFO 03-16 13:24:26 [omni.py:477] [AsyncOrchestrator] All stages initialized successfully
(APIServer pid=2103043) INFO 03-16 13:24:27 [async_omni.py:232] [AsyncOrchestrator] Initialized input_processor, io_processor, and model_config from stage-0
(APIServer pid=2103043) WARNING 03-16 13:24:27 [api_server.py:469] vllm_config is None, some features may not work correctly
(APIServer pid=2103043) INFO 03-16 13:24:27 [api_server.py:477] Supported tasks: {'generate'}
(APIServer pid=2103043) WARNING 03-16 13:24:27 [api_server.py:548] Cannot initialize processors: vllm_config is None. OpenAIServingModels may fail.
(APIServer pid=2103043) WARNING 03-16 13:24:27 [model.py:1350] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'repetition_penalty': 1.05, 'temperature': 0.9, 'max_tokens': 8192}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
(APIServer pid=2103043) INFO 03-16 13:24:27 [serving.py:188] Warming up chat template processing...
(APIServer pid=2103043) INFO 03-16 13:24:28 [hf.py:318] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217] Chat template warmup failed
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217] Traceback (most recent call last):
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/entrypoints/openai/chat_completion/serving.py", line 204, in warmup
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]     await self._preprocess_chat(
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/entrypoints/openai/serving_chat.py", line 457, in _preprocess_chat
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]     (conversation,), (engine_prompt,) = await renderer.render_chat_async(
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/renderers/protocol.py", line 377, in render_chat_async
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]     for conv, prompt in await asyncio.gather(*rendered):
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/renderers/hf.py", line 706, in render_messages_async
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]     prompt_raw = safe_apply_chat_template(
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/renderers/hf.py", line 459, in safe_apply_chat_template
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217]     raise ChatTemplateResolutionError(
(APIServer pid=2103043) ERROR 03-16 13:24:28 [serving.py:217] vllm.entrypoints.chat_utils.ChatTemplateResolutionError: As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.
(APIServer pid=2103043) WARNING 03-16 13:24:28 [serving_speech.py:154] No speakers found in talker_config (checked spk_id and speaker_id)
(APIServer pid=2103043) INFO 03-16 13:24:28 [serving_speech.py:76] Loaded 0 supported speakers: []
(APIServer pid=2103043) INFO 03-16 13:24:28 [serving_speech.py:107] Using codec frame rate from hf_config: 13.0 Hz
(APIServer pid=2103043) INFO 03-16 13:24:28 [api_server.py:265] Starting vLLM API server (pure diffusion mode) on http://0.0.0.0:8091
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:38] Available routes are:
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /openapi.json, Methods: GET, HEAD
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /docs, Methods: GET, HEAD
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /docs/oauth2-redirect, Methods: GET, HEAD
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /redoc, Methods: GET, HEAD
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /load, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /version, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /scale_elastic_ep, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /is_scaling_elastic_ep, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /tokenize, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /detokenize, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /inference/v1/generate, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /metrics, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /health, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /ping, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /ping, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /invocations, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/chat/completions/render, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/responses, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/responses/{response_id}, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/responses/{response_id}/cancel, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/completions, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/completions/render, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/messages, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/chat/completions, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/audio/speech, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/audio/voices, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /health, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/models, Methods: GET
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/images/generations, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/images/edits, Methods: POST
(APIServer pid=2103043) INFO 03-16 13:24:28 [launcher.py:47] Route: /v1/videos, Methods: POST
(APIServer pid=2103043) INFO:     Started server process [2103043]
(APIServer pid=2103043) INFO:     Waiting for application startup.
(APIServer pid=2103043) INFO:     Application startup complete.
+ echo '=== GPU STATUS ==='
=== GPU STATUS ===
+ nvidia-smi
Mon Mar 16 13:26:17 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:01:00.0 Off |                    0 |
| N/A   31C    P0             74W /  400W |   75921MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A   2104562      C   VLLM::Worker                                24404MiB |
|    0   N/A  N/A   2104567      C   VLLM::Worker                                  890MiB |
|    0   N/A  N/A   2104578      C   VLLM::Worker                                  890MiB |
|    0   N/A  N/A   2104579      C   VLLM::Worker                                24404MiB |
|    0   N/A  N/A   2104584      C   VLLM::Worker                                  890MiB |
|    0   N/A  N/A   2104589      C   VLLM::Worker                                24404MiB |
+-----------------------------------------------------------------------------------------+
+ echo '=== TEST ENDPOINT ==='
=== TEST ENDPOINT ===
+ curl -s -m 10 http://localhost:8091/v1/audio/voices
{"voices":[]}++ date
+ echo '=== ALL DONE at Mon Mar 16 01:26:17 PM UTC 2026 ==='
=== ALL DONE at Mon Mar 16 01:26:17 PM UTC 2026 ===