++ date + echo 'STARTING at Mon Mar 16 01:23:13 PM UTC 2026' STARTING at Mon Mar 16 01:23:13 PM UTC 2026 + nvidia-smi --query-compute-apps=pid --format=csv,noheader + xargs -r kill -9 + pkill -9 -f vllm /home/ubuntu/run_all.sh: line 9:+ sleep 3 + echo '=== GPU AFTER CLEANUP ===' === GPU AFTER CLEANUP === + nvidia-smi Mon Mar 16 13:23:16 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 560.35.05 Driver Version: 560.35.05 CUDA Version: 1Mon Mar 16 13:23:16 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 560.35.05 Driver Version: 560.35.05 CUDA Version: 12.6 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | G| 0 NVIDIA A100-SXM4-80GB On | 00000000:01:00.0 Off | 0 | | N/A 30C P0 51W / 400W | 1MiB / 81920MiB | 0% Default | | | 0 NVIDIA A100-SXM4-80GB On | 00000000:01:00.0 Off | 0 | | N/A 30C P0 51W / 400W | 1MiB / 81920MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=====================================================================+ echo '=== LAUNCHING SERVER ===' === LAUNCHING SERVER === + rm -f /home/ubuntu/vllm_server_log.txt + export VLLM_ATTENTION_BACKEND=TORCH_SDPA + VLLM_ATTENTION_BACKEND=TORCH_SDPA + SERVER_PID=2103038 + echo+ echo '=== LAUNCHING SERVER ===' === LAUNCHING SERVER === + rm -f /home/ubuntu/vllm_server_log.txt + export VLLM_ATTENTION_BACKEND=TORCH_SDPA + VLLM_ATTENTION_BACKEND=TORCH_SDPA + SERVER_PID=2103043 + ech+ kill -0 2103038 + echo 'Alive at 1x5s = 5s' Alive at 1x5s = 5s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 2x5s = 10s' Alive at 2x5s = 10s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 3x5s = 15s' Alive at 3x5s = 15s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 4x5s = 20s' Alive at 4x5s = 20s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 5x5s = 25s' Alive at 5x5s = 25s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 6x5s = 30s' Alive at 6x5s = 30s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 7x5s = 35s' Alive at 7x5s = 35s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 8x5s = 40s' Alive at 8x5s = 40s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 9x5s = 45s' Alive at 9x5s = 45s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 10x5s = 50s' Alive at 10x5s = 50s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 11x5s = 55s' Alive at 11x5s = 55s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 12x5s = 60s' Alive at 12x5s = 60s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 13x5s = 65s' Alive at 13x5s = 65s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 14x5s = 70s' Alive at 14x5s = 70s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 15x5s = 75s' Alive at 15x5s = 75s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 16x5s = 80s' Alive at 16x5s = 80s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 17x5s = 85s' Alive at 17x5s = 85s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 18x5s = 90s' Alive at 18x5s = 90s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 19x5s = 95s' Alive at 19x5s = 95s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 20x5s = 100s' Alive at 20x5s = 100s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 21x5s = 105s' Alive at 21x5s = 105s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 22x5s = 110s' Alive at 22x5s = 110s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 23x5s = 115s' Alive at 23x5s = 115s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 24x5s = 120s' Alive at 24x5s = 120s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 25x5s = 125s' Alive at 25x5s = 125s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 26x5s = 130s' Alive at 26x5s = 130s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 27x5s = 135s' Alive at 27x5s = 135s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 28x5s = 140s' Alive at 28x5s = 140s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 29x5s = 145s' Alive at 29x5s = 145s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 30x5s = 150s' Alive at 30x5s = 150s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 31x5s = 155s' Alive at 31x5s = 155s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 32x5s = 160s' Alive at 32x5s = 160s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 33x5s = 165s' Alive at 33x5s = 165s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 34x5s = 170s' Alive at 34x5s = 170s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 35x5s = 175s' Alive at 35x5s = 175s + for i in $(seq 1 36) + sleep 5 + kill -0 2103038 + echo 'Alive at 36x5s = 180s' Alive at 36x5s = 180s + echo '=== SERVER LOG ===' === SERVER LOG === + cat /home/ubuntu/vllm_server_log.txt Using stage config: /home/ubuntu/vllm_env/lib/pyt+ kill -0 2103043 + echo 'Alive at 36x5s = 180s' Alive at 36x5s = 180s + echo '=== SERVER LOG ===' === SERVER LOG === + cat /home/ubuntu/vllm_server_log.txt Using stage config: /home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml Running: /home/ubuntu/vllm_env/bin/vllm-omni serve Qwen/Qwen3-TTS-12Hz-1.7B-Base --omni --stage-configs-path /home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml --host 0.0.0.0 --port 8091 --trust-remote-code --enforce-eager WARNING 03-16 13:23:24 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation INFO 03-16 13:23:25 [logo.py:45] █ █ █▄ ▄█ ▄▀▀▀▀▄ █▄ ▄█ █▄ █ ▀█▀ INFO 03-16 13:23:25 [logo.py:45] ▄▄ ▄█ █ █ █ ▀▄▀ █ ▄▄▄ █ █ █ ▀▄▀ █ █ ▀▄ █ █ INFO 03-16 13:23:25 [logo.py:45] █▄█▀ █ █ █ █ █ █ █ █ █ ▀▄█ █ INFO 03-16 13:23:25 [logo.py:45] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ ▀▀▀▀ ▀ ▀ ▀ ▀ ▀▀▀ INFO 03-16 13:23:25 [logo.py:45] (APIServer pid=2103043) INFO 03-16 13:23:25 [utils.py:287] vLLM server version 0.16.0, serving model Qwen/Qwen3-TTS-12Hz-1.7B-Base (APIServer pid=2103043) INFO 03-16 13:23:25 [utils.py:223] non-default args: {'model_tag': 'Qwen/Qwen3-TTS-12Hz-1.7B-Base', 'host': '0.0.0.0', 'port': 8091, 'model': 'Qwen/Qwen3-TTS-12Hz-1.7B-Base', 'trust_remote_code': True, 'enforce_eager': True} (APIServer pid=2103043) INFO 03-16 13:23:25 [weight_utils.py:50] Using model weights format ['*'] (APIServer pid=2103043) INFO 03-16 13:23:25 [omni.py:181] Initializing stages for model: Qwen/Qwen3-TTS-12Hz-1.7B-Base (APIServer pid=2103043) INFO 03-16 13:23:25 [omni.py:313] No omni_master_address provided, defaulting to localhost (127.0.0.1) (APIServer pid=2103043) WARNING 03-16 13:23:25 [utils.py:111] Filtered out 1 callable object(s) from base_engine_args that are not compatible with OmegaConf: ['dispatch_function']. (APIServer pid=2103043) INFO 03-16 13:23:26 [initialization.py:270] Loaded OmniTransferConfig with 1 connector configurations (APIServer pid=2103043) INFO 03-16 13:23:26 [factory.py:46] Created connector: SharedMemoryConnector (APIServer pid=2103043) INFO 03-16 13:23:26 [initialization.py:60] Created connector for 0 -> 1: SharedMemoryConnector (APIServer pid=2103043) INFO 03-16 13:23:26 [omni.py:347] [AsyncOrchestrator] Loaded 2 stages [Stage-0] WARNING 03-16 13:23:34 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation [Stage-1] WARNING 03-16 13:23:34 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation [Stage-0] INFO 03-16 13:23:34 [omni_stage.py:1132] [Stage-0] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize) [Stage-0] INFO 03-16 13:23:34 [initialization.py:324] [Stage-0] Initializing OmniConnectors with config keys: ['to_stage_1'] [Stage-0] INFO 03-16 13:23:34 [omni_stage.py:79] NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [2026-03-16 13:23:34] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [2026-03-16 13:23:35] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. (APIServer pid=2103043) INFO 03-16 13:23:35 [omni.py:458] [AsyncOrchestrator] Waiting for 2 stages to initialize (timeout: 600s) [Stage-1] INFO 03-16 13:23:35 [omni_stage.py:1132] [Stage-1] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize) [Stage-1] INFO 03-16 13:23:35 [initialization.py:324] [Stage-1] Initializing OmniConnectors with config keys: ['from_stage_0'] [Stage-1] INFO 03-16 13:23:35 [factory.py:46] Created connector: SharedMemoryConnector [Stage-1] INFO 03-16 13:23:35 [initialization.py:60] Created connector for 0 -> 1: SharedMemoryConnector [Stage-1] INFO 03-16 13:23:35 [omni_stage.py:79] NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks [Stage-0] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values [Stage-0] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values [Stage-0] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [2026-03-16 13:23:35] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [Stage-0] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [2026-03-16 13:23:35] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [Stage-1] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values [Stage-1] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values [Stage-1] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values [Stage-1] INFO 03-16 13:23:35 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values [Stage-1] INFO 03-16 13:23:45 [model.py:529] Resolved architecture: Qwen3TTSCode2Wav [Stage-1] INFO 03-16 13:23:45 [model.py:1549] Using max model len 32768 [Stage-1] INFO 03-16 13:23:45 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=8192. [Stage-1] INFO 03-16 13:23:45 [vllm.py:689] Asynchronous scheduling is disabled. [Stage-1] WARNING 03-16 13:23:45 [vllm.py:727] Enforce eager set, overriding optimization level to -O0 [Stage-1] INFO 03-16 13:23:45 [vllm.py:845] Cudagraph is disabled under eager mode [Stage-0] INFO 03-16 13:23:45 [model.py:529] Resolved architecture: Qwen3TTSTalkerForConditionalGeneration [Stage-0] INFO 03-16 13:23:46 [model.py:1549] Using max model len 4096 [Stage-0] INFO 03-16 13:23:46 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=512. [Stage-0] INFO 03-16 13:23:46 [vllm.py:689] Asynchronous scheduling is disabled. [Stage-1] WARNING 03-16 13:23:55 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation [Stage-0] WARNING 03-16 13:23:55 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation (EngineCore_DP0 pid=2104145) [Stage-1] INFO 03-16 13:23:56 [core.py:97] Initializing a V1 LLM engine (v0.16.0) with config: model='Qwen/Qwen3-TTS-12Hz-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-TTS-12Hz-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-TTS-12Hz-1.7B-Base, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['all'], 'splitting_ops': [], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': [], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 0, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} (EngineCore_DP0 pid=2104145) [Stage-1] WARNING 03-16 13:23:56 [multiproc_executor.py:921] Reducing Torch parallelism from 16 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. (EngineCore_DP0 pid=2104154) [Stage-0] INFO 03-16 13:23:56 [core.py:97] Initializing a V1 LLM engine (v0.16.0) with config: model='Qwen/Qwen3-TTS-12Hz-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-TTS-12Hz-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-TTS-12Hz-1.7B-Base, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [512], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 2, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} (EngineCore_DP0 pid=2104154) [Stage-0] WARNING 03-16 13:23:56 [multiproc_executor.py:921] Reducing Torch parallelism from 16 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. [Stage-1] WARNING 03-16 13:24:03 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation [Stage-0] WARNING 03-16 13:24:03 [envs.py:94] No Flash Attention backend found, using pytorch SDPA implementation [Stage-1] INFO 03-16 13:24:04 [parallel_state.py:1234] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44833 backend=nccl [Stage-1] INFO 03-16 13:24:04 [parallel_state.py:1445] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A /bin/sh: 1: sox: not found [2026-03-16 13:24:04] WARNING __init__.py:10: SoX could not be found! If you do not have SoX, proceed here: - - - http://sox.sourceforge.net/ - - - If you do (or think that you should) have SoX, double-check your path variables. (Worker pid=2104567) [Stage-1] INFO 03-16 13:24:05 [gpu_model_runner.py:4124] Starting to load model Qwen/Qwen3-TTS-12Hz-1.7B-Base... [Stage-0] INFO 03-16 13:24:05 [parallel_state.py:1234] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:34577 backend=nccl [Stage-0] INFO 03-16 13:24:05 [parallel_state.py:1445] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A (Worker pid=2104567) [Stage-1] INFO 03-16 13:24:05 [default_loader.py:293] Loading weights took 8321162.85 seconds /bin/sh: 1: sox: not found [2026-03-16 13:24:05] WARNING __init__.py:10: SoX could not be found! If you do not have SoX, proceed here: - - - http://sox.sourceforge.net/ - - - If you do (or think that you should) have SoX, double-check your path variables. (Worker pid=2104589) [Stage-0] INFO 03-16 13:24:05 [gpu_model_runner.py:4124] Starting to load model Qwen/Qwen3-TTS-12Hz-1.7B-Base... (Worker pid=2104567) [Stage-1] INFO 03-16 13:24:06 [gpu_model_runner.py:4221] Model loading took 0.0 GiB memory and 0.001688 seconds (Worker pid=2104567) [Stage-1] INFO 03-16 13:24:06 [kernel_warmup.py:44] Skipping FlashInfer autotune because it is disabled. (Worker pid=2104567) `torch_dtype` is deprecated! Use `dtype` instead! (Worker pid=2104567) [2026-03-16 13:24:06] WARNING logging.py:328: `torch_dtype` is deprecated! Use `dtype` instead! (Worker pid=2104567) [Stage-1] INFO 03-16 13:24:06 [configuration_qwen3_tts_tokenizer_v2.py:156] encoder_config is None. Initializing encoder with default values (Worker pid=2104567) [Stage-1] INFO 03-16 13:24:06 [configuration_qwen3_tts_tokenizer_v2.py:159] decoder_config is None. Initializing decoder with default values (Worker pid=2104589) [Stage-0] INFO 03-16 13:24:06 [cuda.py:367] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. (Worker pid=2104589) :1184: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. (Worker pid=2104589) :1184: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. (Worker pid=2104589) [Stage-0] INFO 03-16 13:24:06 [vllm.py:689] Asynchronous scheduling is disabled. (Worker pid=2104567) [Stage-1] WARNING 03-16 13:24:06 [gpu_generation_model_runner.py:451] Dummy sampler run is not implemented for generation model (EngineCore_DP0 pid=2104145) [Stage-1] INFO 03-16 13:24:06 [core.py:278] init engine (profile, create kv cache, warmup model) took 0.70 seconds (Worker pid=2104589) [Stage-0] INFO 03-16 13:24:07 [weight_utils.py:579] No model.safetensors.index.json found in remote. (Worker pid=2104589) Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00