+ echo '=== Killing old processes ===' === Killing old processes === + nvidia-smi --query-compute-apps=pid --format=csv,noheader + xargs -r kill -9 + pkill -9 -f 'python3.*qwen' + sleep 3 + echo '=== GPU cleared ===' === GPU cleared === + nvidia-smi Mon Mar 16 14:38:19 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 560.35.05 Driver Version: 560.35.05 CUDA Version: 12.6 | |-----------------------------------------+------------------------+----------------------+ | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA A100-SXM4-80GB On | 00000000:01:00.0 Off | 0 | | N/A 29C P0 50W / 400W | 4MiB / 81920MiB | 0% Default | | | | Disabled | +-----------------------------------------+------------------------+----------------------+ +-----------------------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=========================================================================================| | No running processes found | +-----------------------------------------------------------------------------------------+ + echo '=== Starting vLLM-Omni ===' === Starting vLLM-Omni === + source /home/ubuntu/vllm_env/bin/activate ++ deactivate nondestructive ++ '[' -n '' ']' ++ '[' -n '' ']' ++ '[' -n /usr/bin/bash -o -n '' ']' ++ hash -r ++ '[' -n '' ']' ++ unset VIRTUAL_ENV ++ unset VIRTUAL_ENV_PROMPT ++ '[' '!' nondestructive = nondestructive ']' ++ VIRTUAL_ENV=/home/ubuntu/vllm_env ++ export VIRTUAL_ENV ++ _OLD_VIRTUAL_PATH=/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/.local/bin:/home/ubuntu/.cursor-server/bin/linux-x64/224838f96445be37e3db643a163a817c15b36060/bin/remote-cli:/home/ubuntu/.local/bin:/home/ubuntu/.cursor-server/bin/linux-x64/224838f96445be37e3db643a163a817c15b36060/bin/remote-cli:/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin ++ PATH=/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/.local/bin:/home/ubuntu/.cursor-server/bin/linux-x64/224838f96445be37e3db643a163a817c15b36060/bin/remote-cli:/home/ubuntu/.local/bin:/home/ubuntu/.cursor-server/bin/linux-x64/224838f96445be37e3db643a163a817c15b36060/bin/remote-cli:/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin ++ export PATH ++ '[' -n '' ']' ++ '[' -z '' ']' ++ _OLD_VIRTUAL_PS1= ++ PS1='(vllm_env) ' ++ export PS1 ++ VIRTUAL_ENV_PROMPT='(vllm_env) ' ++ export VIRTUAL_ENV_PROMPT ++ '[' -n /usr/bin/bash -o -n '' ']' ++ hash -r + python3 -c 'import flash_attn; print('\''flash_attn:'\'', flash_attn.__version__)' flash_attn: 2.8.3 + rm -f /home/ubuntu/vllm_server_log.txt + export VLLM_ATTENTION_BACKEND=FLASH_ATTN + VLLM_ATTENTION_BACKEND=FLASH_ATTN + SERVER_PID=2130421 + echo 'Server PID: 2130421' Server PID: 2130421 + echo '=== Waiting for server (max 300s) ===' === Waiting for server (max 300s) === + /home/ubuntu/vllm_env/bin/python3 /home/ubuntu/launch_server.py ++ seq 1 60 + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 1x5s' Waiting... 1x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 2x5s' Waiting... 2x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 3x5s' Waiting... 3x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 4x5s' Waiting... 4x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 5x5s' Waiting... 5x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 6x5s' Waiting... 6x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 7x5s' Waiting... 7x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 8x5s' Waiting... 8x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 9x5s' Waiting... 9x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 10x5s' Waiting... 10x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 11x5s' Waiting... 11x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 12x5s' Waiting... 12x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 13x5s' Waiting... 13x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + grep -q 200 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + echo 'Waiting... 14x5s' Waiting... 14x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 15x5s' Waiting... 15x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 16x5s' Waiting... 16x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 17x5s' Waiting... 17x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 18x5s' Waiting... 18x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 19x5s' Waiting... 19x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 20x5s' Waiting... 20x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 21x5s' Waiting... 21x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 22x5s' Waiting... 22x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 23x5s' Waiting... 23x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 24x5s' Waiting... 24x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 25x5s' Waiting... 25x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 26x5s' Waiting... 26x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 27x5s' Waiting... 27x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 28x5s' Waiting... 28x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 29x5s' Waiting... 29x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 30x5s' Waiting... 30x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 31x5s' Waiting... 31x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 32x5s' Waiting... 32x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 33x5s' Waiting... 33x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + grep -q 200 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + echo 'Waiting... 34x5s' Waiting... 34x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 35x5s' Waiting... 35x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 36x5s' Waiting... 36x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 37x5s' Waiting... 37x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 38x5s' Waiting... 38x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 39x5s' Waiting... 39x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 40x5s' Waiting... 40x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 41x5s' Waiting... 41x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 42x5s' Waiting... 42x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 43x5s' Waiting... 43x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 44x5s' Waiting... 44x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 45x5s' Waiting... 45x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 46x5s' Waiting... 46x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 47x5s' Waiting... 47x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 48x5s' Waiting... 48x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 49x5s' Waiting... 49x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 50x5s' Waiting... 50x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 51x5s' Waiting... 51x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 52x5s' Waiting... 52x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 53x5s' Waiting... 53x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 54x5s' Waiting... 54x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 55x5s' Waiting... 55x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 56x5s' Waiting... 56x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 57x5s' Waiting... 57x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 58x5s' Waiting... 58x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 59x5s' Waiting... 59x5s + for i in $(seq 1 60) + sleep 5 + kill -0 2130421 + curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health + grep -q 200 + echo 'Waiting... 60x5s' Waiting... 60x5s + echo '=== Server Log ===' === Server Log === + cat /home/ubuntu/vllm_server_log.txt Using stage config: /home/ubuntu/qwen3_tts_optimized.yaml Running: /home/ubuntu/vllm_env/bin/vllm-omni serve Qwen/Qwen3-TTS-12Hz-1.7B-Base --omni --stage-configs-path /home/ubuntu/qwen3_tts_optimized.yaml --host 0.0.0.0 --port 8091 --trust-remote-code INFO 03-16 14:38:31 [logo.py:45] █ █ █▄ ▄█ ▄▀▀▀▀▄ █▄ ▄█ █▄ █ ▀█▀ INFO 03-16 14:38:31 [logo.py:45] ▄▄ ▄█ █ █ █ ▀▄▀ █ ▄▄▄ █ █ █ ▀▄▀ █ █ ▀▄ █ █ INFO 03-16 14:38:31 [logo.py:45] █▄█▀ █ █ █ █ █ █ █ █ █ ▀▄█ █ INFO 03-16 14:38:31 [logo.py:45] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ ▀▀▀▀ ▀ ▀ ▀ ▀ ▀▀▀ INFO 03-16 14:38:31 [logo.py:45] (APIServer pid=2130421) INFO 03-16 14:38:31 [utils.py:287] vLLM server version 0.16.0, serving model Qwen/Qwen3-TTS-12Hz-1.7B-Base (APIServer pid=2130421) INFO 03-16 14:38:31 [utils.py:223] non-default args: {'model_tag': 'Qwen/Qwen3-TTS-12Hz-1.7B-Base', 'host': '0.0.0.0', 'port': 8091, 'model': 'Qwen/Qwen3-TTS-12Hz-1.7B-Base', 'trust_remote_code': True} (APIServer pid=2130421) INFO 03-16 14:38:31 [weight_utils.py:50] Using model weights format ['*'] (APIServer pid=2130421) INFO 03-16 14:38:31 [omni.py:181] Initializing stages for model: Qwen/Qwen3-TTS-12Hz-1.7B-Base (APIServer pid=2130421) INFO 03-16 14:38:31 [omni.py:313] No omni_master_address provided, defaulting to localhost (127.0.0.1) (APIServer pid=2130421) WARNING 03-16 14:38:31 [utils.py:111] Filtered out 1 callable object(s) from base_engine_args that are not compatible with OmegaConf: ['dispatch_function']. (APIServer pid=2130421) INFO 03-16 14:38:31 [initialization.py:270] Loaded OmniTransferConfig with 1 connector configurations (APIServer pid=2130421) INFO 03-16 14:38:31 [factory.py:46] Created connector: SharedMemoryConnector (APIServer pid=2130421) INFO 03-16 14:38:31 [initialization.py:60] Created connector for 0 -> 1: SharedMemoryConnector (APIServer pid=2130421) INFO 03-16 14:38:31 [omni.py:347] [AsyncOrchestrator] Loaded 2 stages [Stage-0] INFO 03-16 14:38:39 [omni_stage.py:1132] [Stage-0] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize) [Stage-0] INFO 03-16 14:38:39 [initialization.py:324] [Stage-0] Initializing OmniConnectors with config keys: ['to_stage_1'] [Stage-0] INFO 03-16 14:38:39 [omni_stage.py:79] NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [2026-03-16 14:38:39] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [2026-03-16 14:38:40] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. (APIServer pid=2130421) INFO 03-16 14:38:40 [omni.py:458] [AsyncOrchestrator] Waiting for 2 stages to initialize (timeout: 600s) [Stage-1] INFO 03-16 14:38:40 [omni_stage.py:1132] [Stage-1] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize) [Stage-1] INFO 03-16 14:38:40 [initialization.py:324] [Stage-1] Initializing OmniConnectors with config keys: ['from_stage_0'] [Stage-1] INFO 03-16 14:38:40 [factory.py:46] Created connector: SharedMemoryConnector [Stage-1] INFO 03-16 14:38:40 [initialization.py:60] Created connector for 0 -> 1: SharedMemoryConnector [Stage-1] INFO 03-16 14:38:40 [omni_stage.py:79] NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [2026-03-16 14:38:40] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [Stage-0] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values [Stage-0] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values [Stage-0] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values [Stage-0] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [2026-03-16 14:38:40] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. [Stage-1] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values [Stage-1] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values [Stage-1] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values [Stage-1] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values [Stage-0] INFO 03-16 14:38:50 [model.py:529] Resolved architecture: Qwen3TTSTalkerForConditionalGeneration [Stage-1] INFO 03-16 14:38:50 [model.py:529] Resolved architecture: Qwen3TTSCode2Wav [Stage-0] INFO 03-16 14:38:50 [model.py:1549] Using max model len 4096 [Stage-0] INFO 03-16 14:38:50 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=512. [Stage-0] INFO 03-16 14:38:50 [vllm.py:689] Asynchronous scheduling is disabled. [Stage-1] INFO 03-16 14:38:51 [model.py:1549] Using max model len 32768 [Stage-1] INFO 03-16 14:38:51 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=8192. [Stage-1] INFO 03-16 14:38:51 [vllm.py:689] Asynchronous scheduling is disabled. (EngineCore_DP0 pid=2130871) [Stage-1] INFO 03-16 14:39:00 [core.py:97] Initializing a V1 LLM engine (v0.16.0) with config: model='Qwen/Qwen3-TTS-12Hz-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-TTS-12Hz-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-TTS-12Hz-1.7B-Base, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 32, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} (EngineCore_DP0 pid=2130871) [Stage-1] WARNING 03-16 14:39:00 [multiproc_executor.py:921] Reducing Torch parallelism from 16 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. (EngineCore_DP0 pid=2130874) [Stage-0] INFO 03-16 14:39:01 [core.py:97] Initializing a V1 LLM engine (v0.16.0) with config: model='Qwen/Qwen3-TTS-12Hz-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-TTS-12Hz-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-TTS-12Hz-1.7B-Base, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [512], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 32, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} (EngineCore_DP0 pid=2130874) [Stage-0] WARNING 03-16 14:39:01 [multiproc_executor.py:921] Reducing Torch parallelism from 16 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. [Stage-1] INFO 03-16 14:39:09 [parallel_state.py:1234] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60659 backend=nccl [Stage-1] INFO 03-16 14:39:09 [parallel_state.py:1445] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A [Stage-0] INFO 03-16 14:39:09 [parallel_state.py:1234] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:51091 backend=nccl [Stage-0] INFO 03-16 14:39:09 [parallel_state.py:1445] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A /bin/sh: 1: sox: not found [2026-03-16 14:39:10] WARNING __init__.py:10: SoX could not be found! If you do not have SoX, proceed here: - - - http://sox.sourceforge.net/ - - - If you do (or think that you should) have SoX, double-check your path variables. (Worker pid=2131034) [Stage-1] INFO 03-16 14:39:10 [gpu_model_runner.py:4124] Starting to load model Qwen/Qwen3-TTS-12Hz-1.7B-Base... /bin/sh: 1: sox: not found [2026-03-16 14:39:10] WARNING __init__.py:10: SoX could not be found! If you do not have SoX, proceed here: - - - http://sox.sourceforge.net/ - - - If you do (or think that you should) have SoX, double-check your path variables. (Worker pid=2131039) [Stage-0] INFO 03-16 14:39:10 [gpu_model_runner.py:4124] Starting to load model Qwen/Qwen3-TTS-12Hz-1.7B-Base... (Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:10 [vllm.py:1535] `torch.compile` is turned on, but the model Qwen/Qwen3-TTS-12Hz-1.7B-Base does not support it. Please open an issue on GitHub if you want it to be supported. (Worker pid=2131034) [Stage-1] INFO 03-16 14:39:10 [default_loader.py:293] Loading weights took 8325668.04 seconds (Worker pid=2131039) [Stage-0] INFO 03-16 14:39:11 [cuda.py:367] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. (Worker pid=2131039) :1184: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. (Worker pid=2131039) :1184: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. (Worker pid=2131039) [Stage-0] INFO 03-16 14:39:11 [vllm.py:689] Asynchronous scheduling is disabled. (Worker pid=2131034) [Stage-1] INFO 03-16 14:39:11 [gpu_model_runner.py:4221] Model loading took 0.0 GiB memory and 0.002019 seconds (Worker pid=2131039) [Stage-0] INFO 03-16 14:39:11 [weight_utils.py:579] No model.safetensors.index.json found in remote. (Worker pid=2131039) Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00