+ echo '=== Killing old processes ==='
=== Killing old processes ===
+ nvidia-smi --query-compute-apps=pid --format=csv,noheader
+ xargs -r kill -9
+ pkill -9 -f 'python3.*qwen'
+ sleep 3
+ echo '=== GPU cleared ==='
=== GPU cleared ===
+ nvidia-smi
Mon Mar 16 14:38:19 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:01:00.0 Off |                    0 |
| N/A   29C    P0             50W /  400W |       4MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
+ echo '=== Starting vLLM-Omni ==='
=== Starting vLLM-Omni ===
+ source /home/ubuntu/vllm_env/bin/activate
++ deactivate nondestructive
++ '[' -n '' ']'
++ '[' -n '' ']'
++ '[' -n /usr/bin/bash -o -n '' ']'
++ hash -r
++ '[' -n '' ']'
++ unset VIRTUAL_ENV
++ unset VIRTUAL_ENV_PROMPT
++ '[' '!' nondestructive = nondestructive ']'
++ VIRTUAL_ENV=/home/ubuntu/vllm_env
++ export VIRTUAL_ENV
++ _OLD_VIRTUAL_PATH=/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/.local/bin:/home/ubuntu/.cursor-server/bin/linux-x64/224838f96445be37e3db643a163a817c15b36060/bin/remote-cli:/home/ubuntu/.local/bin:/home/ubuntu/.cursor-server/bin/linux-x64/224838f96445be37e3db643a163a817c15b36060/bin/remote-cli:/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
++ PATH=/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/vllm_env/bin:/home/ubuntu/.local/bin:/home/ubuntu/.cursor-server/bin/linux-x64/224838f96445be37e3db643a163a817c15b36060/bin/remote-cli:/home/ubuntu/.local/bin:/home/ubuntu/.cursor-server/bin/linux-x64/224838f96445be37e3db643a163a817c15b36060/bin/remote-cli:/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
++ export PATH
++ '[' -n '' ']'
++ '[' -z '' ']'
++ _OLD_VIRTUAL_PS1=
++ PS1='(vllm_env) '
++ export PS1
++ VIRTUAL_ENV_PROMPT='(vllm_env) '
++ export VIRTUAL_ENV_PROMPT
++ '[' -n /usr/bin/bash -o -n '' ']'
++ hash -r
+ python3 -c 'import flash_attn; print('\''flash_attn:'\'', flash_attn.__version__)'
flash_attn: 2.8.3
+ rm -f /home/ubuntu/vllm_server_log.txt
+ export VLLM_ATTENTION_BACKEND=FLASH_ATTN
+ VLLM_ATTENTION_BACKEND=FLASH_ATTN
+ SERVER_PID=2130421
+ echo 'Server PID: 2130421'
Server PID: 2130421
+ echo '=== Waiting for server (max 300s) ==='
=== Waiting for server (max 300s) ===
+ /home/ubuntu/vllm_env/bin/python3 /home/ubuntu/launch_server.py
++ seq 1 60
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 1x5s'
Waiting... 1x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 2x5s'
Waiting... 2x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 3x5s'
Waiting... 3x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 4x5s'
Waiting... 4x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 5x5s'
Waiting... 5x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 6x5s'
Waiting... 6x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 7x5s'
Waiting... 7x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 8x5s'
Waiting... 8x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 9x5s'
Waiting... 9x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 10x5s'
Waiting... 10x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 11x5s'
Waiting... 11x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 12x5s'
Waiting... 12x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 13x5s'
Waiting... 13x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ grep -q 200
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ echo 'Waiting... 14x5s'
Waiting... 14x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 15x5s'
Waiting... 15x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 16x5s'
Waiting... 16x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 17x5s'
Waiting... 17x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 18x5s'
Waiting... 18x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 19x5s'
Waiting... 19x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 20x5s'
Waiting... 20x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 21x5s'
Waiting... 21x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 22x5s'
Waiting... 22x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 23x5s'
Waiting... 23x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 24x5s'
Waiting... 24x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 25x5s'
Waiting... 25x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 26x5s'
Waiting... 26x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 27x5s'
Waiting... 27x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 28x5s'
Waiting... 28x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 29x5s'
Waiting... 29x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 30x5s'
Waiting... 30x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 31x5s'
Waiting... 31x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 32x5s'
Waiting... 32x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 33x5s'
Waiting... 33x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ grep -q 200
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ echo 'Waiting... 34x5s'
Waiting... 34x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 35x5s'
Waiting... 35x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 36x5s'
Waiting... 36x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 37x5s'
Waiting... 37x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 38x5s'
Waiting... 38x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 39x5s'
Waiting... 39x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 40x5s'
Waiting... 40x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 41x5s'
Waiting... 41x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 42x5s'
Waiting... 42x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 43x5s'
Waiting... 43x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 44x5s'
Waiting... 44x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 45x5s'
Waiting... 45x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 46x5s'
Waiting... 46x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 47x5s'
Waiting... 47x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 48x5s'
Waiting... 48x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 49x5s'
Waiting... 49x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 50x5s'
Waiting... 50x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 51x5s'
Waiting... 51x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 52x5s'
Waiting... 52x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 53x5s'
Waiting... 53x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 54x5s'
Waiting... 54x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 55x5s'
Waiting... 55x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 56x5s'
Waiting... 56x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 57x5s'
Waiting... 57x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 58x5s'
Waiting... 58x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 59x5s'
Waiting... 59x5s
+ for i in $(seq 1 60)
+ sleep 5
+ kill -0 2130421
+ curl -s -o /dev/null -w '%{http_code}' http://localhost:8091/health
+ grep -q 200
+ echo 'Waiting... 60x5s'
Waiting... 60x5s
+ echo '=== Server Log ==='
=== Server Log ===
+ cat /home/ubuntu/vllm_server_log.txt
Using stage config: /home/ubuntu/qwen3_tts_optimized.yaml
Running: /home/ubuntu/vllm_env/bin/vllm-omni serve Qwen/Qwen3-TTS-12Hz-1.7B-Base --omni --stage-configs-path /home/ubuntu/qwen3_tts_optimized.yaml --host 0.0.0.0 --port 8091 --trust-remote-code
INFO 03-16 14:38:31 [logo.py:45]        █     █     █▄   ▄█       ▄▀▀▀▀▄ █▄   ▄█ █▄    █ ▀█▀ 
INFO 03-16 14:38:31 [logo.py:45]  ▄▄ ▄█ █     █     █ ▀▄▀ █  ▄▄▄  █    █ █ ▀▄▀ █ █ ▀▄  █  █  
INFO 03-16 14:38:31 [logo.py:45]   █▄█▀ █     █     █     █       █    █ █     █ █   ▀▄█  █  
INFO 03-16 14:38:31 [logo.py:45]    ▀▀  ▀▀▀▀▀ ▀▀▀▀▀ ▀     ▀        ▀▀▀▀  ▀     ▀ ▀     ▀ ▀▀▀ 
INFO 03-16 14:38:31 [logo.py:45] 
(APIServer pid=2130421) INFO 03-16 14:38:31 [utils.py:287] vLLM server version 0.16.0, serving model Qwen/Qwen3-TTS-12Hz-1.7B-Base
(APIServer pid=2130421) INFO 03-16 14:38:31 [utils.py:223] non-default args: {'model_tag': 'Qwen/Qwen3-TTS-12Hz-1.7B-Base', 'host': '0.0.0.0', 'port': 8091, 'model': 'Qwen/Qwen3-TTS-12Hz-1.7B-Base', 'trust_remote_code': True}
(APIServer pid=2130421) INFO 03-16 14:38:31 [weight_utils.py:50] Using model weights format ['*']
(APIServer pid=2130421) INFO 03-16 14:38:31 [omni.py:181] Initializing stages for model: Qwen/Qwen3-TTS-12Hz-1.7B-Base
(APIServer pid=2130421) INFO 03-16 14:38:31 [omni.py:313] No omni_master_address provided, defaulting to localhost (127.0.0.1)
(APIServer pid=2130421) WARNING 03-16 14:38:31 [utils.py:111] Filtered out 1 callable object(s) from base_engine_args that are not compatible with OmegaConf: ['dispatch_function']. 
(APIServer pid=2130421) INFO 03-16 14:38:31 [initialization.py:270] Loaded OmniTransferConfig with 1 connector configurations
(APIServer pid=2130421) INFO 03-16 14:38:31 [factory.py:46] Created connector: SharedMemoryConnector
(APIServer pid=2130421) INFO 03-16 14:38:31 [initialization.py:60] Created connector for 0 -> 1: SharedMemoryConnector
(APIServer pid=2130421) INFO 03-16 14:38:31 [omni.py:347] [AsyncOrchestrator] Loaded 2 stages
[Stage-0] INFO 03-16 14:38:39 [omni_stage.py:1132] [Stage-0] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize)
[Stage-0] INFO 03-16 14:38:39 [initialization.py:324] [Stage-0] Initializing OmniConnectors with config keys: ['to_stage_1']
[Stage-0] INFO 03-16 14:38:39 [omni_stage.py:79] NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[2026-03-16 14:38:39] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[2026-03-16 14:38:40] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
(APIServer pid=2130421) INFO 03-16 14:38:40 [omni.py:458] [AsyncOrchestrator] Waiting for 2 stages to initialize (timeout: 600s)
[Stage-1] INFO 03-16 14:38:40 [omni_stage.py:1132] [Stage-1] ZMQ transport detected; disabling SHM IPC (shm_threshold_bytes set to maxsize)
[Stage-1] INFO 03-16 14:38:40 [initialization.py:324] [Stage-1] Initializing OmniConnectors with config keys: ['from_stage_0']
[Stage-1] INFO 03-16 14:38:40 [factory.py:46] Created connector: SharedMemoryConnector
[Stage-1] INFO 03-16 14:38:40 [initialization.py:60] Created connector for 0 -> 1: SharedMemoryConnector
[Stage-1] INFO 03-16 14:38:40 [omni_stage.py:79] NVML process-scoped memory available and PID host is available — concurrent init is safe, skipping locks
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[2026-03-16 14:38:40] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[Stage-0] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values
[Stage-0] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values
[Stage-0] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
[Stage-0] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[2026-03-16 14:38:40] WARNING configuration_utils.py:697: The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
[Stage-1] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values
[Stage-1] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values
[Stage-1] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
[Stage-1] INFO 03-16 14:38:40 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
[Stage-0] INFO 03-16 14:38:50 [model.py:529] Resolved architecture: Qwen3TTSTalkerForConditionalGeneration
[Stage-1] INFO 03-16 14:38:50 [model.py:529] Resolved architecture: Qwen3TTSCode2Wav
[Stage-0] INFO 03-16 14:38:50 [model.py:1549] Using max model len 4096
[Stage-0] INFO 03-16 14:38:50 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=512.
[Stage-0] INFO 03-16 14:38:50 [vllm.py:689] Asynchronous scheduling is disabled.
[Stage-1] INFO 03-16 14:38:51 [model.py:1549] Using max model len 32768
[Stage-1] INFO 03-16 14:38:51 [scheduler.py:224] Chunked prefill is enabled with max_num_batched_tokens=8192.
[Stage-1] INFO 03-16 14:38:51 [vllm.py:689] Asynchronous scheduling is disabled.
(EngineCore_DP0 pid=2130871) [Stage-1] INFO 03-16 14:39:00 [core.py:97] Initializing a V1 LLM engine (v0.16.0) with config: model='Qwen/Qwen3-TTS-12Hz-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-TTS-12Hz-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-TTS-12Hz-1.7B-Base, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 32, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore_DP0 pid=2130871) [Stage-1] WARNING 03-16 14:39:00 [multiproc_executor.py:921] Reducing Torch parallelism from 16 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
(EngineCore_DP0 pid=2130874) [Stage-0] INFO 03-16 14:39:01 [core.py:97] Initializing a V1 LLM engine (v0.16.0) with config: model='Qwen/Qwen3-TTS-12Hz-1.7B-Base', speculative_config=None, tokenizer='Qwen/Qwen3-TTS-12Hz-1.7B-Base', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-TTS-12Hz-1.7B-Base, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [512], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False, 'fuse_act_padding': False}, 'max_cudagraph_capture_size': 32, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore_DP0 pid=2130874) [Stage-0] WARNING 03-16 14:39:01 [multiproc_executor.py:921] Reducing Torch parallelism from 16 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
[Stage-1] INFO 03-16 14:39:09 [parallel_state.py:1234] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60659 backend=nccl
[Stage-1] INFO 03-16 14:39:09 [parallel_state.py:1445] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A
[Stage-0] INFO 03-16 14:39:09 [parallel_state.py:1234] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:51091 backend=nccl
[Stage-0] INFO 03-16 14:39:09 [parallel_state.py:1445] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A
/bin/sh: 1: sox: not found
[2026-03-16 14:39:10] WARNING __init__.py:10: SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    
(Worker pid=2131034) [Stage-1] INFO 03-16 14:39:10 [gpu_model_runner.py:4124] Starting to load model Qwen/Qwen3-TTS-12Hz-1.7B-Base...
/bin/sh: 1: sox: not found
[2026-03-16 14:39:10] WARNING __init__.py:10: SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:10 [gpu_model_runner.py:4124] Starting to load model Qwen/Qwen3-TTS-12Hz-1.7B-Base...
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:10 [vllm.py:1535] `torch.compile` is turned on, but the model Qwen/Qwen3-TTS-12Hz-1.7B-Base does not support it. Please open an issue on GitHub if you want it to be supported.
(Worker pid=2131034) [Stage-1] INFO 03-16 14:39:10 [default_loader.py:293] Loading weights took 8325668.04 seconds
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:11 [cuda.py:367] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
(Worker pid=2131039) <frozen importlib._bootstrap_external>:1184: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
(Worker pid=2131039) <frozen importlib._bootstrap_external>:1184: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead.
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:11 [vllm.py:689] Asynchronous scheduling is disabled.
(Worker pid=2131034) [Stage-1] INFO 03-16 14:39:11 [gpu_model_runner.py:4221] Model loading took 0.0 GiB memory and 0.002019 seconds
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:11 [weight_utils.py:579] No model.safetensors.index.json found in remote.
(Worker pid=2131039) Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
(Worker pid=2131034) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/7 [00:00<?, ?it/s]`torch_dtype` is deprecated! Use `dtype` instead!
(Worker pid=2131034) [2026-03-16 14:39:12] WARNING logging.py:328: `torch_dtype` is deprecated! Use `dtype` instead!
(Worker pid=2131034) [Stage-1] INFO 03-16 14:39:12 [configuration_qwen3_tts_tokenizer_v2.py:156] encoder_config is None. Initializing encoder with default values
(Worker pid=2131034) [Stage-1] INFO 03-16 14:39:12 [configuration_qwen3_tts_tokenizer_v2.py:159] decoder_config is None. Initializing decoder with default values
(Worker pid=2131039) Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.85it/s]
(Worker pid=2131039) Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.85it/s]
(Worker pid=2131039) 
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:12 [qwen3_tts_talker.py:1534] Loaded 381 weights for Qwen3TTSTalkerForConditionalGeneration
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 31 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 31 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  14%|█▍        | 1/7 [00:00<00:03,  1.54it/s](Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 23 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 23 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 15 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 15 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 7 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 7 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 3 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 3 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 1 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 1 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 7/7 [00:00<00:00, 10.76it/s]
(Worker pid=2131034) Capturing CUDA graphs (decode, FULL):   0%|          | 0/5 [00:00<?, ?it/s](Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [qwen3_tts_code2wav.py:196] Code2Wav input_ids length 15 not divisible by num_quantizers 16, likely a warmup run; returning empty audio.
Capturing CUDA graphs (decode, FULL):   0%|          | 0/5 [00:00<?, ?it/s]
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:12 [default_loader.py:293] Loading weights took 0.71 seconds
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] WorkerProc hit an exception.
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] Traceback (most recent call last):
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 275, in __call__
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     output = self.runnable(*args, **kwargs)
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     return self._call_impl(*args, **kwargs)
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     return forward_call(*args, **kwargs)
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     return func(*args, **kwargs)
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py", line 190, in forward
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     ctx_frames = int(req_ids[0].item())
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] torch.AcceleratorError: CUDA error: operation not permitted when stream is capturing
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] Search for `cudaErrorStreamCaptureUnsupported' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] 
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] 
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] During handling of the above exception, another exception occurred:
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] 
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] Traceback (most recent call last):
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 858, in worker_busy_loop
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     output = func(*args, **kwargs)
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     return func(*args, **kwargs)
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 471, in compile_or_warm_up_model
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     cuda_graph_memory_bytes = self.model_runner.capture_model()
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     return func(*args, **kwargs)
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 5223, in capture_model
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     self._capture_cudagraphs(
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 5323, in _capture_cudagraphs
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     dummy_run(
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     return func(*args, **kwargs)
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/worker/gpu_generation_model_runner.py", line 700, in _dummy_run
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     outputs = self.model(
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/compilation/cuda_graph.py", line 269, in __call__
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     with torch.cuda.graph(
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/torch/cuda/graphs.py", line 265, in __exit__
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     self.cuda_graph.capture_end()
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/torch/cuda/graphs.py", line 128, in capture_end
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863]     super().capture_end()
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] torch.AcceleratorError: CUDA error: operation failed due to a previous error during capture
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] Search for `cudaErrorStreamCaptureInvalidated' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(Worker pid=2131034) [Stage-1] ERROR 03-16 14:39:12 [multiproc_executor.py:863] 
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006] EngineCore failed to start.
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006] Traceback (most recent call last):
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 996, in run_engine_core
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     return func(*args, **kwargs)
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 740, in __init__
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     super().__init__(
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 113, in __init__
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     return func(*args, **kwargs)
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 275, in _initialize_kv_caches
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 118, in initialize_from_config
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     self.collective_rpc("compile_or_warm_up_model")
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 375, in collective_rpc
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     return aggregate(get_response())
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 358, in get_response
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006]     raise RuntimeError(
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006] RuntimeError: Worker failed with error 'CUDA error: operation failed due to a previous error during capture
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006] Search for `cudaErrorStreamCaptureInvalidated' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:12 [core.py:1006] ', please check the stack trace above for the root cause
(Worker pid=2131034) [Stage-1] WARNING 03-16 14:39:12 [multiproc_executor.py:797] WorkerProc was terminated
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:13 [gpu_model_runner.py:4221] Model loading took 3.62 GiB memory and 1.563678 seconds
(EngineCore_DP0 pid=2130871) [Stage-1] ERROR 03-16 14:39:14 [multiproc_executor.py:247] Worker proc VllmWorker-0 died unexpectedly, shutting down executor.
(EngineCore_DP0 pid=2130871) Process EngineCore_DP0:
(EngineCore_DP0 pid=2130871) Traceback (most recent call last):
(EngineCore_DP0 pid=2130871)   File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=2130871)     self.run()
(EngineCore_DP0 pid=2130871)   File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=2130871)     self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 1010, in run_engine_core
(EngineCore_DP0 pid=2130871)     raise e
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 996, in run_engine_core
(EngineCore_DP0 pid=2130871)     engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore_DP0 pid=2130871)     return func(*args, **kwargs)
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 740, in __init__
(EngineCore_DP0 pid=2130871)     super().__init__(
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 113, in __init__
(EngineCore_DP0 pid=2130871)     num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore_DP0 pid=2130871)     return func(*args, **kwargs)
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 275, in _initialize_kv_caches
(EngineCore_DP0 pid=2130871)     self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/executor/abstract.py", line 118, in initialize_from_config
(EngineCore_DP0 pid=2130871)     self.collective_rpc("compile_or_warm_up_model")
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 375, in collective_rpc
(EngineCore_DP0 pid=2130871)     return aggregate(get_response())
(EngineCore_DP0 pid=2130871)   File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 358, in get_response
(EngineCore_DP0 pid=2130871)     raise RuntimeError(
(EngineCore_DP0 pid=2130871) RuntimeError: Worker failed with error 'CUDA error: operation failed due to a previous error during capture
(EngineCore_DP0 pid=2130871) Search for `cudaErrorStreamCaptureInvalidated' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
(EngineCore_DP0 pid=2130871) CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(EngineCore_DP0 pid=2130871) For debugging consider passing CUDA_LAUNCH_BLOCKING=1
(EngineCore_DP0 pid=2130871) Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(EngineCore_DP0 pid=2130871) ', please check the stack trace above for the root cause
Process SpawnProcess-2:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/entrypoints/omni_stage.py", line 1075, in _stage_worker_async_entry
    asyncio.run(_stage_worker_async(model, stage_payload, in_q, out_q, batch_timeout, stage_init_timeout))
  File "/usr/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/usr/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/entrypoints/omni_stage.py", line 1206, in _stage_worker_async
    stage_engine = AsyncOmniLLM.from_vllm_config(
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/utils/func_utils.py", line 116, in inner
    return fn(*args, **kwargs)
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/entrypoints/async_omni_llm.py", line 208, in from_vllm_config
    return cls(
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/entrypoints/async_omni_llm.py", line 137, in __init__
    self.engine_core = EngineCoreClient.make_async_mp_client(
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
    return func(*args, **kwargs)
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 124, in make_async_mp_client
    return AsyncMPClient(*client_args)
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
    return func(*args, **kwargs)
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 835, in __init__
    super().__init__(
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 490, in __init__
    with launch_core_engines(vllm_config, executor_class, log_stats) as (
  File "/usr/lib/python3.10/contextlib.py", line 142, in __exit__
    next(self.gen)
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 925, in launch_core_engines
    wait_for_engine_startup(
  File "/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 984, in wait_for_engine_startup
    raise RuntimeError(
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:19 [configuration_qwen3_tts.py:489] talker_config is None. Initializing talker model with default values
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:19 [configuration_qwen3_tts.py:492] speaker_encoder_config is None. Initializing talker model with default values
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:19 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:19 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:19 [configuration_qwen3_tts.py:441] code_predictor_config is None. Initializing code_predictor model with default values
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:20 [backends.py:916] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/0247d0e358/rank_0_0/backbone for vLLM's torch.compile
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:20 [backends.py:976] Dynamo bytecode transform time: 6.47 s
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:24 [backends.py:351] Cache the graph of compile range (1, 512) for later use
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:26 [backends.py:368] Compiling a graph for compile range (1, 512) takes 2.01 s
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:26 [monitor.py:34] torch.compile takes 8.48 s in total
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:27 [base.py:81] Available KV cache memory: 19.62 GiB (process-scoped)
(EngineCore_DP0 pid=2130874) [Stage-0] INFO 03-16 14:39:27 [kv_cache_utils.py:1307] GPU KV cache size: 183,648 tokens
(EngineCore_DP0 pid=2130874) [Stage-0] INFO 03-16 14:39:27 [kv_cache_utils.py:1312] Maximum concurrency for 4,096 tokens per request: 44.84x
(Worker pid=2131039) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   0%|          | 0/7 [00:00<?, ?it/s]Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  43%|████▎     | 3/7 [00:00<00:00, 25.91it/s]Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  86%|████████▌ | 6/7 [00:00<00:00, 24.66it/s]Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 7/7 [00:00<00:00, 23.34it/s]
(Worker pid=2131039) Capturing CUDA graphs (decode, FULL):   0%|          | 0/5 [00:00<?, ?it/s]Capturing CUDA graphs (decode, FULL):  60%|██████    | 3/5 [00:00<00:00, 23.14it/s]Capturing CUDA graphs (decode, FULL): 100%|██████████| 5/5 [00:00<00:00, 24.60it/s]
(Worker pid=2131039) [Stage-0] INFO 03-16 14:39:29 [gpu_model_runner.py:5246] Graph capturing finished in 2 secs, took 0.08 GiB
(EngineCore_DP0 pid=2130874) [Stage-0] INFO 03-16 14:39:29 [core.py:278] init engine (profile, create kv cache, warmup model) took 16.05 seconds
(EngineCore_DP0 pid=2130874) [Stage-0] WARNING 03-16 14:39:30 [scheduler.py:166] Using custom scheduler class vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler. This scheduler interface is not public and compatibility may not be maintained.
(EngineCore_DP0 pid=2130874) [Stage-0] INFO 03-16 14:39:30 [factory.py:46] Created connector: SharedMemoryConnector
(EngineCore_DP0 pid=2130874) [Stage-0] INFO 03-16 14:39:30 [vllm.py:689] Asynchronous scheduling is disabled.
(APIServer pid=2130421) INFO 03-16 14:39:31 [omni.py:448] [AsyncOrchestrator] Stage-0 reported ready
+ echo '=== GPU ==='
=== GPU ===
+ nvidia-smi
Mon Mar 16 14:43:22 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:01:00.0 Off |                    0 |
| N/A   29C    P0             57W /  400W |   24461MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A   2131039      C   VLLM::Worker                                24452MiB |
+-----------------------------------------------------------------------------------------+
+ echo '=== Test voices ==='
=== Test voices ===
+ curl -s http://localhost:8091/v1/audio/voices
+ echo '=== DONE ==='
=== DONE ===