#!/bin/bash
# Restart helper for the vLLM-Omni TTS server: kills stale GPU processes,
# relaunches launch_server.py, waits for its /health endpoint, then dumps
# diagnostics. All script output (stdout+stderr) is captured to a log file.
# NOTE(review): no `set -e` — the script intentionally continues past
# best-effort cleanup commands that may fail (pkill with no match, etc.).
exec 1>/home/ubuntu/vllm_start_log.txt 2>&1
# Trace every command into the log for post-mortem debugging.
set -x

# Best-effort teardown of any stale GPU workloads before relaunching.
echo "=== Killing old processes ==="
# SIGKILL every process currently holding a CUDA context. Errors from a
# missing nvidia-smi or already-gone PIDs are deliberately suppressed.
gpu_pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null)
if [ -n "$gpu_pids" ]; then
    # word-splitting of $gpu_pids is intentional: one PID per argument
    kill -9 $gpu_pids 2>/dev/null
fi
# Also catch qwen python workers that may not show up in nvidia-smi.
pkill -9 -f "python3.*qwen" 2>/dev/null
# Give the driver a moment to reclaim GPU memory.
sleep 3

echo "=== GPU cleared ==="
nvidia-smi

echo "=== Starting vLLM-Omni ==="
# Activate the virtualenv (sets PATH/VIRTUAL_ENV for child tooling).
source /home/ubuntu/vllm_env/bin/activate

# Sanity-check that flash_attn is importable: the server is forced onto the
# FLASH_ATTN backend below, so a missing/broken install means it cannot
# start. Previously this check's result was silently ignored.
if ! python3 -c "import flash_attn; print('flash_attn:', flash_attn.__version__)"; then
    echo "WARNING: flash_attn import failed; FLASH_ATTN backend will likely fail"
fi

# Start from a clean server log so the later 'cat's show only this run.
rm -f /home/ubuntu/vllm_server_log.txt
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
# Launch in the background with the venv interpreter spelled out explicitly,
# capturing all server output to its own log file.
/home/ubuntu/vllm_env/bin/python3 /home/ubuntu/launch_server.py > /home/ubuntu/vllm_server_log.txt 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"

echo "=== Waiting for server (max 300s) ==="
# Poll every 5s for up to 60 tries (300s total): fail fast if the server
# process dies, succeed as soon as /health returns HTTP 200, and — fixed
# here — treat loop exhaustion as a startup failure instead of silently
# falling through to the DONE banner.
ready=0
for i in $(seq 1 60); do
    sleep 5
    # Bail out immediately if the server process is gone.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo "Server died at ${i}x5s"
        cat /home/ubuntu/vllm_server_log.txt
        exit 1
    fi
    # --max-time keeps a wedged endpoint from stalling the poll loop.
    if curl -s --max-time 4 -o /dev/null -w "%{http_code}" http://localhost:8091/health 2>/dev/null | grep -q 200; then
        echo "Server ready at ${i}x5s = $((i*5))s"
        ready=1
        break
    fi
    echo "Waiting... ${i}x5s"
done

# Timeout path: mirror the server-died branch (dump log, non-zero exit).
if [ "$ready" -ne 1 ]; then
    echo "Server not ready after 300s"
    cat /home/ubuntu/vllm_server_log.txt
    exit 1
fi

# Post-startup diagnostics: dump the server log and GPU state, then hit the
# voices endpoint as a quick smoke test.
section() {
    # Print a banner identical to the original `echo "=== ... ==="` lines.
    printf '=== %s ===\n' "$1"
}

section "Server Log"
cat /home/ubuntu/vllm_server_log.txt

section "GPU"
nvidia-smi

section "Test voices"
curl -s http://localhost:8091/v1/audio/voices 2>&1

section "DONE"
