#!/usr/bin/env bash
# GPU Idle Watchdog — alerts if all GPUs are idle (0% utilization).
# Designed to run as a cron job every 5 minutes.
#
# Outputs:
#   Appends status lines to $LOG. Creates $ALERT_FILE when every GPU is
#   at 0% AND the training process is dead; removes it once GPUs are busy.
# Exit status:
#   1 if nvidia-smi is unavailable or produced no output, 0 otherwise.

# -u: fail fast on unset variables. Deliberately no -e: a single failed
# probe must not silently kill the rest of a cron run.
set -u

readonly LOG="/root/data/qwen3-asr-phase2-out/gpu_watchdog.log"
readonly ALERT_FILE="/root/data/qwen3-asr-phase2-out/GPU_IDLE_ALERT"

# Ensure the output directory exists (first run on a fresh host).
mkdir -p -- "$(dirname "$LOG")"

# Get GPU utilization for all GPUs, one integer per line.
utils=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null)
if [ -z "$utils" ]; then
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: nvidia-smi failed" >> "$LOG"
    exit 1
fi

# Count total GPUs and how many report exactly 0% utilization.
total=0
idle=0
while IFS= read -r u; do
    u=${u//[[:space:]]/}        # strip whitespace without spawning tr
    [ -n "$u" ] || continue     # skip blank lines defensively
    total=$((total + 1))
    # Non-numeric rows (e.g. "[N/A]" on some drivers) count as busy,
    # matching the original silent-failure behavior but explicitly.
    if [[ "$u" =~ ^[0-9]+$ ]] && [ "$u" -eq 0 ]; then
        idle=$((idle + 1))
    fi
done <<< "$utils"

ts=$(date '+%Y-%m-%d %H:%M:%S')

if [ "$total" -gt 0 ] && [ "$idle" -eq "$total" ]; then
    # All GPUs idle — distinguish "training dead" from "training loading".
    if ! pgrep -f "qwen3_asr_sft_phase2" > /dev/null 2>&1; then
        echo "[$ts] ALERT: ALL $total GPUs at 0% and training NOT running!" >> "$LOG"
        echo "[$ts] ALL $total GPUs idle, training process dead" > "$ALERT_FILE"
    else
        echo "[$ts] WARN: ALL $total GPUs at 0% but training process exists (maybe loading?)" >> "$LOG"
    fi
elif [ "$idle" -gt 0 ]; then
    echo "[$ts] WARN: $idle/$total GPUs idle" >> "$LOG"
else
    # All GPUs busy — clear any stale alert (rm -f is a no-op if absent).
    rm -f -- "$ALERT_FILE"
fi