#!/usr/bin/env bash
# GPU idle monitor: alerts if all GPUs are idle (utilization at or below threshold) for consecutive checks.
# Designed to run via cron every 2 minutes.
#
# Writes alert to /workspace/maya-asr/gpu_idle_alert.log
# Also touches /workspace/maya-asr/.gpu_idle_alert as a flag file for external watchers.

# -u: fail on unset vars; pipefail: surface mid-pipeline failures.
# -e is deliberately omitted: the nvidia-smi failure path below must be able
# to log the error before exiting.
set -uo pipefail

readonly ALERT_LOG="/workspace/maya-asr/gpu_idle_alert.log"
readonly FLAG_FILE="/workspace/maya-asr/.gpu_idle_alert"
readonly STATE_FILE="/tmp/gpu_idle_monitor_state"
readonly IDLE_THRESHOLD=5   # GPU util% at or below this = idle
readonly MIN_CONSECUTIVE=2  # Alert after this many consecutive idle checks

# Highest utilization across all GPUs (one util% per line from nvidia-smi).
MAX_UTIL=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null \
    | sort -rn | head -1)

TIMESTAMP=$(date -Iseconds)

# Reject empty AND non-numeric output (some drivers report "[N/A]"); a bare
# numeric [ -le ] test on such a value would abort with a syntax error.
if ! [[ "$MAX_UTIL" =~ ^[0-9]+$ ]]; then
    echo "$TIMESTAMP ERROR: nvidia-smi failed or returned non-numeric utilization ('${MAX_UTIL}')" >> "$ALERT_LOG"
    exit 1
fi

if (( MAX_UTIL <= IDLE_THRESHOLD )); then
    # GPUs idle: bump the consecutive-idle counter. Treat a missing or
    # corrupted state file as zero so the arithmetic never blows up.
    PREV_COUNT=$(cat "$STATE_FILE" 2>/dev/null || echo 0)
    [[ "$PREV_COUNT" =~ ^[0-9]+$ ]] || PREV_COUNT=0
    NEW_COUNT=$((PREV_COUNT + 1))
    echo "$NEW_COUNT" > "$STATE_FILE"

    if (( NEW_COUNT >= MIN_CONSECUTIVE )); then
        # Cron cadence is 2 minutes, hence the *2 wall-clock estimate.
        MSG="ALERT: All GPUs idle (max util=${MAX_UTIL}%) for ${NEW_COUNT} consecutive checks (~$((NEW_COUNT * 2)) min)"
        echo "$TIMESTAMP $MSG" >> "$ALERT_LOG"
        touch "$FLAG_FILE"

        # Distinguish "training hung" (process alive, GPUs idle) from
        # "training crashed" (process gone).
        TRAIN_PID=$(pgrep -f "train_prod.py" | head -1)
        if [[ -z "$TRAIN_PID" ]]; then
            echo "$TIMESTAMP CRITICAL: train_prod.py process NOT found! Training may have crashed." >> "$ALERT_LOG"
        else
            echo "$TIMESTAMP INFO: train_prod.py still running (PID $TRAIN_PID)" >> "$ALERT_LOG"
        fi
    fi
else
    # GPUs active: reset the counter and clear any stale alert flag.
    echo 0 > "$STATE_FILE"
    rm -f "$FLAG_FILE"
fi
