#!/usr/bin/env bash
# Production training launcher with fail-fast R2 validation.
#
# Sequence:
#   1. Run full readiness gate (including R2 roundtrip probe)
#   2. Start checkpoint watcher in background
#   3. Launch training in foreground
#   4. On exit/failure: stop watcher, report status
#
# Usage:
#   bash scripts/run_stage1_prod.sh                  # full launch
#   bash scripts/run_stage1_prod.sh --dry-run        # readiness + watcher bootstrap only
#   bash scripts/run_stage1_prod.sh --max-steps=100  # short test run

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Locate the directory holding this script, then run everything from its
# parent so the relative paths used below (scripts/, configs/, ...) resolve.
SCRIPT_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_ROOT"

# Pick the Python interpreter. Candidates are tried in order: the caller's
# $PYTHON override (when set and resolvable), then `python`, then `python3`.
# The first one found on PATH wins; if none resolves the launch is aborted.
PY=""
for candidate in "${PYTHON:-}" python python3; do
    if [ -n "$candidate" ] && command -v "$candidate" &>/dev/null; then
        PY="$candidate"
        break
    fi
done
if [ -z "$PY" ]; then
    echo "FAIL: No python interpreter found"; exit 1
fi

# Launch settings. DRY_RUN and MAX_STEPS are overridden by command-line flags;
# WATCHER_PID stays empty until the checkpoint watcher is started.
CONFIG="configs/train/stage1_prod_8xh200.yaml"
MODEL_NAME="maya-asr-stage1"
EXP_DIR="experiments/stage1_prod/maya_asr_stage1"
DRY_RUN=false
MAX_STEPS=""
WATCHER_PID=""

# parse_args ARG...
# Apply command-line flags to the launch settings.
# Globals:   DRY_RUN (written), MAX_STEPS (written)
# Arguments: --dry-run        set DRY_RUN=true
#            --max-steps=N    set MAX_STEPS=N; N must be an integer
# Outputs:   an error message on stderr for a bad or unknown argument
# Exits:     1 on a bad or unknown argument
parse_args() {
    local arg
    for arg in "$@"; do
        case "$arg" in
            --dry-run)
                DRY_RUN=true
                ;;
            --max-steps=*)
                MAX_STEPS="${arg#*=}"
                # An empty value would otherwise be indistinguishable from
                # "flag not given" and silently start an unbounded run, so it
                # is rejected along with non-numeric input. A leading minus is
                # accepted because the trainer's valid range is not known here.
                if ! [[ "$MAX_STEPS" =~ ^-?[0-9]+$ ]]; then
                    echo "Invalid --max-steps value: '$MAX_STEPS' (expected an integer)" >&2
                    exit 1
                fi
                ;;
            *)
                echo "Unknown arg: $arg" >&2
                exit 1
                ;;
        esac
    done
}
parse_args "$@"

# Stop the background checkpoint watcher, if one was started and is still alive.
# Globals: WATCHER_PID (read) - PID of the watcher; empty until it is launched.
# Safe to call repeatedly: an empty or already-exited PID makes it a no-op.
# NOTE(review): the watcher is signalled as soon as the launcher exits; whether
# it uploads a just-written final checkpoint before stopping is not visible
# here - confirm.
cleanup() {
    if [ -n "${WATCHER_PID:-}" ] && kill -0 "$WATCHER_PID" 2>/dev/null; then
        echo ""
        echo "Stopping checkpoint watcher (PID $WATCHER_PID)..."
        # Best effort: the watcher may exit between the liveness probe and here.
        kill "$WATCHER_PID" 2>/dev/null || true
        # Reap the child; its exit status does not matter at this point.
        wait "$WATCHER_PID" 2>/dev/null || true
    fi
}
# The EXIT trap covers every way the script can end. A signal handler that
# merely returns lets bash resume the script after the interrupted command, so
# INT and TERM exit explicitly (128 + signal number); that exit then fires the
# EXIT trap, which runs cleanup.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Launch banner, followed by one blank line.
cat <<'BANNER'
============================================
  Maya ASR - Stage 1 Production Launch
============================================

BANNER

# --- Step 1: Readiness gate with R2 roundtrip ---
echo "--- Step 1: Production readiness gate ---"
# The gate is tested directly in the `if`: with `set -e` active, running it as
# a bare command would end the script the moment it failed, before the ABORT
# message could be printed. Any non-zero status from the gate aborts with 1.
if ! "$PY" scripts/check_prod_readiness.py \
    --require-r2-roundtrip \
    --min-free-tb 1.0 \
    --train-parquet artifacts/phase3/production_train_final.parquet; then
    echo ""
    echo "ABORT: Readiness gate failed. Training will not start."
    exit 1
fi
echo ""

# --- Step 2: Bootstrap checkpoint watcher ---
echo "--- Step 2: Starting checkpoint watcher ---"
mkdir -p "$EXP_DIR"

# The command is assembled as an array so every argument reaches the watcher
# as exactly one word, even if the interpreter path, experiment directory or
# model name contains whitespace or glob characters.
WATCHER_CMD=(
    "$PY" scripts/auto_upload_checkpoints.py
    --exp-dir "$EXP_DIR"
    --model-name "$MODEL_NAME"
    --delete-after-upload
    --poll-interval 60
)

# In dry-run mode the watcher is started with its own --dry-run flag.
if [ "$DRY_RUN" = true ]; then
    WATCHER_CMD+=(--dry-run)
fi

# Run in the background and remember the PID so the exit handler can stop it.
"${WATCHER_CMD[@]}" &
WATCHER_PID=$!
echo "  Watcher PID: $WATCHER_PID"
echo "  Model: $MODEL_NAME"
echo "  Exp dir: $EXP_DIR"

# Verify watcher started: give it a moment, then make sure the process is
# still there. An immediate crash (bad arguments, import error, ...) aborts
# the launch instead of training without checkpoint uploads.
sleep 1
if ! kill -0 "$WATCHER_PID" 2>/dev/null; then
    echo "ABORT: Checkpoint watcher failed to start."
    exit 1
fi
echo "  Watcher: running"
echo ""

# Dry run ends here: the gate passed and the watcher came up, so print a
# summary and exit successfully without launching training.
case "$DRY_RUN" in
    true)
        printf '%s\n' \
            "--- DRY RUN MODE ---" \
            "Readiness: PASS" \
            "Watcher: bootstrapped (PID $WATCHER_PID)" \
            "Training: would start with config $CONFIG" \
            "" \
            "To launch for real, run without --dry-run."
        exit 0
        ;;
esac

# --- Step 3: Launch training ---
echo "--- Step 3: Launching training ---"
# Trainer arguments are kept in an array so each value is passed as one word.
TRAIN_ARGS=(--config "$CONFIG")
if [ -n "$MAX_STEPS" ]; then
    TRAIN_ARGS+=(--max-steps "$MAX_STEPS")
    echo "  Max steps: $MAX_STEPS"
fi
echo "  Config: $CONFIG"
echo ""

# With `set -e` active, a bare failing command would end the script on the
# spot and the failure banner below would never be printed. `|| TRAIN_EXIT=$?`
# records the trainer's status and lets the script carry on to the report.
TRAIN_EXIT=0
"$PY" scripts/train_prod.py "${TRAIN_ARGS[@]}" || TRAIN_EXIT=$?

echo ""
if [ "$TRAIN_EXIT" -eq 0 ]; then
    echo "============================================"
    echo "  Training completed successfully"
    echo "============================================"
else
    echo "============================================"
    echo "  Training failed (exit code $TRAIN_EXIT)"
    echo "============================================"
    # Propagate the trainer's own exit status to the caller.
    exit "$TRAIN_EXIT"
fi
