#!/usr/bin/env bash
# Prepare all production artifacts for stage 1 training.
#
# Pipeline:
#   1. Build full manifest (12 languages, all shards; rows filtered at min_quality=0.7)
#   2. Validate manifest
#   3. Split train/val
#   4. Build encoder tokenizer (32K BPE, quality-first)
#   5. Generate input_cfg
#
# Usage:
#   bash scripts/prepare_stage1_prod.sh --dry-run              # show what would be built
#   bash scripts/prepare_stage1_prod.sh                        # build everything
#   bash scripts/prepare_stage1_prod.sh --max-shards=2          # test run with 2 shards/lang
#   bash scripts/prepare_stage1_prod.sh --force-rebuild         # force rebuild all

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_ROOT"

# Detect interpreter: honor an explicit $PYTHON override first, then prefer
# python3 over the ambiguous bare "python" name (which can be Python 2 or
# simply absent on modern distributions).
if [ -n "${PYTHON:-}" ] && command -v "$PYTHON" &>/dev/null; then
    PY="$PYTHON"
elif command -v python3 &>/dev/null; then
    PY=python3
elif command -v python &>/dev/null; then
    PY=python
else
    # Diagnostics belong on stderr so they are not captured by stdout pipes.
    echo "FAIL: No python interpreter found" >&2
    exit 1
fi

# --- Tunable defaults (override via the CLI flags parsed below) ---
DRY_RUN=false            # --dry-run: print the plan, build nothing
FORCE_REBUILD=false      # --force-rebuild: ignore existing artifacts
MAX_SHARDS=0             # --max-shards: 0 means "use every shard"
LANGUAGES="as bn en gu hi kn ml mr or pa ta te"
MIN_QUALITY=0.7          # forwarded to build_manifest.py as --min-quality
VOCAB_SIZE=32768         # forwarded to build_tokenizer.py as --vocab-size
VAL_RATIO=0.005          # forwarded to split_manifest.py as --val-ratio
SEED=42                  # forwarded to split_manifest.py as --seed

# --- Artifact locations (relative to the project root) ---
MANIFEST="data/manifests/stage1_prod_full.jsonl"
TRAIN_MANIFEST="data/manifests/stage1_prod_train.jsonl"
VAL_MANIFEST="data/manifests/stage1_prod_val.jsonl"
TOKENIZER_DIR="tokenizers/stage1_prod_bpe"
INPUT_CFG="configs/data/stage1_prod_input_cfg.yaml"

# Parse CLI arguments.
# Accepts both "--flag=value" and "--flag value" forms, matching the usage
# examples in the file header (the previous for-loop rejected the space form).
# Globals written: DRY_RUN, FORCE_REBUILD, MAX_SHARDS, LANGUAGES,
#                  MIN_QUALITY, VOCAB_SIZE
parse_args() {
    while [ "$#" -gt 0 ]; do
        case "$1" in
            --dry-run) DRY_RUN=true ;;
            --force-rebuild) FORCE_REBUILD=true ;;
            --max-shards=*) MAX_SHARDS="${1#*=}" ;;
            --max-shards) MAX_SHARDS="${2:?--max-shards requires a value}"; shift ;;
            --languages=*) LANGUAGES="${1#*=}" ;;
            --languages) LANGUAGES="${2:?--languages requires a value}"; shift ;;
            --min-quality=*) MIN_QUALITY="${1#*=}" ;;
            --min-quality) MIN_QUALITY="${2:?--min-quality requires a value}"; shift ;;
            --vocab-size=*) VOCAB_SIZE="${1#*=}" ;;
            --vocab-size) VOCAB_SIZE="${2:?--vocab-size requires a value}"; shift ;;
            *) echo "Unknown arg: $1" >&2; exit 1 ;;
        esac
        shift
    done
}
parse_args "$@"

# Show the effective configuration before doing any work.
cat <<EOF
============================================
  Maya ASR - Stage 1 Production Artifacts
============================================

Config:
  Languages:    $LANGUAGES
  Max shards:   $MAX_SHARDS (0=all)
  Min quality:  $MIN_QUALITY
  Vocab size:   $VOCAB_SIZE
  Val ratio:    $VAL_RATIO
  Seed:         $SEED
  Force:        $FORCE_REBUILD
  Dry run:      $DRY_RUN

EOF

# In dry-run mode, list every artifact that would be produced and stop.
if [[ "$DRY_RUN" == true ]]; then
    echo "--- DRY RUN: would build these artifacts ---"
    for artifact in "$MANIFEST" "$TRAIN_MANIFEST" "$VAL_MANIFEST" "$TOKENIZER_DIR/" "$INPUT_CFG"; do
        echo "  $artifact"
    done
    echo ""
    echo "Run without --dry-run to build."
    exit 0
fi

# --- Step 1: Build full manifest ---
# Assemble the builder's argument vector as an array so every element stays a
# single, safely-quoted word. This removes the unquoted $SHARD_FLAG /
# $LANGUAGES splatting (and the SC2086 suppression it required).
if [ "$FORCE_REBUILD" = true ] || [ ! -f "$MANIFEST" ]; then
    echo "--- Step 1: Building full manifest ---"
    # LANGUAGES is a space-separated list; split it into words deliberately.
    read -ra lang_list <<< "$LANGUAGES"
    manifest_args=(--languages "${lang_list[@]}")
    if [ "$MAX_SHARDS" -gt 0 ]; then
        manifest_args+=(--max-shards "$MAX_SHARDS")
    fi
    manifest_args+=(--min-quality "$MIN_QUALITY" --output "$MANIFEST")
    "$PY" scripts/build_manifest.py "${manifest_args[@]}"
else
    echo "--- Step 1: Manifest exists, skipping (use --force-rebuild) ---"
fi
echo "OK: $MANIFEST ($(wc -l < "$MANIFEST") rows)"
echo ""

# --- Step 2: Validate manifest ---
printf '%s\n' "--- Step 2: Validating manifest ---"
"$PY" scripts/validate_manifest.py "$MANIFEST"
printf '\n'

# --- Step 3: Split train/val ---
if [ "$FORCE_REBUILD" = true ] || [ ! -f "$TRAIN_MANIFEST" ] || [ ! -f "$VAL_MANIFEST" ]; then
    echo "--- Step 3: Splitting train/val ---"
    $PY scripts/split_manifest.py \
        --input "$MANIFEST" \
        --train-output "$TRAIN_MANIFEST" \
        --val-output "$VAL_MANIFEST" \
        --val-ratio "$VAL_RATIO" \
        --seed "$SEED"
else
    echo "--- Step 3: Split exists, skipping ---"
fi
echo "OK: Train $(wc -l < "$TRAIN_MANIFEST") rows, Val $(wc -l < "$VAL_MANIFEST") rows"
echo ""

# --- Step 4: Build encoder tokenizer ---
# Skip only when a trained model file is already present and no rebuild was forced.
if [ "$FORCE_REBUILD" != true ] && [ -f "$TOKENIZER_DIR/tokenizer.model" ]; then
    echo "--- Step 4: Tokenizer exists, skipping ---"
else
    echo "--- Step 4: Building encoder tokenizer (${VOCAB_SIZE} BPE) ---"
    "$PY" scripts/build_tokenizer.py \
        --manifest "$TRAIN_MANIFEST" \
        --output-dir "$TOKENIZER_DIR" \
        --vocab-size "$VOCAB_SIZE" \
        --character-coverage 0.9999
fi
echo "OK: $TOKENIZER_DIR/tokenizer.model"
echo ""

# --- Step 5: Generate input_cfg ---
# Skip only when the config already exists and no rebuild was forced.
if [ "$FORCE_REBUILD" != true ] && [ -f "$INPUT_CFG" ]; then
    echo "--- Step 5: Input cfg exists, skipping ---"
else
    echo "--- Step 5: Generating input_cfg ---"
    "$PY" scripts/generate_input_cfg.py \
        --manifests "$TRAIN_MANIFEST" \
        --output "$INPUT_CFG"
fi
echo "OK: $INPUT_CFG"
echo ""

# Final summary of everything that was produced, plus the follow-up action.
cat <<EOF
============================================
  Production artifacts ready
============================================

Artifacts:
  Manifest:   $MANIFEST
  Train:      $TRAIN_MANIFEST
  Val:        $VAL_MANIFEST
  Tokenizer:  $TOKENIZER_DIR/
  Input cfg:  $INPUT_CFG

Next: Update configs/train/stage1_prod_8xh200.yaml with these paths,
      then run: make prod-readiness
EOF