#!/usr/bin/env bash
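# Push milestone checkpoints (every 50k steps) to R2.
# Destination: s3://ptcheckpoints/qwen3-asr-1.7B/<MM-DD-YYYY>/ckpt-<step>/
# where the date is the modification time of the checkpoint's model.safetensors.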
# Shell options: -u aborts on expansion of an unset variable; pipefail makes a
# pipeline return the status of its last failing stage. -e is not set, so a
# failing command does not abort the script by itself.
set -uo pipefail

# Load R2 settings. The file is expected to define R2_ACCESS_KEY_ID,
# R2_SECRET_ACCESS_KEY and R2_ENDPOINT_URL. If it is not readable we fall back
# to whatever the calling environment already provides; the checks below catch
# anything still missing.
if [[ -r /root/data/.env ]]; then
    # shellcheck source=/dev/null
    source /root/data/.env
else
    printf 'warning: /root/data/.env is not readable; using inherited environment\n' >&2
fi

# Fail early, with a readable message, if a required setting is unset OR empty
# (set -u alone only catches the unset case).
: "${R2_ACCESS_KEY_ID:?must be set (expected from /root/data/.env)}"
: "${R2_SECRET_ACCESS_KEY:?must be set (expected from /root/data/.env)}"
: "${R2_ENDPOINT_URL:?must be set (expected from /root/data/.env)}"

# Local training output directory that is scanned for checkpoint-* directories.
readonly OUTPUT_DIR="/root/data/qwen3-asr-phase2-out"
# One step number per line; records which checkpoints were uploaded successfully.
readonly PUSHED_LOG="$OUTPUT_DIR/.r2_pushed_checkpoints"
# Destination bucket and key prefix.
readonly R2_BUCKET="ptcheckpoints"
readonly R2_PREFIX="qwen3-asr-1.7B"
# Only checkpoints whose step is a multiple of this value are uploaded.
readonly STEP_INTERVAL=50000

# The aws CLI reads its credentials from these standard variables.
export AWS_ACCESS_KEY_ID="$R2_ACCESS_KEY_ID"
export AWS_SECRET_ACCESS_KEY="$R2_SECRET_ACCESS_KEY"
# NOTE(review): "auto" is presumably the region value the R2 endpoint expects — confirm.
export AWS_DEFAULT_REGION="auto"

# Passed to the aws CLI via --endpoint-url.
readonly ENDPOINT="$R2_ENDPOINT_URL"

# Print a timestamped message to stdout and append the same line to
# $OUTPUT_DIR/r2_upload.log.
# Globals:   OUTPUT_DIR (read)
# Arguments: the message; all arguments are joined via "$*"
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in the log file
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s\n' "[${stamp}] $*" | tee -a "$OUTPUT_DIR/r2_upload.log"
}

# Upload every not-yet-pushed milestone checkpoint found in $OUTPUT_DIR.
#
# For each $OUTPUT_DIR/checkpoint-<step> directory:
#   1. skip unless <step> is a multiple of $STEP_INTERVAL,
#   2. skip if <step> is already listed in $PUSHED_LOG,
#   3. skip (with a log line) if model.safetensors is missing,
#   4. sync the directory to the bucket and, only when aws exits 0, append
#      <step> to $PUSHED_LOG. A failed upload is only logged; the step is not
#      recorded, so the next run retries it.
#
# NOTE(review): nothing here prevents two overlapping runs from syncing the
# same checkpoint at the same time — confirm the scheduler avoids overlap.

# Create the pushed log if it does not exist yet.
touch "$PUSHED_LOG"

for ckpt_dir in "$OUTPUT_DIR"/checkpoint-*; do
    # Also covers the no-match case, where the glob stays a literal pattern.
    [[ -d "$ckpt_dir" ]] || continue

    # Accept only names of the exact form checkpoint-<digits>. Anything else
    # (e.g. checkpoint-50000.tmp, checkpoint-50000-2) is ignored, so a stray
    # name can neither be mistaken for a milestone nor break the arithmetic
    # below and abort the whole loop.
    ckpt_name=${ckpt_dir##*/}
    [[ "$ckpt_name" =~ ^checkpoint-([0-9]+)$ ]] || continue
    step=${BASH_REMATCH[1]}

    # Only push at STEP_INTERVAL multiples. The 10# prefix forces base 10 so a
    # step with leading zeros is not parsed as octal.
    (( 10#$step % STEP_INTERVAL == 0 )) || continue

    # Skip if already pushed (whole-line, fixed-string match).
    if grep -qxF -- "$step" "$PUSHED_LOG" 2>/dev/null; then
        continue
    fi

    # Treat the checkpoint as complete only once model.safetensors exists.
    weights="$ckpt_dir/model.safetensors"
    if [[ ! -f "$weights" ]]; then
        log "SKIP step $step: model.safetensors missing (incomplete checkpoint)"
        continue
    fi

    # The date segment of the destination comes from the weights file's
    # modification time. Bail out rather than upload to a path with an empty
    # date segment.
    if ! ckpt_date=$(date -r "$weights" '+%m-%d-%Y') || [[ -z "$ckpt_date" ]]; then
        log "FAILED: checkpoint-${step}: cannot read modification time of ${weights}"
        continue
    fi
    r2_path="s3://${R2_BUCKET}/${R2_PREFIX}/${ckpt_date}/ckpt-${step}/"

    log "UPLOADING checkpoint-${step} → ${r2_path}"

    # Forward aws output (stdout and stderr) line by line into the log.
    # IFS= keeps leading whitespace; the || clause keeps a final line that has
    # no trailing newline.
    aws s3 sync "$ckpt_dir" "$r2_path" \
        --endpoint-url "$ENDPOINT" \
        --no-progress \
        2>&1 | while IFS= read -r line || [[ -n "$line" ]]; do log "  $line"; done
    # Capture the aws exit status immediately; any later command overwrites
    # PIPESTATUS.
    upload_status=${PIPESTATUS[0]}

    if (( upload_status == 0 )); then
        printf '%s\n' "$step" >> "$PUSHED_LOG"
        log "SUCCESS: checkpoint-${step} uploaded to ${r2_path}"
    else
        log "FAILED: checkpoint-${step} upload failed"
    fi
done
