#!/usr/bin/env python3
"""
Sync CSVs to Supabase

Merges chapters_api_CLEAN.csv and Indic_podcasts_CLEAN.csv by video_id,
then upserts the rows into the Supabase videos table.

Usage:
    # Full sync (~172k videos)
    python scripts/sync_csv_to_supabase.py

    # Limit for testing
    python scripts/sync_csv_to_supabase.py --limit 100

    # Dry run (no DB writes)
    python scripts/sync_csv_to_supabase.py --dry-run --limit 10
"""

import os
import sys
import json
import argparse
import logging
from pathlib import Path
from datetime import datetime, timezone

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

import pandas as pd
from src.supabase_client import SupabaseClient

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger("CSVSync")

# File paths
BASE_DIR = Path(__file__).parent.parent
CHAPTERS_CSV = BASE_DIR / "chapters_api_CLEAN.csv"
PODCASTS_CSV = BASE_DIR / "Indic_podcasts_CLEAN.csv"
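# Defaults only; both paths can be overridden via --chapters-csv / --podcasts-csv below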


def load_and_merge_csvs(chapters_path: Path, podcasts_path: Path, limit: int | None = None) -> pd.DataFrame:
    """Load both CSVs and merge by video_id."""

    logger.info(f"Loading {podcasts_path.name}...")
    podcasts_df = pd.read_csv(podcasts_path)
    logger.info(f"  Loaded {len(podcasts_df)} rows")

    logger.info(f"Loading {chapters_path.name}...")
    chapters_df = pd.read_csv(chapters_path)
    logger.info(f"  Loaded {len(chapters_df)} rows")

    # Left-join chapters onto podcasts on video_id; podcast rows without
    # chapter data get NaN in the chapter columns
    logger.info("Merging datasets by video_id...")
    merged = podcasts_df.merge(chapters_df, on='video_id', how='left')
    logger.info(f"  Merged result: {len(merged)} rows")

    # Check for duplicates
    dups = merged['video_id'].duplicated().sum()
    if dups > 0:
        logger.warning(f"  Found {dups} duplicate video_ids, keeping first")
        merged = merged.drop_duplicates(subset='video_id', keep='first')

    if limit is not None:
        merged = merged.head(limit)
        logger.info(f"  Limited to {len(merged)} rows")

    return merged


def transform_to_supabase_schema(df: pd.DataFrame) -> list[dict]:
    """Transform merged DataFrame to Supabase schema."""

    videos = []
    now = datetime.now(timezone.utc).isoformat()
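    # One timestamp is reused for created_at and updated_at across the whole batch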

    # enumerate() keeps a contiguous counter; the DataFrame index may have gaps after deduplication
    for i, (_, row) in enumerate(df.iterrows()):
        video_id = row['video_id']

        # Parse chapters JSON if present; skip malformed or non-string values
        chapters_json = None
        if pd.notna(row.get('chapters')) and row['chapters']:
            try:
                chapters_json = json.loads(row['chapters'])
            except (json.JSONDecodeError, TypeError):
                logger.debug(f"  Unparseable chapters JSON for {video_id}")

        video = {
            'youtube_id': video_id,
            'youtube_url': f"https://www.youtube.com/watch?v={video_id}",
            'status': 'PENDING',
            'created_at': now,
            'updated_at': now,

            # From podcasts CSV
            'title': row.get('title') if pd.notna(row.get('title')) else None,
            'source_duration_min': float(row['duration_min']) if pd.notna(row.get('duration_min')) else None,
            'channel': row.get('channel') if pd.notna(row.get('channel')) else None,
            'language': row.get('language') if pd.notna(row.get('language')) else None,
            'classification': row.get('classification') if pd.notna(row.get('classification')) else None,

            # From chapters CSV
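            # has_chapters is stored as a 'yes'/'no' string in the CSV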
            'has_chapters': row.get('has_chapters') == 'yes' if pd.notna(row.get('has_chapters')) else False,
            'chapter_count': int(row['chapter_count']) if pd.notna(row.get('chapter_count')) else 0,
            'chapters': chapters_json,

            # Defaults
            'max_attempts': 3,
            'attempt_count': 0,
        }

        videos.append(video)

        if (i + 1) % 10000 == 0:
            logger.info(f"  Transformed {i + 1} rows...")

    return videos


def sync_to_supabase(videos: list[dict], batch_size: int = 1000, dry_run: bool = False) -> int:
    """Upsert videos to Supabase."""

    if not videos:
        logger.warning("No videos to sync")
        return 0

    if dry_run:
        logger.info(f"DRY RUN: Would upsert {len(videos)} videos")
        logger.info(f"Sample video: {json.dumps(videos[0], indent=2, default=str)}")
        return 0

    client = SupabaseClient()

    logger.info(f"Upserting {len(videos)} videos in batches of {batch_size}...")
    total = client.bulk_upsert(videos, batch_size=batch_size)

    logger.info(f"Successfully upserted {total} videos")
    return total


def main():
    parser = argparse.ArgumentParser(description="Sync CSVs to Supabase")
    parser.add_argument('--limit', type=int, help="Limit number of rows to sync")
    parser.add_argument('--batch-size', type=int, default=1000, help="Batch size for upserts")
    parser.add_argument('--dry-run', action='store_true', help="Don't actually write to DB")
    parser.add_argument('--chapters-csv', type=Path, default=CHAPTERS_CSV, help="Path to chapters CSV")
    parser.add_argument('--podcasts-csv', type=Path, default=PODCASTS_CSV, help="Path to podcasts CSV")

    args = parser.parse_args()

    # Check files exist
    if not args.chapters_csv.exists():
        logger.error(f"Chapters CSV not found: {args.chapters_csv}")
        sys.exit(1)
    if not args.podcasts_csv.exists():
        logger.error(f"Podcasts CSV not found: {args.podcasts_csv}")
        sys.exit(1)

    logger.info("=" * 60)
    logger.info("CSV to Supabase Sync")
    logger.info("=" * 60)

    # Load and merge
    merged_df = load_and_merge_csvs(
        args.chapters_csv,
        args.podcasts_csv,
        limit=args.limit
    )

    # Transform
    logger.info("Transforming to Supabase schema...")
    videos = transform_to_supabase_schema(merged_df)
    logger.info(f"  Prepared {len(videos)} videos for upsert")

    # Sync
    total = sync_to_supabase(videos, batch_size=args.batch_size, dry_run=args.dry_run)

    logger.info("=" * 60)
    if args.dry_run:
        logger.info(f"DRY RUN complete. Would sync {len(videos)} videos.")
    else:
        logger.info(f"Sync complete. Upserted {total} videos.")
    logger.info("=" * 60)

    return 0


if __name__ == '__main__':
    sys.exit(main())
