#!/usr/bin/env python3
"""
Prepare 100 videos for benchmark by:
1. Listing R2 files that exist (podcasts/*.webm)
2. Cross-referencing with Supabase
3. Setting them to PENDING status
"""

import os
import random
import sys
import time
from collections import Counter

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

from supabase import create_client
from src.r2_client import R2Client

# Config
TARGET_COUNT = 100  # number of videos to stage for the benchmark run
# Either URL or SUPABASE_URL may hold the project URL; the first set one wins.
SUPABASE_URL = os.environ.get('URL') or os.environ.get('SUPABASE_URL')
SUPABASE_KEY = os.environ.get('SUPABASE_ADMIN')

if not SUPABASE_URL or not SUPABASE_KEY:
    # Diagnostics go to stderr so stdout (the benchmark report) stays clean.
    print("ERROR: Missing Supabase credentials", file=sys.stderr)
    sys.exit(1)

# Banner, then client setup (same order as output expects).
divider = "=" * 70
print(divider)
print("BENCHMARK PREPARATION - 100 Videos")
print(divider)

# Initialize clients
sb = create_client(SUPABASE_URL, SUPABASE_KEY)
r2 = R2Client(bucket_type='source')  # 'test' bucket has the podcasts/

print(f"\n1. Listing R2 files from {r2.bucket}/podcasts/...")

# Page through every object under podcasts/ (the S3-compatible API returns
# at most 1000 keys per call, continued via NextContinuationToken).
all_r2_files = []
continuation_token = None

while True:
    list_args = {
        'Bucket': r2.bucket,
        'Prefix': 'podcasts/',
        'MaxKeys': 1000,
    }
    if continuation_token is not None:
        list_args['ContinuationToken'] = continuation_token

    page = r2.client.list_objects_v2(**list_args)
    all_r2_files.extend(page.get('Contents', []))

    if not page.get('IsTruncated'):
        break
    continuation_token = page.get('NextContinuationToken')

    # Safety cap so a huge bucket doesn't make us scan forever.
    if len(all_r2_files) >= 5000:
        break

print(f"   Found {len(all_r2_files)} files in R2")

# Extract video IDs from filenames (podcasts/{video_id}.webm).
# NOTE: the previous str.replace() approach stripped EVERY occurrence of
# 'podcasts/' and '.webm' anywhere in the key, so an ID that happened to
# contain either substring would be silently mangled. Slice off only the
# leading prefix and trailing extension instead.
_PREFIX = 'podcasts/'
_EXT = '.webm'

r2_video_ids = set()
for obj in all_r2_files:
    key = obj['Key']  # e.g. podcasts/---g_VL7ySo.webm
    # Prefix is guaranteed by the list query; check anyway for safety.
    if key.startswith(_PREFIX) and key.endswith(_EXT):
        r2_video_ids.add(key[len(_PREFIX):-len(_EXT)])

print(f"   Extracted {len(r2_video_ids)} unique video IDs")

print(f"\n2. Cross-referencing with Supabase...")

# Query Supabase for these videos.
# Cap the ID list so the IN() filter stays under URL length limits.
video_ids_list = list(r2_video_ids)[:500]

query = sb.table('videos').select(
    'youtube_id, title, source_duration_min, language, status'
)
result = query.in_('youtube_id', video_ids_list).execute()

rows = result.data or []
videos_in_db = {row['youtube_id']: row for row in rows}
print(f"   Found {len(videos_in_db)} videos in Supabase DB")

# Keep only videos present in BOTH R2 and Supabase, capped at TARGET_COUNT.
validated = []
for vid in video_ids_list:
    record = videos_in_db.get(vid)
    if record is None:
        continue
    validated.append(record)
    if len(validated) == TARGET_COUNT:
        break

print(f"   Validated: {len(validated)} videos (in R2 + Supabase)")

if len(validated) < TARGET_COUNT:
    print(f"\n   WARNING: Only {len(validated)} videos available")
    # Below 10 videos the benchmark is meaningless — bail out.
    if len(validated) < 10:
        print("   ERROR: Not enough videos for benchmark")
        sys.exit(1)

# Show status distribution before reset. Counter preserves first-seen
# insertion order, so the printed dict matches the old manual tally.
statuses = dict(Counter(v.get('status', 'unknown') for v in validated))
print(f"   Current status: {statuses}")

print(f"\n3. Resetting {len(validated)} videos to PENDING status...")

# Reset to PENDING for benchmark, clearing any stale worker claims.
video_ids = [v['youtube_id'] for v in validated]
batch_size = 50
total_batches = (len(video_ids) + batch_size - 1) // batch_size
reset_fields = {
    'status': 'PENDING',
    'claimed_by': None,
    'claimed_at': None,
    'lease_expires_at': None,
}

for batch_num, start in enumerate(range(0, len(video_ids), batch_size), start=1):
    chunk = video_ids[start:start + batch_size]
    sb.table('videos').update(reset_fields).in_('youtube_id', chunk).execute()
    print(f"   Reset batch {batch_num}/{total_batches}")

print(f"\n4. Summary of benchmark videos:")

# Per-video durations in minutes; missing or None values count as 0.
durations = [(v.get('source_duration_min') or 0) for v in validated]
total_duration = sum(durations)
avg_duration = total_duration / len(validated) if validated else 0

print(f"   Total videos: {len(validated)}")
print(f"   Total duration: {total_duration:.1f} min ({total_duration/60:.1f} hours)")
print(f"   Avg duration: {avg_duration:.1f} min")
if durations:
    shortest, longest = min(durations), max(durations)
    print(f"   Duration range: {shortest:.1f} - {longest:.1f} min")

# Language distribution, most common first. most_common() uses a stable
# descending sort, so ties keep first-seen order — identical output to the
# old manual count + sorted(key=-count).
langs = dict(Counter(
    (v.get('language', 'unknown') or 'unknown') for v in validated
).most_common())
print(f"   Languages: {langs}")

print(f"\n5. Sample videos:")
# Show the first five picks so a human can sanity-check the selection.
for sample in validated[:5]:
    title = (sample.get('title') or 'No title')[:50]
    minutes = sample.get('source_duration_min', 0) or 0
    print(f"   • {sample['youtube_id']} | {minutes:.1f}min | {title}")

print("\n" + "=" * 70)
print(f"READY: {len(validated)} videos set to PENDING")
print(f"Run benchmark with:")
print(f"  .venv/bin/python massive_process.py --supabase-queue --max-videos {len(validated)} --output ./output")
print("=" * 70)

# Persist the selected IDs so the benchmark run can be audited later.
id_lines = [f"{v['youtube_id']}\n" for v in validated]
with open('/tmp/benchmark_videos.txt', 'w') as out:
    out.writelines(id_lines)
print(f"\nVideo IDs saved to /tmp/benchmark_videos.txt")
