#!/usr/bin/env python3
"""
Audio Fetcher Worker
video_url_uploaded → audio_fetching → audio_fetch_done

Downloads audio from YouTube via Vidssave API, uploads to R2.
CPU only — no GPU needed.
"""

import json
import logging
import os
import random
import subprocess
import sys
import tempfile
import time
import uuid
from typing import Optional

import requests

from common import (
    get_db, get_s3, claim_jobs, get_job, verify_claim,
    update_job, upload_to_r2, POLL_INTERVAL, BATCH_SIZE,
)

# Root logging configuration; every module message goes through `log` below.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log = logging.getLogger("fetcher")

# Stable identity used when claiming jobs; the random suffix avoids
# collisions between replicas when FETCHER_WORKER_ID is not set.
WORKER_ID = os.environ.get("FETCHER_WORKER_ID", f"fetcher-{uuid.uuid4().hex[:8]}")
# Vidssave media-parse endpoint plus its auth token and domain
# (both overridable via environment).
VIDSSAVE_URL = "https://api.vidssave.com/api/contentsite_api/media/parse"
VIDSSAVE_AUTH = os.environ.get("VIDSSAVE_AUTH", "20250901majwlqo")
VIDSSAVE_DOMAIN = os.environ.get("VIDSSAVE_DOMAIN", "api-ak.vidssave.com")
# File of ip:port:user:pass lines, one proxy per line (parsed by load_proxies).
PROXY_LIST_PATH = os.environ.get("PROXY_LIST_PATH", "/app/proxylist.md")

# Populated by load_proxies(): list of "http://user:pw@ip:port" URLs.
PROXIES = []

def load_proxies():
    """Populate the module-level PROXIES list from the proxy list file.

    Each usable line has the form ``ip:port:user:password`` and becomes an
    ``http://user:password@ip:port`` proxy URL.  Blank lines, ``#`` comment /
    markdown lines, and lines that do not split into exactly four
    colon-separated fields are ignored.  Falls back to a ``proxylist.md``
    next to this script when PROXY_LIST_PATH does not exist; with no file at
    all, PROXIES stays empty and yt-dlp runs without a proxy.
    """
    global PROXIES
    path = PROXY_LIST_PATH
    if not os.path.exists(path):
        path = os.path.join(os.path.dirname(__file__), "proxylist.md")
    if not os.path.exists(path):
        log.warning("No proxy list found, yt-dlp will run without proxy")
        return
    loaded = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            # Skip blanks and markdown/comment lines in the .md proxy file.
            if not line or line.startswith("#"):
                continue
            parts = line.split(":")
            if len(parts) == 4:
                ip, port, user, pw = parts
                loaded.append(f"http://{user}:{pw}@{ip}:{port}")
    # Replace contents (don't append) so repeated calls — e.g. a future
    # reload — don't accumulate duplicate entries.
    PROXIES[:] = loaded
    log.info(f"Loaded {len(PROXIES)} proxies")


def get_random_proxy() -> Optional[str]:
    """Return a uniformly random proxy URL from PROXIES, or None when empty.

    The original annotation claimed ``-> str`` but the function returns
    None when no proxies are loaded; the annotation now reflects that.
    """
    if not PROXIES:
        return None
    return random.choice(PROXIES)

# Job-state machine handled by this worker (see module docstring):
# video_url_uploaded → audio_fetching → audio_fetch_done, or failed.
FROM_STATUS = "video_url_uploaded"
CLAIM_PREFIX = "FETCH_CLAIMED"  # claim-marker prefix used by claim_jobs/verify_claim
PROCESSING_STATUS = "audio_fetching"
DONE_STATUS = "audio_fetch_done"
FAIL_STATUS = "failed"


def vidssave_get_resources(youtube_url: str) -> dict:
    """Ask the Vidssave parse endpoint for downloadable media resources.

    Returns the API's "data" payload (title, thumbnail, duration, resources).
    Raises requests.HTTPError on a transport-level failure and RuntimeError
    when the API answers with a non-success status field.
    """
    payload = {
        "auth": VIDSSAVE_AUTH,
        "domain": VIDSSAVE_DOMAIN,
        "origin": "",
        "source": "",
        "link": youtube_url,
    }
    resp = requests.post(VIDSSAVE_URL, data=payload, timeout=60)
    resp.raise_for_status()
    body = resp.json()
    if body.get("status") != 1:
        raise RuntimeError(f"Vidssave failed: {body}")
    return body.get("data", {})


def pick_best_audio(resources: list) -> Optional[dict]:
    """Choose the best audio resource from a Vidssave resources list.

    Only entries with ``type == "audio"`` and a non-empty ``download_url``
    are considered.  Vidssave-proxied URLs (``download_mode == "direct"``)
    are preferred over raw googlevideo ones (``"check_download"``); within
    each pool, qualities are tried in the order 128KBPS, 256KBPS, 48KBPS,
    LOW.  Falls back to the first audio entry when no quality matches.

    Returns None when there is no usable audio at all (the original
    annotation said ``-> dict``, which was wrong for that case).
    """
    audio = [r for r in resources if r.get("type") == "audio" and r.get("download_url")]
    if not audio:
        return None
    # Prefer Vidssave proxy URLs (mode="direct") over raw googlevideo (mode="check_download")
    direct = [r for r in audio if r.get("download_mode") == "direct"]
    check = [r for r in audio if r.get("download_mode") == "check_download"]
    # 128KBPS ranks above 256KBPS — presumably to cap download size while
    # keeping acceptable quality; confirm before reordering.
    for pool in (direct, check):
        for quality in ("128KBPS", "256KBPS", "48KBPS", "LOW"):
            for r in pool:
                if r.get("quality") == quality:
                    return r
    return audio[0]


def process_job(job_id: str, conn, s3):
    """Download one job's audio and advance it to audio_fetch_done.

    Flow: load the job row, re-verify this worker's claim, query Vidssave
    for media resources, pick the best audio stream, download it (curl for
    Vidssave-proxied "direct" URLs; yt-dlp — through a random proxy when
    one is loaded — otherwise), upload the file to R2, then store title /
    thumbnail / duration / audio URL on the job row.  Any exception marks
    the job failed with a truncated error message.
    """
    job = get_job(conn, job_id)
    if not job:
        # Row disappeared between claim and fetch — nothing to do.
        return
    youtube_url = job["youtube_url"]
    log.info(f"[{WORKER_ID}] Processing {job_id[:12]}... ({youtube_url[:50]})")

    if not verify_claim(conn, job_id, CLAIM_PREFIX, WORKER_ID):
        # Claim no longer belongs to this worker (e.g. reassigned); bail out.
        log.warning(f"[{WORKER_ID}] Job {job_id[:12]} no longer claimed, skipping")
        return

    try:
        update_job(conn, job_id, status=PROCESSING_STATUS)

        log.info("  Calling Vidssave API...")
        data = vidssave_get_resources(youtube_url)
        title = data.get("title", "")
        thumbnail = data.get("thumbnail", "")
        duration = data.get("duration")

        log.info(f"  Title: {title[:60]}")
        log.info(f"  Duration: {duration}s")

        audio = pick_best_audio(data.get("resources", []))
        if not audio:
            update_job(conn, job_id, status=FAIL_STATUS, error_message="No audio download URL from Vidssave")
            return

        fmt = audio.get("format", "m4a").lower()
        mode = audio.get("download_mode", "")
        log.info(f"  Best: {audio['quality']} {fmt} mode={mode} ({audio['size']/1024/1024:.1f} MB)")

        # NOTE(review): tempfile.mktemp only reserves a name and is
        # race-prone; it is used because curl/yt-dlp need a not-yet-existing
        # target path. A uuid-based name under tempfile.gettempdir() would be
        # safer — confirm yt-dlp's existing-file behavior before changing.
        tmp_path = tempfile.mktemp(suffix=f".{fmt}")
        try:
            if mode == "direct":
                # Vidssave-hosted URL: a plain HTTP fetch via curl suffices.
                log.info("  Downloading via Vidssave proxy...")
                dl_result = subprocess.run(
                    ["curl", "-L", "-f", "-o", tmp_path, "--connect-timeout", "15",
                     "--max-time", "300", "-s", "-S", audio["download_url"]],
                    capture_output=True, text=True, timeout=320,
                )
                if dl_result.returncode != 0:
                    raise RuntimeError(f"curl failed: {dl_result.stderr[:200]}")
            else:
                # Raw googlevideo URL: let yt-dlp fetch from YouTube instead.
                proxy = get_random_proxy()
                if proxy:
                    log.info(f"  Downloading via yt-dlp + proxy...")
                    # yt-dlp paths force .m4a output regardless of Vidssave's format.
                    tmp_path = tempfile.mktemp(suffix=".m4a")
                    fmt = "m4a"
                    dl_result = subprocess.run(
                        ["yt-dlp", "-f", "bestaudio[ext=m4a]/bestaudio", "--no-playlist",
                         "--proxy", proxy, "-o", tmp_path, youtube_url],
                        capture_output=True, text=True, timeout=600,
                    )
                    if dl_result.returncode != 0:
                        raise RuntimeError(f"yt-dlp+proxy failed: {dl_result.stderr[:200]}")
                else:
                    log.info("  Downloading via yt-dlp (no proxy)...")
                    tmp_path = tempfile.mktemp(suffix=".m4a")
                    fmt = "m4a"
                    dl_result = subprocess.run(
                        ["yt-dlp", "-f", "bestaudio[ext=m4a]/bestaudio", "--no-playlist",
                         "-o", tmp_path, youtube_url],
                        capture_output=True, text=True, timeout=600,
                    )
                    if dl_result.returncode != 0:
                        raise RuntimeError(f"yt-dlp failed: {dl_result.stderr[:200]}")

            file_size = os.path.getsize(tmp_path)
            log.info(f"  Downloaded {file_size/1024/1024:.1f} MB")

            # Object key is namespaced by job id; the extension tracks the
            # format actually downloaded (fmt may have been forced to m4a).
            r2_key = f"{job_id}/raw_audio.{fmt}"
            log.info(f"  Uploading to R2: {r2_key}")
            raw_audio_url = upload_to_r2(s3, tmp_path, r2_key)

            update_job(conn, job_id,
                       status=DONE_STATUS,
                       title=title,
                       thumbnail_url=thumbnail,
                       duration_seconds=duration,
                       raw_audio_url=raw_audio_url)
            log.info(f"  Job {job_id[:12]} → {DONE_STATUS}")

        finally:
            # mktemp never creates the file, so only delete what a download
            # actually produced.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    except Exception as e:
        log.error(f"  Job {job_id[:12]} failed: {e}", exc_info=True)
        update_job(conn, job_id, status=FAIL_STATUS, error_message=str(e)[:500])


def main():
    """Entry point: poll the DB, claim batches of jobs, and process them forever.

    On any loop-level error the DB connection is dropped and re-established
    on the next iteration; Ctrl-C exits cleanly.
    """
    load_proxies()
    log.info(f"Audio Fetcher starting | worker={WORKER_ID} batch={BATCH_SIZE} poll={POLL_INTERVAL}s proxies={len(PROXIES)}")
    s3 = get_s3()
    conn = None

    while True:
        try:
            if conn is None:
                conn = get_db()
            claimed = claim_jobs(conn, FROM_STATUS, CLAIM_PREFIX, WORKER_ID, BATCH_SIZE)
            if not claimed:
                time.sleep(POLL_INTERVAL)
                continue
            log.info(f"[{WORKER_ID}] Claimed {len(claimed)} jobs")
            for jid in claimed:
                process_job(jid, conn, s3)
        except KeyboardInterrupt:
            break
        except Exception as e:
            log.error(f"Loop error: {e}", exc_info=True)
            # Drop the connection so the next iteration reconnects cleanly.
            conn = None
            time.sleep(POLL_INTERVAL)


# Run the worker loop only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
