from __future__ import annotations

import os
import time
from dataclasses import dataclass
from pathlib import Path

from .config import EnvConfig, SUPPORTED_LANGUAGES


# Base directory for transcript data on the worker host; all local reference
# defaults below live under it.  Each default can be overridden via a
# FINAL_EXPORT_* environment variable in FinalExportConfig.from_env.
ROOT = Path("/home/ubuntu/transcripts")
# Canonical (cleaned/filtered) segments parquet — presumably the output of the
# upstream cleaning pipeline; confirm against that pipeline's docs.
DEFAULT_CANONICAL_SEGMENTS = (
    ROOT / "final_data" / "final_cleaned_segments_with_variants_rerouted_repetition_filtered.parquet"
)
# Raw transcription results parquet (pre-cleaning).
DEFAULT_RAW_TRANSCRIPTS = ROOT / "data" / "transcription_results.parquet"
# Consolidated validation results parquet (optional input).
DEFAULT_VALIDATION_PARQUET = ROOT / "data" / "recover_v2_consolidated.parquet"
# YouTube video metadata sidecar CSV (optional input).
DEFAULT_YOUTUBE_META = ROOT / "data" / "youtube_video_metadata_all.csv"


def _env_int(name: str, default: int) -> int:
    """Read env var *name* as an int; return *default* when unset or unparsable."""
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # Malformed override (e.g. "abc" or "") — fall back silently.
        return default


def _env_bool(name: str, default: bool) -> bool:
    """Read env var *name* as a boolean flag; an unset variable means *default*.

    Recognized truthy spellings (case-insensitive): "1", "true", "yes", "on".
    Any other non-None value is treated as False.
    """
    value = os.getenv(name)
    if value is None:
        return default
    normalized = value.strip().lower()
    return normalized in ("1", "true", "yes", "on")


def _env_csv(name: str, default: list[str]) -> list[str]:
    """Read env var *name* as a comma-separated list.

    Unset or blank/whitespace-only values yield *default*.  Individual items
    are stripped, and empty items (e.g. from "a,,b") are dropped.
    """
    value = os.getenv(name, "")
    if not value.strip():
        return default
    stripped = (piece.strip() for piece in value.split(","))
    return [piece for piece in stripped if piece]


@dataclass
class FinalExportConfig:
    """Configuration for the final export pipeline.

    Wraps the shared :class:`EnvConfig` and layers FINAL_EXPORT_* settings on
    top: where reference inputs live (local disk or R2), where outputs go,
    shard sizing, and worker concurrency.  Build instances with
    :meth:`from_env`; call the matching ``validate_for_*`` method before
    starting a stage and abort on a non-empty error list.
    """

    base: EnvConfig  # shared worker/environment settings (DB, R2, identity)
    run_id: str  # identifier for this export run (also in the default output prefix)
    reference_mode: str  # "local" (read from disk) or "r2" (download from bucket)
    reference_bucket: str  # R2 bucket holding reference inputs (r2 mode)
    reference_prefix: str  # key prefix for reference inputs; no trailing "/"
    output_bucket: str  # R2 bucket receiving exported data
    output_prefix: str  # key prefix for all outputs of this run
    local_work_root: Path  # local scratch directory for intermediate files
    canonical_segments_path: Path  # canonical segments parquet (local mode)
    raw_transcripts_path: Path  # raw transcripts parquet (local mode)
    variants_path: Path | None  # optional variants parquet (local mode)
    validation_path: Path | None  # optional validation parquet (local mode)
    youtube_meta_path: Path | None  # optional YouTube metadata CSV (local mode)
    supported_languages: list[str]  # languages eligible for export
    require_variants: bool  # validation fails if variants input is missing
    require_validation: bool  # validation fails if validation input is missing
    microshard_target_rows: int  # target row count per microshard
    final_shard_target_rows: int  # target row count per final shard
    polish_threads: int  # thread count for the polish stage
    duckdb_threads: int  # thread count handed to DuckDB
    compactor_claim_limit: int  # max microshards a compactor claims per batch
    max_videos: int  # cap on videos processed; 0 means no cap
    max_shards: int  # cap on shards emitted; 0 means no cap
    allow_partial_shards: bool  # permit final shards smaller than the target
    language_filters: list[str]  # restrict the run to these languages; [] = all
    language_lease_seconds: int  # lease duration when claiming a language
    claim_stale_after_s: int  # seconds after which a claim counts as stale
    reference_download_concurrency: int  # parallel reference downloads (r2 mode)
    canonical_segments_r2_key: str  # R2 key of canonical segments (r2 mode)
    raw_transcripts_r2_key: str  # R2 key of raw transcripts (r2 mode)
    validation_r2_key: str | None  # optional R2 key of the validation parquet
    youtube_meta_r2_key: str | None  # optional R2 key of YouTube metadata
    variants_r2_key: str | None  # optional R2 key of the variants parquet
    reference_manifest_r2_key: str | None  # optional R2 key of a manifest JSON

    @property
    def worker_id(self) -> str:
        """Worker identity, delegated to the shared base config."""
        return self.base.worker_id

    @property
    def gpu_type(self) -> str:
        """GPU type, delegated to the shared base config."""
        return self.base.gpu_type

    @property
    def database_url(self) -> str:
        """Coordination database URL, delegated to the shared base config."""
        return self.base.database_url

    @property
    def mock_mode(self) -> bool:
        """True when running without real DB/R2 backends (skips connectivity checks)."""
        return self.base.mock_mode

    @property
    def microshard_prefix(self) -> str:
        """Key prefix under which microshards are written."""
        return f"{self.output_prefix.rstrip('/')}/microshards"

    @property
    def shard_prefix(self) -> str:
        """Key prefix under which final shards are written."""
        return f"{self.output_prefix.rstrip('/')}/shards"

    @classmethod
    def from_env(cls) -> "FinalExportConfig":
        """Build a config from FINAL_EXPORT_* environment variables.

        All path- and key-style overrides are whitespace-stripped so that a
        blank or whitespace-only value falls back to its default (or to
        ``None`` for optional inputs) instead of producing a bogus value such
        as ``Path(" ")`` — previously only the variants path got this
        treatment, and the two required R2 keys were never stripped at all.
        """
        base = EnvConfig()
        run_id = os.getenv("FINAL_EXPORT_RUN_ID", f"final-export-{int(time.time())}")
        reference_mode = os.getenv("FINAL_EXPORT_REFERENCE_MODE", "local").strip().lower()
        reference_bucket = os.getenv("FINAL_EXPORT_REFERENCE_BUCKET", base.r2_bucket)
        # Normalize once; the stored prefix and every derived R2 key use it.
        reference_prefix = os.getenv("FINAL_EXPORT_REFERENCE_PREFIX", "final-export-reference").rstrip("/")
        output_bucket = os.getenv("FINAL_EXPORT_OUTPUT_BUCKET", base.r2_bucket)
        output_prefix = os.getenv("FINAL_EXPORT_OUTPUT_PREFIX", f"final-export/{run_id}")
        # Overridable inputs: a blank override means "not provided"/"use default".
        variants_raw = os.getenv("FINAL_EXPORT_VARIANTS_PATH", "").strip()
        validation_raw = os.getenv("FINAL_EXPORT_VALIDATION_PATH", str(DEFAULT_VALIDATION_PARQUET)).strip()
        youtube_raw = os.getenv("FINAL_EXPORT_YOUTUBE_META_PATH", str(DEFAULT_YOUTUBE_META)).strip()
        canonical_raw = os.getenv("FINAL_EXPORT_CANONICAL_SEGMENTS_PATH", "").strip()
        transcripts_raw = os.getenv("FINAL_EXPORT_RAW_TRANSCRIPTS_PATH", "").strip()
        work_root_raw = os.getenv("FINAL_EXPORT_LOCAL_WORK_ROOT", "").strip()
        language_filters = _env_csv("FINAL_EXPORT_LANG_FILTERS", [])
        supported_languages = _env_csv("FINAL_EXPORT_SUPPORTED_LANGS", list(SUPPORTED_LANGUAGES))

        def _reference_key(env_name: str, default_filename: str) -> str:
            # R2 key default lives directly under the normalized reference prefix.
            return os.getenv(env_name, f"{reference_prefix}/{default_filename}").strip()

        return cls(
            base=base,
            run_id=run_id,
            reference_mode=reference_mode,
            reference_bucket=reference_bucket,
            reference_prefix=reference_prefix,
            output_bucket=output_bucket,
            output_prefix=output_prefix,
            local_work_root=Path(work_root_raw or str(ROOT / "tmp" / "final_export")),
            canonical_segments_path=Path(canonical_raw or str(DEFAULT_CANONICAL_SEGMENTS)),
            raw_transcripts_path=Path(transcripts_raw or str(DEFAULT_RAW_TRANSCRIPTS)),
            variants_path=Path(variants_raw) if variants_raw else None,
            validation_path=Path(validation_raw) if validation_raw else None,
            youtube_meta_path=Path(youtube_raw) if youtube_raw else None,
            supported_languages=supported_languages,
            require_variants=_env_bool("FINAL_EXPORT_REQUIRE_VARIANTS", False),
            require_validation=_env_bool("FINAL_EXPORT_REQUIRE_VALIDATION", False),
            microshard_target_rows=_env_int("FINAL_EXPORT_MICROSHARD_TARGET_ROWS", 5000),
            final_shard_target_rows=_env_int("FINAL_EXPORT_FINAL_SHARD_ROWS", 15000),
            polish_threads=_env_int("FINAL_EXPORT_POLISH_THREADS", 16),
            duckdb_threads=_env_int("FINAL_EXPORT_DUCKDB_THREADS", 8),
            compactor_claim_limit=_env_int("FINAL_EXPORT_COMPACTOR_CLAIM_LIMIT", 32),
            max_videos=_env_int("FINAL_EXPORT_MAX_VIDEOS", 0),
            max_shards=_env_int("FINAL_EXPORT_MAX_SHARDS", 0),
            allow_partial_shards=_env_bool("FINAL_EXPORT_ALLOW_PARTIAL_SHARDS", False),
            language_filters=language_filters,
            language_lease_seconds=_env_int("FINAL_EXPORT_LANGUAGE_LEASE_SECONDS", 120),
            claim_stale_after_s=_env_int("FINAL_EXPORT_CLAIM_STALE_AFTER_S", 900),
            reference_download_concurrency=_env_int("FINAL_EXPORT_REFERENCE_DOWNLOAD_CONCURRENCY", 16),
            canonical_segments_r2_key=_reference_key(
                "FINAL_EXPORT_CANONICAL_SEGMENTS_R2_KEY", "canonical_segments.parquet"
            ),
            raw_transcripts_r2_key=_reference_key(
                "FINAL_EXPORT_RAW_TRANSCRIPTS_R2_KEY", "raw_transcripts.parquet"
            ),
            validation_r2_key=_reference_key("FINAL_EXPORT_VALIDATION_R2_KEY", "validation.parquet") or None,
            youtube_meta_r2_key=_reference_key("FINAL_EXPORT_YOUTUBE_META_R2_KEY", "youtube_meta.csv") or None,
            variants_r2_key=_reference_key("FINAL_EXPORT_VARIANTS_R2_KEY", "variants.parquet") or None,
            reference_manifest_r2_key=_reference_key(
                "FINAL_EXPORT_REFERENCE_MANIFEST_R2_KEY", "manifest.json"
            )
            or None,
        )

    def validate_for_video_stage(self) -> list[str]:
        """Return human-readable errors blocking the video-export stage.

        An empty list means the config is usable.  Skips DB/R2 connectivity
        checks in mock mode; then, depending on ``reference_mode``, verifies
        either that local reference files exist or that the R2 keys needed
        to download them are set.
        """
        errors: list[str] = []
        if not self.mock_mode:
            if not self.database_url:
                errors.append("DATABASE_URL is required")
            if not self.base.r2_endpoint_url:
                errors.append("R2_ENDPOINT_URL is required")
        if self.reference_mode not in {"local", "r2"}:
            errors.append("FINAL_EXPORT_REFERENCE_MODE must be 'local' or 'r2'")
        if self.reference_mode == "local":
            if not self.canonical_segments_path.exists():
                errors.append(f"Canonical segments parquet not found: {self.canonical_segments_path}")
            if not self.raw_transcripts_path.exists():
                errors.append(f"Raw transcripts parquet not found: {self.raw_transcripts_path}")
            if self.require_variants and (self.variants_path is None or not self.variants_path.exists()):
                errors.append("FINAL_EXPORT_REQUIRE_VARIANTS=true but variants parquet is missing")
            if self.require_validation and (self.validation_path is None or not self.validation_path.exists()):
                errors.append("FINAL_EXPORT_REQUIRE_VALIDATION=true but validation parquet is missing")
        else:
            # Any non-"local" mode is treated as r2 for these checks; an
            # invalid mode has already been reported above.
            if not self.canonical_segments_r2_key:
                errors.append("FINAL_EXPORT_CANONICAL_SEGMENTS_R2_KEY is required in r2 mode")
            if not self.raw_transcripts_r2_key:
                errors.append("FINAL_EXPORT_RAW_TRANSCRIPTS_R2_KEY is required in r2 mode")
            if self.require_validation and not self.validation_r2_key:
                errors.append("FINAL_EXPORT_VALIDATION_R2_KEY is required in r2 mode when validation is mandatory")
        return errors

    def validate_for_compactor(self) -> list[str]:
        """Return human-readable errors blocking the compactor stage.

        Only DB/R2 connectivity is needed here; both checks are skipped in
        mock mode.
        """
        errors: list[str] = []
        if not self.mock_mode:
            if not self.database_url:
                errors.append("DATABASE_URL is required")
            if not self.base.r2_endpoint_url:
                errors.append("R2_ENDPOINT_URL is required")
        return errors
