"""Integration tests for manifest pipeline — requires /root/sft_data."""

import json
import subprocess
import sys
from pathlib import Path

import pytest

# Directory three levels above this test file; the tests below use it as the
# working directory when invoking the scripts under ``scripts/``.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# Data root that these integration tests require (see the module docstring).
SFT_DATA = Path("/root/sft_data")

# Apply the ``integration`` marker to every test in this module.
pytestmark = pytest.mark.integration

# Skip decorator for tests that need SFT_DATA. The existence check is
# evaluated once, when this module is imported.
requires_data = pytest.mark.skipif(
    not SFT_DATA.exists(),
    reason=f"{SFT_DATA} not found — skipping integration tests",
)


@requires_data
def test_sft_data_root_exists():
    """The SFT data root must be a directory, not merely an existing path.

    ``requires_data`` already skips this test when the path is absent, so a
    bare ``exists()`` assertion could never fail. Checking ``is_dir()`` still
    catches a regular file sitting at that location.
    """
    assert SFT_DATA.is_dir(), f"{SFT_DATA} exists but is not a directory"


@requires_data
def test_final_export_root_exists():
    """Check that the final-export/production/shards path exists under the data root."""
    shards_root = SFT_DATA.joinpath("final-export", "production", "shards")
    assert shards_root.exists()


@requires_data
def test_build_manifest_smoke_en_hi(tmp_path):
    """Build a manifest for en+hi, 1 shard each, and sanity-check the output.

    Asserts that the build script exits with 0, that the output file holds
    more than 1000 non-blank rows, and that the first five rows parse as JSON
    with the expected ``taskname``, ``duration`` and ``lang`` values.
    """
    out = tmp_path / "smoke.jsonl"
    result = subprocess.run(
        [
            # Use the interpreter running pytest so the script sees the same
            # environment (e.g. virtualenv) as the test session.
            sys.executable,
            "scripts/build_manifest.py",
            "--languages",
            "en",
            "hi",
            "--max-shards",
            "1",
            "--output",
            str(out),
        ],
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
        check=False,  # the return code is asserted below with stderr attached
    )
    assert result.returncode == 0, result.stderr
    assert out.exists(), f"{out} was not created"

    # Decode explicitly as UTF-8 so the row text does not depend on the locale.
    lines = [ln for ln in out.read_text(encoding="utf-8").strip().split("\n") if ln.strip()]
    assert len(lines) > 1000, f"Expected >1000 rows, got {len(lines)}"

    # Check valid JSONL on a small sample of rows.
    for line in lines[:5]:
        row = json.loads(line)
        assert row["taskname"] == "asr"
        assert row["duration"] > 0
        assert row["lang"] in ("en", "hi")


@requires_data
def test_build_manifest_quality_filter(tmp_path):
    """--min-quality should reduce row count.

    Builds the same single-shard ``hi`` manifest twice, once with
    ``--min-quality 0.95`` and once without, and asserts that the filtered
    output has strictly fewer rows and that the script reports the drop.
    """
    filtered_path = tmp_path / "filtered.jsonl"
    full_path = tmp_path / "full.jsonl"

    def build(output, *extra_args):
        """Run the build script for one ``hi`` shard, writing to *output*."""
        return subprocess.run(
            [
                # Use the interpreter running pytest so the script sees the
                # same environment as the test session.
                sys.executable,
                "scripts/build_manifest.py",
                "--languages",
                "hi",
                "--max-shards",
                "1",
                *extra_args,
                "--output",
                str(output),
            ],
            capture_output=True,
            text=True,
            cwd=str(PROJECT_ROOT),
            check=False,  # return codes are asserted by the caller
        )

    def count_rows(path):
        """Return the number of non-blank lines in *path* (decoded as UTF-8)."""
        text = path.read_text(encoding="utf-8")
        return sum(1 for ln in text.strip().split("\n") if ln.strip())

    result_filtered = build(filtered_path, "--min-quality", "0.95")
    assert result_filtered.returncode == 0, result_filtered.stderr
    assert "Dropped (low quality)" in result_filtered.stdout, result_filtered.stdout

    result_full = build(full_path)
    assert result_full.returncode == 0, result_full.stderr

    n_filtered = count_rows(filtered_path)
    n_full = count_rows(full_path)
    assert n_filtered < n_full, f"filtered={n_filtered} is not below full={n_full}"


@requires_data
def test_build_manifest_dedupe(tmp_path):
    """Dedupe should produce <= rows vs --no-dedupe.

    Both builds must exit with 0 before their outputs are compared; otherwise
    a failed build would only surface as a missing-file error when the output
    is read.
    """
    base = [
        # Use the interpreter running pytest so the script sees the same
        # environment as the test session.
        sys.executable,
        "scripts/build_manifest.py",
        "--languages",
        "en",
        "--max-shards",
        "1",
    ]
    deduped_path = tmp_path / "deduped.jsonl"
    nodup_path = tmp_path / "nodup.jsonl"

    def count_rows(path):
        """Return the number of non-blank lines in *path* (decoded as UTF-8)."""
        text = path.read_text(encoding="utf-8")
        return sum(1 for ln in text.strip().split("\n") if ln.strip())

    deduped_run = subprocess.run(
        [*base, "--output", str(deduped_path)],
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
        check=False,  # asserted below with stderr attached
    )
    assert deduped_run.returncode == 0, deduped_run.stderr

    nodup_run = subprocess.run(
        [*base, "--no-dedupe", "--output", str(nodup_path)],
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
        check=False,  # asserted below with stderr attached
    )
    assert nodup_run.returncode == 0, nodup_run.stderr

    n_deduped = count_rows(deduped_path)
    n_nodup = count_rows(nodup_path)
    assert n_deduped <= n_nodup, f"deduped={n_deduped} exceeds no-dedupe={n_nodup}"


@requires_data
def test_validate_manifest_passes(tmp_path):
    """Validator should pass on a freshly built manifest.

    The build step is asserted separately so that a build failure is reported
    as such instead of showing up as a validator failure.
    """
    out = tmp_path / "manifest.jsonl"
    build = subprocess.run(
        [
            # Use the interpreter running pytest so the script sees the same
            # environment as the test session.
            sys.executable,
            "scripts/build_manifest.py",
            "--languages",
            "en",
            "--max-shards",
            "1",
            "--output",
            str(out),
        ],
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
        check=False,  # asserted below with stderr attached
    )
    assert build.returncode == 0, build.stderr

    result = subprocess.run(
        [sys.executable, "scripts/validate_manifest.py", str(out)],
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
        check=False,  # asserted below with stderr attached
    )
    assert result.returncode == 0, result.stderr
    assert "PASS" in result.stdout, result.stdout
