"""Unit tests for provenance metadata in split and tokenizer scripts."""

import hashlib
import json
import subprocess
from pathlib import Path

# Repository root, three levels up from this test file; passed as cwd to
# subprocess so relative "scripts/..." paths resolve regardless of where
# pytest is invoked from.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent


def _make_manifest(tmp_path, count=50, lang="en"):
    """Create a minimal JSONL manifest."""
    tmp_path.mkdir(parents=True, exist_ok=True)
    path = tmp_path / "manifest.jsonl"
    with open(path, "w") as f:
        for i in range(count):
            row = {
                "audio_filepath": f"/data/{lang}/audio.tar",
                "tar_member": f"seg_{i}.flac",
                "text": f"hello world sentence {i}",
                "duration": 3.0,
                "lang": lang,
                "taskname": "asr",
                "source_lang": lang,
                "target_lang": lang,
            }
            f.write(json.dumps(row) + "\n")
    return path


def _sha256(path):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def test_split_metadata_created(tmp_path):
    """split_manifest should create split_metadata.json."""
    manifest = _make_manifest(tmp_path, count=100)

    # Run the split script; cwd is the project root so the relative script
    # path resolves.
    cmd = [
        "python3",
        "scripts/split_manifest.py",
        "--input", str(manifest),
        "--train-output", str(tmp_path / "train.jsonl"),
        "--val-output", str(tmp_path / "val.jsonl"),
        "--val-ratio", "0.1",
        "--seed", "42",
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, cwd=str(PROJECT_ROOT))
    assert proc.returncode == 0, proc.stderr

    metadata_path = tmp_path / "split_metadata.json"
    assert metadata_path.exists(), "split_metadata.json not created"

    # The metadata must record the split parameters and the provenance of
    # the source manifest (path, content hash, timestamp).
    metadata = json.loads(metadata_path.read_text())
    assert metadata["seed"] == 42
    assert metadata["val_ratio"] == 0.1
    assert metadata["stratify_by_lang"] is True
    assert metadata["train_rows"] + metadata["val_rows"] == 100
    assert "source_manifest_sha256" in metadata
    assert metadata["source_manifest_sha256"] == _sha256(manifest)
    assert "source_manifest" in metadata
    assert "created_at" in metadata


def test_split_metadata_sha256_changes_with_content(tmp_path):
    """Different manifest content should produce different sha256."""
    m1 = _make_manifest(tmp_path / "a", count=50)
    m2 = _make_manifest(tmp_path / "b", count=60)

    for manifest, subdir in [(m1, "a"), (m2, "b")]:
        result = subprocess.run(
            [
                "python3",
                "scripts/split_manifest.py",
                "--input",
                str(manifest),
                "--train-output",
                str(tmp_path / subdir / "train.jsonl"),
                "--val-output",
                str(tmp_path / subdir / "val.jsonl"),
                "--val-ratio",
                "0.1",
                "--seed",
                "42",
            ],
            capture_output=True,
            text=True,
            cwd=str(PROJECT_ROOT),
        )
        # Fail fast with the script's stderr instead of a confusing
        # FileNotFoundError on read_text below if the script crashes.
        assert result.returncode == 0, result.stderr

    meta1 = json.loads((tmp_path / "a" / "split_metadata.json").read_text())
    meta2 = json.loads((tmp_path / "b" / "split_metadata.json").read_text())
    # Manifests differ in row count, so their content hashes must differ.
    assert meta1["source_manifest_sha256"] != meta2["source_manifest_sha256"]


def test_tokenizer_metadata_has_source_hash(tmp_path):
    """Tokenizer metadata should contain source manifest sha256."""
    # Build a small manifest of 100 distinct text rows for training.
    manifest = tmp_path / "manifest.jsonl"
    rows = [
        json.dumps(
            {
                "audio_filepath": "/data/audio.tar",
                "tar_member": f"seg_{idx}.flac",
                "text": f"hello world sentence number {idx}",
                "duration": 3.0,
                "lang": "en",
                "taskname": "asr",
                "source_lang": "en",
                "target_lang": "en",
            }
        )
        + "\n"
        for idx in range(100)
    ]
    with open(manifest, "w") as fh:
        fh.writelines(rows)

    tokenizer_dir = tmp_path / "tok"
    proc = subprocess.run(
        [
            "python3",
            "scripts/build_tokenizer.py",
            "--manifest", str(manifest),
            "--output-dir", str(tokenizer_dir),
            "--vocab-size", "64",
        ],
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
    )
    assert proc.returncode == 0, proc.stderr

    # Metadata must tie the tokenizer to its source manifest (path + hash)
    # and record both requested and actual vocab sizes.
    metadata = json.loads((tokenizer_dir / "metadata.json").read_text())
    assert "source_manifest_sha256" in metadata
    assert metadata["source_manifest_sha256"] == _sha256(manifest)
    assert "source_manifest" in metadata
    assert "vocab_size_requested" in metadata
    assert metadata["vocab_size_requested"] == 64
    assert metadata["vocab_size"] >= 1
