"""Unit tests for build_tokenizer.py — uses synthetic data only."""

import json
import subprocess
from pathlib import Path

# Repository root, three directory levels above this test file
# (i.e. <root>/tests/<pkg>/this_file.py -> <root>); subprocess calls below
# use it as cwd so the relative "scripts/build_tokenizer.py" path resolves.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent


def _make_manifest(tmp_path, texts, lang="en"):
    """Create a minimal JSONL manifest with given texts."""
    path = tmp_path / "manifest.jsonl"
    with open(path, "w") as f:
        for i, text in enumerate(texts):
            row = {
                "audio_filepath": f"/data/{lang}/audio.tar",
                "tar_member": f"seg_{i}.flac",
                "text": text,
                "duration": 3.0,
                "lang": lang,
                "taskname": "asr",
                "source_lang": lang,
                "target_lang": lang,
            }
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
    return path


def test_builds_tokenizer_successfully(tmp_path):
    """Basic tokenizer build with synthetic text."""
    manifest = _make_manifest(
        tmp_path, [f"hello world sentence number {i}" for i in range(100)]
    )
    out_dir = tmp_path / "tok"

    cmd = [
        "python3",
        "scripts/build_tokenizer.py",
        "--manifest",
        str(manifest),
        "--output-dir",
        str(out_dir),
        "--vocab-size",
        "64",
    ]
    proc = subprocess.run(
        cmd, capture_output=True, text=True, cwd=str(PROJECT_ROOT)
    )
    assert proc.returncode == 0, proc.stderr

    # All three artifacts must be produced.
    for artifact in ("tokenizer.model", "tokenizer.vocab", "metadata.json"):
        assert (out_dir / artifact).exists()

    # Verify metadata
    metadata = json.loads((out_dir / "metadata.json").read_text())
    assert metadata["model_type"] == "bpe"
    assert metadata["num_training_sentences"] == 100
    assert "created_at" in metadata


def test_fails_on_missing_manifest(tmp_path):
    """Should fail clearly when manifest doesn't exist."""
    missing = tmp_path / "nonexistent.jsonl"
    cmd = [
        "python3",
        "scripts/build_tokenizer.py",
        "--manifest",
        str(missing),
        "--output-dir",
        str(tmp_path / "tok"),
        "--vocab-size",
        "64",
    ]
    proc = subprocess.run(
        cmd, capture_output=True, text=True, cwd=str(PROJECT_ROOT)
    )
    assert proc.returncode != 0
    assert "Manifest not found" in proc.stderr


def test_fails_on_empty_text_manifest(tmp_path):
    """Should fail when all text fields are empty.

    Uses _make_manifest with five empty transcripts instead of duplicating
    the row-writing loop inline; the script must reject a manifest that
    yields no trainable text.
    """
    manifest = _make_manifest(tmp_path, [""] * 5)

    result = subprocess.run(
        [
            "python3",
            "scripts/build_tokenizer.py",
            "--manifest",
            str(manifest),
            "--output-dir",
            str(tmp_path / "tok"),
            "--vocab-size",
            "64",
        ],
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
    )
    assert result.returncode != 0
    assert "No valid text found" in result.stderr


def test_model_type_unigram(tmp_path):
    """--model-type unigram should work.

    Unigram training needs a diverse, large-enough corpus to extract
    subword candidates, so generate 2000 random 20-word sentences from
    single-character "words", seeded for reproducibility.
    """
    # Plain local imports instead of the original __import__("random") hack.
    import random
    import string

    words = list(string.ascii_lowercase)  # 26 single-char words
    rng = random.Random(42)
    texts = [" ".join(rng.choices(words, k=20)) for _ in range(2000)]
    manifest = _make_manifest(tmp_path, texts)
    out_dir = tmp_path / "tok"

    result = subprocess.run(
        [
            "python3",
            "scripts/build_tokenizer.py",
            "--manifest",
            str(manifest),
            "--output-dir",
            str(out_dir),
            "--vocab-size",
            "50",
            "--model-type",
            "unigram",
        ],
        capture_output=True,
        text=True,
        cwd=str(PROJECT_ROOT),
    )
    assert result.returncode == 0, result.stderr

    meta = json.loads((out_dir / "metadata.json").read_text())
    assert meta["model_type"] == "unigram"


def test_max_sentences_limits_training(tmp_path):
    """--max-sentences should truncate training data."""
    manifest = _make_manifest(tmp_path, [f"sentence {i}" for i in range(200)])
    out_dir = tmp_path / "tok"

    cmd = [
        "python3",
        "scripts/build_tokenizer.py",
        "--manifest",
        str(manifest),
        "--output-dir",
        str(out_dir),
        "--vocab-size",
        "64",
        "--max-sentences",
        "50",
    ]
    proc = subprocess.run(
        cmd, capture_output=True, text=True, cwd=str(PROJECT_ROOT)
    )
    assert proc.returncode == 0, proc.stderr
    assert "Truncated to 50" in proc.stdout

    metadata = json.loads((out_dir / "metadata.json").read_text())
    assert metadata["num_training_sentences"] == 50
