"""Tests for raw-tar recovery loader."""
from __future__ import annotations

import json
from pathlib import Path

import numpy as np
import soundfile as sf

from src.audio_polish import polish_segment
from validations.recover_loader import (
    load_recover_segments,
    replay_segment_id,
)


def _make_split_audio(sr: int = 16000) -> np.ndarray:
    """Build a deterministic 26 s, 220 Hz float32 tone with two quiet valleys.

    The tone has amplitude 0.25. Half-second windows starting at 8 s and 16 s
    are scaled by 1e-4 in place so they act as near-silent gaps.

    Args:
        sr: Sample rate in Hz used to size the signal and place the valleys.

    Returns:
        One-dimensional ``float32`` array holding ``int(sr * 26.0)`` samples.
    """
    total_seconds = 26.0
    sample_count = int(sr * total_seconds)
    timeline = np.linspace(0, total_seconds, sample_count, endpoint=False)
    tone = 0.25 * np.sin(2 * np.pi * 220 * timeline)
    signal = tone.astype(np.float32)

    valley_len = int(0.5 * sr)
    for valley_offset_s in (8.0, 16.0):
        first = int(valley_offset_s * sr)
        # Attenuate rather than zero out, so the valley stays a scaled copy of the tone.
        signal[first:first + valley_len] *= 0.0001
    return signal


def _write_raw_video(
    tmp_path: Path,
    video_id: str,
    segment_name: str,
    audio: np.ndarray,
    sr: int = 16000,
) -> Path:
    """Create a minimal raw video directory on disk and return the segment path.

    Produces ``<tmp_path>/<video_id>/metadata.json`` (video id plus language
    ``"te"``) and writes ``audio`` to ``<tmp_path>/<video_id>/segments/<segment_name>``.

    Args:
        tmp_path: Root directory under which the video directory is created.
        video_id: Name of the video directory; also stored in the metadata.
        segment_name: File name of the audio segment inside ``segments/``.
        audio: Samples handed to ``sf.write``.
        sr: Sample rate handed to ``sf.write``.

    Returns:
        Path of the audio file that was written.
    """
    video_root = tmp_path / video_id
    segments_root = video_root / "segments"
    # Creating the nested directory also creates the video directory itself.
    segments_root.mkdir(parents=True, exist_ok=True)

    metadata = {"video_id": video_id, "language": "te"}
    metadata_file = video_root / "metadata.json"
    metadata_file.write_text(json.dumps(metadata))

    segment_path = segments_root / segment_name
    sf.write(segment_path, audio, sr)
    return segment_path


def _valid_children(raw_path: Path) -> list:
    """Return the segments ``polish_segment`` yields for ``raw_path`` that are not discarded.

    A segment is dropped when its ``trim_meta.discarded`` flag is truthy; the
    order of the remaining segments is preserved.
    """
    kept = []
    for child in polish_segment(raw_path):
        if child.trim_meta.discarded:
            continue
        kept.append(child)
    return kept


def test_recover_loader_matches_requested_split_child(tmp_path: Path):
    """Targeting one split child yields exactly that child, with extras reported.

    Transcript rows are built for the first two valid children of a single
    parent file; only the second row's id is requested from the loader.
    """
    vid = "video_split_test"
    source_name = "SPEAKER_00_0000_0.00-26.00.flac"
    raw_file = _write_raw_video(tmp_path, vid, source_name, _make_split_audio())

    children = _valid_children(raw_file)
    assert len(children) >= 3

    def _row_for(meta) -> dict:
        # One transcript row keyed by the replay id derived from the trim metadata.
        return {
            "segment_file": replay_segment_id(
                meta.original_file,
                meta.was_split,
                meta.split_index,
            ),
            "transcription": f"text-{meta.split_index}",
            "tagged": f"tagged-{meta.split_index}",
            "detected_language": "te",
            "quality_score": 0.9,
            "speaker_emotion": "neutral",
            "speaker_style": "conversational",
            "speaker_pace": "normal",
            "speaker_accent": "",
        }

    rows = [_row_for(child.trim_meta) for child in children[:2]]
    wanted_id = rows[1]["segment_file"]

    outcome = load_recover_segments(
        tmp_path,
        vid,
        rows,
        target_segment_ids={wanted_id},
    )

    assert outcome.missing_tx_ids == []
    assert outcome.matched_tx_ids == [wanted_id]
    assert [seg.segment_file for seg in outcome.segments] == [wanted_id]
    assert outcome.segments[0].gemini_transcription == "text-1"
    # Remaining replay-only children from this parent are surfaced as salvage candidates.
    assert outcome.extra_regen_ids


def test_recover_loader_reports_historical_id_missing_after_replay(tmp_path: Path):
    """A real replay id is matched while a fabricated ``_split99`` id is reported missing.

    Two transcript rows are passed to the loader: one keyed by the id of the
    first valid child, one keyed by an id built from the parent file name with
    a ``_split99`` suffix.
    """
    vid = "video_missing_test"
    source_name = "SPEAKER_00_0001_0.00-26.00.flac"
    raw_file = _write_raw_video(tmp_path, vid, source_name, _make_split_audio())

    children = _valid_children(raw_file)
    assert children

    first_meta = children[0].trim_meta
    known_id = replay_segment_id(
        first_meta.original_file,
        first_meta.was_split,
        first_meta.split_index,
    )
    ghost_id = f"{source_name}_split99"

    # Each row uses its label for both the transcription and the tagged text.
    rows = [
        {
            "segment_file": seg_id,
            "transcription": label,
            "tagged": label,
            "detected_language": "te",
            "quality_score": 0.8,
        }
        for seg_id, label in ((known_id, "real"), (ghost_id, "missing"))
    ]

    outcome = load_recover_segments(tmp_path, vid, rows)

    assert known_id in outcome.matched_tx_ids
    assert ghost_id in outcome.missing_tx_ids