#!/usr/bin/env python3
"""
Convert soprano_data (indicvoices_r, rasa_hindi, IISc_SYSPIN) into a
HuggingFace Dataset with 'audio', 'text', and 'speaker' columns,
ready for the KaniTTS-2 dataset pipeline.
"""

import csv
import json
import os
from datasets import Dataset, Audio, Features, Value

SOPRANO_ROOT = "/home/ubuntu/soprano_data"
OUTPUT_DIR = "/home/ubuntu/kanitts-2-dataset-pipeline/hindi_hf_dataset"


def load_indicvoices(root=None):
    """Load the indicvoices_r subset as audio/text/speaker dicts.

    Reads ``metadata.csv``, skipping test-split rows, empty transcripts,
    and rows whose wav file is missing on disk.

    Args:
        root: Dataset root directory; defaults to SOPRANO_ROOT.

    Returns:
        List of dicts with 'audio' (wav path), 'text', and 'speaker' keys.
    """
    root = SOPRANO_ROOT if root is None else root
    csv_path = os.path.join(root, "indicvoices_r", "metadata.csv")
    wav_dir = os.path.join(root, "indicvoices_r", "wavs")
    entries = []
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            # Hold out the test split entirely.
            if row.get("split") == "test":
                continue
            text = row["text"].strip()
            if not text:
                continue
            wav_path = os.path.join(wav_dir, row["filename"])
            if not os.path.isfile(wav_path):
                continue
            # Fall back to "unknown" (not "iv_unknown") so the iv_ prefix
            # below is never applied twice; `or` also catches empty strings.
            speaker = row.get("speaker_id") or "unknown"
            entries.append({"audio": wav_path, "text": text, "speaker": f"iv_{speaker}"})
    print(f"indicvoices_r: {len(entries)} samples")
    return entries


def load_rasa_hindi(root=None):
    """Load the rasa_hindi subset as audio/text/speaker dicts.

    Reads ``metadata.csv``, normalizing filenames to a ``.wav`` suffix and
    skipping empty transcripts and rows whose wav file is missing on disk.
    Speakers are keyed by gender (``rasa_male`` / ``rasa_female``).

    Args:
        root: Dataset root directory; defaults to SOPRANO_ROOT.

    Returns:
        List of dicts with 'audio' (wav path), 'text', and 'speaker' keys.
    """
    root = SOPRANO_ROOT if root is None else root
    csv_path = os.path.join(root, "rasa_hindi", "metadata.csv")
    wav_dir = os.path.join(root, "rasa_hindi", "wavs")
    entries = []
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            text = row["text"].strip()
            if not text:
                continue
            filename = row["filename"]
            if not filename.endswith(".wav"):
                filename += ".wav"
            wav_path = os.path.join(wav_dir, filename)
            if not os.path.isfile(wav_path):
                continue
            # `or` catches both a missing column and an empty value, so the
            # speaker id is never the bare prefix "rasa_".
            gender = (row.get("gender") or "unknown").lower()
            entries.append({"audio": wav_path, "text": text, "speaker": f"rasa_{gender}"})
    print(f"rasa_hindi: {len(entries)} samples")
    return entries


def load_iisc_syspin(root=None):
    """Load the IISc SYSPIN Hindi male/female speakers.

    Each speaker directory must contain one ``*_Transcripts.json`` file
    mapping wav ids to transcripts, plus a ``wav/`` directory. Rows with
    empty transcripts or missing wav files are skipped.

    Args:
        root: Dataset root directory; defaults to SOPRANO_ROOT.

    Returns:
        List of dicts with 'audio' (wav path), 'text', and 'speaker' keys.

    Raises:
        FileNotFoundError: If a speaker directory has no transcripts JSON.
    """
    root = SOPRANO_ROOT if root is None else root
    base = os.path.join(root, "IISc_SYSPIN_Data")
    entries = []
    speakers = [
        ("IISc_SYSPINProject_Hindi_Male_Spk001_HC", "iisc_male"),
        ("IISc_SYSPINProject_Hindi_Female_Spk001_HC", "iisc_female"),
    ]
    for spk_dir, speaker_id in speakers:
        spk_path = os.path.join(base, spk_dir)
        # sorted() makes the pick deterministic (os.listdir order is
        # arbitrary); fail with a clear message instead of a bare IndexError.
        candidates = sorted(f for f in os.listdir(spk_path) if f.endswith("_Transcripts.json"))
        if not candidates:
            raise FileNotFoundError(f"No *_Transcripts.json found in {spk_path}")
        wav_dir = os.path.join(spk_path, "wav")
        with open(os.path.join(spk_path, candidates[0]), encoding="utf-8") as f:
            data = json.load(f)
        for wav_id, info in data["Transcripts"].items():
            text = info["Transcript"].strip()
            if not text:
                continue
            wav_path = os.path.join(wav_dir, wav_id + ".wav")
            if not os.path.isfile(wav_path):
                continue
            entries.append({"audio": wav_path, "text": text, "speaker": speaker_id})
    print(f"IISc_SYSPIN: {len(entries)} samples")
    return entries


def main():
    """Combine all three corpora into one HF dataset and save it to disk.

    The 'audio' column holds file paths only (no decoding here); the
    downstream pipeline is expected to cast it to an Audio feature.

    Raises:
        SystemExit: If no samples were found (bad SOPRANO_ROOT layout),
            instead of writing an empty dataset and crashing on ds[0].
    """
    print("Loading all datasets...")
    all_entries = load_indicvoices() + load_rasa_hindi() + load_iisc_syspin()
    print(f"\nTotal: {len(all_entries)} samples")

    # Bail out early: the original flow would save an empty dataset and then
    # raise IndexError on the ds[0] example prints below.
    if not all_entries:
        raise SystemExit("No samples found -- check SOPRANO_ROOT and dataset layout.")

    print("Creating HuggingFace Dataset (paths only, no audio loading)...")
    columns = ("audio", "text", "speaker")
    ds = Dataset.from_dict({col: [e[col] for e in all_entries] for col in columns})

    print(f"Saving to {OUTPUT_DIR}...")
    ds.save_to_disk(OUTPUT_DIR)

    print(f"\nDone! Dataset saved with {len(ds)} samples.")
    print(f"Columns: {ds.column_names}")
    print(f"Example audio path: {ds[0]['audio']}")
    print(f"Example text: {ds[0]['text'][:80]}...")


if __name__ == "__main__":
    main()
