"""
Prompt + schema helpers for transcript script normalization.

This path is separate from the audio transcription prompt because the task here
starts from existing transcript text, not audio. We pre-classify fully Roman
items locally and only send native/mixed-script items to Gemini.
"""
from __future__ import annotations

import json
import re
import unicodedata
from enum import Enum
from typing import Iterable

from pydantic import BaseModel, Field

from .config import LANGUAGE_MAP


class InputScriptProfile(str, Enum):
    # Script make-up of one transcript item, as assigned by
    # classify_input_script(). Members are also plain strings (str mixin).
    # Only Latin-script letters were counted (also returned for "en" input).
    fully_roman = "fully_roman"
    # Only letters from the target language's script block were counted.
    fully_native = "fully_native"
    # Both target-script and Latin letters were counted, nothing else.
    mixed_native_latin = "mixed_native_latin"
    # Everything else: no counted letters, or letters from another tracked script.
    other = "other"


class TranscriptVariantResult(BaseModel):
    # One output item: the two script variants produced for a single input.
    # The Field descriptions below are part of the generated JSON schema.

    # Identifier copied from the corresponding input item.
    id: str = Field(description="Caller-provided stable input identifier.")
    # The utterance written in the target language's native script.
    native_script_text: str = Field(
        description=(
            "Same utterance rendered in the target native script. Preserve protected "
            "spans exactly: numerics, emails, URLs, handles, hashtags, and file-like tokens."
        )
    )
    # The utterance written in ASCII Roman letters.
    romanized_text: str = Field(
        description=(
            "Same utterance rendered in ASCII Romanization for the target language. "
            "Preserve protected spans exactly."
        )
    )


class TranscriptVariantBatchResult(BaseModel):
    # Top-level response object: one TranscriptVariantResult per input item.
    # get_transcript_variant_json_schema() derives its schema from this model.
    results: list[TranscriptVariantResult]


# Matches literal spans that are extracted or blanked out before script
# analysis. Alternatives, tried in this order at each position:
#   - "http://" or "https://" followed by non-whitespace characters
#   - "www." followed by non-whitespace characters
#   - email-like tokens (local part, "@", dotted domain)
#   - tokens starting with "@" or "#" (handles, hashtags)
#   - digit runs optionally joined by ".", ",", ":", "/" or "-"
#     (numbers, dates, times, version-like values)
# The pattern contains no capturing groups, so findall() yields whole matches.
PROTECTED_SPAN_RE = re.compile(
    r"(?:https?://\S+|www\.\S+|[\w.+-]+@[\w-]+(?:\.[\w-]+)+|[@#][\w._-]+|\b\d+(?:[.,:/-]\d+)*\b)",
    re.IGNORECASE,
)

# Matches a whole string made only of ASCII letters, digits, whitespace (as
# matched by \s) and the punctuation listed in the character class. The empty
# string matches. Used by romanized_text_is_ascii() after protected spans
# have been blanked out.
ASCII_ROMAN_RE = re.compile(r"^[A-Za-z0-9\s.,!?;:'\"(){}\[\]/\\@#&%+*=_<>|-]*$")

# Inclusive Unicode code point ranges for each tracked script, keyed by the
# block name used throughout this module. Each script may list several ranges.
# The Latin range (U+0041..U+024F) also spans non-letter code points; those
# are skipped by the letter/mark category filter in detect_script_counts().
SCRIPT_RANGES: dict[str, tuple[tuple[int, int], ...]] = {
    "Latin": ((0x0041, 0x024F),),
    "Devanagari": ((0x0900, 0x097F),),
    "Bengali": ((0x0980, 0x09FF),),
    "Gurmukhi": ((0x0A00, 0x0A7F),),
    "Gujarati": ((0x0A80, 0x0AFF),),
    "Odia": ((0x0B00, 0x0B7F),),
    "Tamil": ((0x0B80, 0x0BFF),),
    "Telugu": ((0x0C00, 0x0C7F),),
    "Kannada": ((0x0C80, 0x0CFF),),
    "Malayalam": ((0x0D00, 0x0D7F),),
}

# Maps a script name (the value get_target_script_name() returns) to the
# SCRIPT_RANGES key used to count it. Assamese is counted with the Bengali
# block. Script names missing from this table fall back to "Latin" in
# get_target_script_block().
SCRIPT_BLOCK_BY_NAME = {
    "Devanagari": "Devanagari",
    "Assamese": "Bengali",
    "Bengali": "Bengali",
    "Gurmukhi": "Gurmukhi",
    "Gujarati": "Gujarati",
    "Odia": "Odia",
    "Tamil": "Tamil",
    "Telugu": "Telugu",
    "Kannada": "Kannada",
    "Malayalam": "Malayalam",
    "Latin": "Latin",
}


def _resolve_refs(schema: dict, defs: dict | None = None) -> dict:
    if defs is None:
        defs = schema.pop("$defs", {})

    if "$ref" in schema:
        ref_name = schema["$ref"].split("/")[-1]
        resolved = defs.get(ref_name, {})
        return _resolve_refs(dict(resolved), defs)

    result: dict = {}
    for key, value in schema.items():
        if key == "$defs":
            continue
        if isinstance(value, dict):
            result[key] = _resolve_refs(value, defs)
        elif isinstance(value, list):
            result[key] = [
                _resolve_refs(item, defs) if isinstance(item, dict) else item
                for item in value
            ]
        else:
            result[key] = value
    return result


def get_transcript_variant_json_schema() -> dict:
    """Build the reference-free JSON schema for ``TranscriptVariantBatchResult``.

    Returns:
        The model's JSON schema with all ``$ref`` entries inlined and
        ``additionalProperties`` set to ``False`` on both the top-level
        object and the schema of each ``results`` item.
    """
    flat_schema = _resolve_refs(TranscriptVariantBatchResult.model_json_schema())
    result_item_schema = flat_schema["properties"]["results"]["items"]
    # Close the outer object and every result item to unknown keys.
    for node in (flat_schema, result_item_schema):
        node["additionalProperties"] = False
    return flat_schema


def get_target_script_name(language_code: str) -> str:
    """Return the script name recorded for ``language_code`` in ``LANGUAGE_MAP``.

    Codes that are not in ``LANGUAGE_MAP`` are looked up as ``"en"``. Each map
    entry is unpacked as a three-item sequence whose middle item is the
    script name.
    """
    lookup_code = language_code if language_code in LANGUAGE_MAP else "en"
    _first, script_name, _last = LANGUAGE_MAP[lookup_code]
    return script_name


def get_target_script_block(language_code: str) -> str:
    """Return the ``SCRIPT_RANGES`` key used to count the language's script.

    The language's script name is translated through ``SCRIPT_BLOCK_BY_NAME``;
    script names without an entry there resolve to ``"Latin"``.
    """
    return SCRIPT_BLOCK_BY_NAME.get(get_target_script_name(language_code), "Latin")


def extract_protected_spans(text: str) -> list[str]:
    """Return every ``PROTECTED_SPAN_RE`` match in ``text``, in order of appearance.

    Matches are non-overlapping and returned as the full matched substrings.
    """
    # The pattern has no capturing groups, so group(0) is what findall() yields.
    return [hit.group(0) for hit in PROTECTED_SPAN_RE.finditer(text)]


def _strip_protected_spans(text: str) -> str:
    """Return ``text`` with each ``PROTECTED_SPAN_RE`` match replaced by one space."""
    blank = " "
    return re.sub(PROTECTED_SPAN_RE, blank, text)


def _char_in_ranges(char: str, ranges: Iterable[tuple[int, int]]) -> bool:
    codepoint = ord(char)
    return any(start <= codepoint <= end for start, end in ranges)


def detect_script_counts(text: str) -> dict[str, int]:
    """Count letters and combining marks in ``text`` per tracked script.

    Protected spans are blanked out first so they do not contribute. Only
    characters whose Unicode general category starts with ``L`` or ``M`` are
    considered, and each is credited to the first ``SCRIPT_RANGES`` entry
    that contains it. Characters outside every tracked range are not counted.

    Returns:
        A dict with one key per ``SCRIPT_RANGES`` script (zero when unseen).
    """
    tallies = dict.fromkeys(SCRIPT_RANGES, 0)
    for symbol in _strip_protected_spans(text):
        if unicodedata.category(symbol)[:1] not in ("L", "M"):
            continue
        # First matching script wins, in SCRIPT_RANGES order.
        owner = next(
            (
                name
                for name, spans in SCRIPT_RANGES.items()
                if _char_in_ranges(symbol, spans)
            ),
            None,
        )
        if owner is not None:
            tallies[owner] += 1
    return tallies


def classify_input_script(text: str, language_code: str) -> InputScriptProfile:
    """Classify which scripts ``text`` is written in, relative to its language.

    Args:
        text: Transcript text. Protected spans are ignored when counting.
        language_code: Language of the item; selects the native script block.

    Returns:
        ``fully_roman`` for ``"en"`` or when only Latin letters were counted;
        ``fully_native`` when only target-script letters were counted;
        ``mixed_native_latin`` when both were counted and nothing else;
        ``other`` in every remaining case (no counted letters, or letters
        from another tracked script).
    """
    if language_code == "en":
        return InputScriptProfile.fully_roman

    counts = detect_script_counts(text)
    native_block = get_target_script_block(language_code)
    latin_count = counts.get("Latin", 0)
    other_count = sum(
        count
        for script_name, count in counts.items()
        if script_name not in {native_block, "Latin"}
    )

    if native_block == "Latin":
        # The target resolves to Latin (unknown code or unmapped script), so
        # there is no separate native block. Without this branch the native
        # and Latin counts are the same number and Latin-only text would be
        # reported as mixed_native_latin.
        if latin_count > 0 and other_count == 0:
            return InputScriptProfile.fully_roman
        return InputScriptProfile.other

    native_count = counts.get(native_block, 0)
    if other_count == 0:
        if native_count == 0 and latin_count > 0:
            return InputScriptProfile.fully_roman
        if native_count > 0 and latin_count == 0:
            return InputScriptProfile.fully_native
        if native_count > 0 and latin_count > 0:
            return InputScriptProfile.mixed_native_latin
    return InputScriptProfile.other


def romanized_text_is_ascii(text: str) -> bool:
    """Check that ``text`` contains only characters allowed by ``ASCII_ROMAN_RE``.

    Protected spans are blanked out before the check, so their content does
    not affect the result.
    """
    remainder = _strip_protected_spans(text)
    return bool(ASCII_ROMAN_RE.fullmatch(remainder))


def build_transcript_variant_user_prompt(items: list[dict]) -> str:
    """Render the per-request user prompt for a batch of transcript items.

    Args:
        items: Dicts that each provide ``id``, ``language_code``,
            ``input_script_profile`` and ``text``. Any other keys are left out.

    Returns:
        A short instruction line, an ``INPUT:`` marker, and the items as
        indented JSON (non-ASCII characters kept as-is), ending in a newline.

    Raises:
        KeyError: If an item lacks one of the four required keys.
    """
    wanted_keys = ("id", "language_code", "input_script_profile", "text")
    trimmed = [{key: entry[key] for key in wanted_keys} for entry in items]
    body = json.dumps(trimmed, ensure_ascii=False, indent=2)
    instruction = (
        "Produce both variants for every input item. Return one JSON object matching the schema exactly.\n"
    )
    return "".join((instruction, "INPUT:\n", body, "\n"))


# Fixed instruction text returned by get_cacheable_transcript_variant_prompt().
# Keep this just above Gemini Flash's 1024-token cache threshold. The test
# harness verifies the exact cached token count using API metadata at runtime.
# Any edit to the text changes its token count, so re-check the threshold
# after modifying it.
CACHEABLE_TRANSCRIPT_VARIANT_PROMPT = """ROLE
Convert transcript text into two deterministic script variants for the same spoken utterance.
Output only the JSON object required by the schema.

INPUT
Each item has: id, language_code, input_script_profile, text.
The caller already skipped fully_roman items, so every item here must be processed.

TASK
For each item return:
1. native_script_text
2. romanized_text

GLOBAL RULES
- Same utterance only. No translation, paraphrase, cleanup, grammar fixing, expansion, or added context.
- Preserve word order, clause order, repetitions, fillers, and punctuation as closely as possible.
- Never infer omitted words before or after the visible text.
- Return one output item per input item in the same order.
- Copy id exactly from the input.
- No explanations, notes, confidence, or extra keys.

NATIVE_SCRIPT_TEXT
- Render the utterance in the target language's native script.
- If input_script_profile is fully_native, native_script_text may be identical to the input.
- If input_script_profile is mixed_native_latin, convert ordinary spoken Latin-script words into the target native script.
- Do not change protected spans.
- Common spoken borrowings like Google, WhatsApp, click, video, website, support, form, upload, download, meeting may be written in native script when they are ordinary spoken words, not literals.

ROMANIZED_TEXT
- Render the same utterance in plain ASCII Roman letters.
- No diacritics, IPA, or scholarly transliteration.
- Use everyday user-style Romanization.
- Keep Romanization internally consistent within an item.
- Well-known brand names may remain in their natural Roman form when that is the most faithful rendering.

PROTECTED SPANS
Preserve these byte-for-byte unchanged in both outputs:
- email addresses
- URLs and web domains
- @handles and #hashtags
- numbers, dates, times, amounts, version-like numbers
- file names and literal identifier-like tokens

If unsure whether a token is protected, preserve it exactly.

PUNCTUATION
- Keep punctuation close to the input.
- Do not invent new clauses.
- Do not add surrounding quotes.

EXAMPLES

Example 1
input:
या Google पर जाकर whatsapp@support.com पर click करके भी अपने WhatsApp का access ले सकते हैं
native_script_text:
या गूगल पर जाकर whatsapp@support.com पर क्लिक करके भी अपने वॉट्सऐप का एक्सेस ले सकते हैं
romanized_text:
Ya Google par jaakar whatsapp@support.com par click karke bhi apne WhatsApp ka access le sakte hain

Example 2
input:
अनुराग ठाकुर जी तमिलनाडु में आकर आठ नौ जगहों पर प्रचार किए हैं
native_script_text:
अनुराग ठाकुर जी तमिलनाडु में आकर आठ नौ जगहों पर प्रचार किए हैं
romanized_text:
Anurag Thakur ji Tamilnadu mein aakar aath nau jagahon par prachaar kiye hain

Example 3
input:
আমাৰ যিটো website www.brainexcellencelabs.com তাত আপুনি search কৰিলে agenda খিনি জানিব পাৰিব
native_script_text:
আমাৰ যিটো ৱেবছাইট www.brainexcellencelabs.com তাত আপুনি চাৰ্চ কৰিলে এজেণ্ডা খিনি জানিব পাৰিব
romanized_text:
Amar jito website www.brainexcellencelabs.com tat apuni search korile agenda khini janib parib

Example 4
input:
ഈ മുഖം അങ്ങനെയല്ല എന്ന് വെച്ചാൽ they can change it. ഒന്നും കഷ്ടമേ കിട്ടത്തില്ല.
native_script_text:
ഈ മുഖം അങ്ങനെയല്ല എന്ന് വെച്ചാൽ ദേ കാൻ ചേഞ്ച് ഇറ്റ്. ഒന്നും കഷ്ടമേ കിട്ടത്തില്ല.
romanized_text:
Ee mukham anganeyalla ennu vechaal they can change it. Onnum kashtame kittathilla.

Example 5
input:
అందువల్ల డిపార్ట్మెంట్ టు డిపార్ట్మెంట్ యాక్షన్ తీసుకోవడం వల్ల
native_script_text:
అందువల్ల డిపార్ట్మెంట్ టు డిపార్ట్మెంట్ యాక్షన్ తీసుకోవడం వల్ల
romanized_text:
Anduvalla department tu department action teesukovadam valla

Example 6
input:
આ meeting 3 વાગ્યે start થશે
native_script_text:
આ મીટિંગ 3 વાગ્યે સ્ટાર્ટ થશે
romanized_text:
A meeting 3 vaagye start thashe

FINAL
Return a single JSON object matching the schema exactly. Never translate. Preserve protected spans exactly.
"""


def get_cacheable_transcript_variant_prompt() -> str:
    """Return the fixed instruction prompt, ``CACHEABLE_TRANSCRIPT_VARIANT_PROMPT``."""
    return CACHEABLE_TRANSCRIPT_VARIANT_PROMPT