"""
Pydantic schemas for structured transcription output.

Output fields (what Gemini returns per segment):
  1. transcription   - Native script with punctuation (PRIMARY, authoritative)
  2. code_switch     - Mixed script: native + English in Latin (derived from 1)
  3. romanized       - Full Latin transliteration (derived from 1)
  4. tagged          - Code-switch + audio event tags [laugh] etc. (derived from 2)
  5. speaker         - Metadata: emotion, style, pace, accent

Prompt v3 design (evolved from v1 original -> v2 strict -> v3 current):
  - Single master template, dynamic language pack injection for 12 languages
  - "Verbatim transcription system" persona (explicit NOT assistant negation)
  - Acoustic punctuation (prosody-based pauses, not grammar rules)
  - Anti-hallucination: [UNK] for uncertain words, no sentence completion
  - Field derivation ordering: transcription is authoritative, others derive from it
  - Transliterate != Translate distinction (fixed v2 contradiction that caused
    English speech to be translated into Telugu on segment 3 test)
  - Numbers as spoken words for TTS fidelity
  - Per-language script rules for known error-prone cases
"""
from typing import Optional, Dict
from pydantic import BaseModel, Field


# === Language Pack Configs ===
# Per-language metadata + script-specific rules injected into the master prompt.
# Only rules that prevent ACTUAL transcription errors, not generic grammar.
# v1: 12 languages (11 Indic + English). Add more as needed.

# Per-language pack: BCP-47 tag, script name, and script-specific rules.
# "script_rules" is injected verbatim into the master prompt's SCRIPT RULES
# section; "script"/"bcp47" fill the field rules and TARGET line.
LANGUAGE_CONFIGS: Dict[str, Dict[str, str]] = {
    "Hindi": {
        "bcp47": "hi-IN",
        "script": "Devanagari",
        # Nukta: ज़/फ़ vs ज/फ matters for Urdu-influenced speakers.
        # Schwa: written राम but pronounced "Ram" - keep orthographic form.
        "script_rules": (
            "Preserve Nukta distinctions (ज़ vs ज, फ़ vs फ) when the speaker "
            "clearly pronounces the distinction. Use standard Devanagari "
            "orthography even when Schwa is deleted in pronunciation."
        ),
    },
    "Marathi": {
        "bcp47": "mr-IN",
        "script": "Devanagari",
        # ळ is unique to Marathi among Devanagari languages.
        "script_rules": (
            "Preserve ळ (retroflex lateral) accurately. Use standard "
            "Marathi Devanagari orthography."
        ),
    },
    "Telugu": {
        "bcp47": "te-IN",
        "script": "Telugu",
        # Agglutinative: sandhi combines words. Don't over-split.
        "script_rules": (
            "Preserve Sandhi (combined/agglutinated words) as spoken. "
            "If the speaker says a combined form as one unit, write it "
            "as one word. Do not split or merge beyond what is spoken."
        ),
    },
    "Tamil": {
        "bcp47": "ta-IN",
        "script": "Tamil",
        # Short vs long vowels are phonemically distinct in Tamil.
        # Tamil script doesn't distinguish voiced/voiceless (same letter for both).
        "script_rules": (
            "Strictly distinguish short and long vowels (e vs ē, o vs ō). "
            "Do not colloquialize formal endings unless the speaker actually "
            "uses colloquial forms. Preserve spoken form faithfully."
        ),
    },
    "Kannada": {
        "bcp47": "kn-IN",
        "script": "Kannada",
        # Agglutinative like Telugu; sandhi forms are kept as spoken.
        "script_rules": (
            "Preserve Sandhi and agglutinated forms as spoken. "
            "Use standard Kannada script for borrowed Sanskrit sounds."
        ),
    },
    "Malayalam": {
        "bcp47": "ml-IN",
        "script": "Malayalam",
        # Heavily agglutinative + complex chillu consonants.
        "script_rules": (
            "Preserve complex conjunct consonants (chillu letters) accurately. "
            "Do not split agglutinated words. Write combined forms as spoken."
        ),
    },
    "Gujarati": {
        "bcp47": "gu-IN",
        "script": "Gujarati",
        # No known error-prone cases yet; generic orthography rule only.
        "script_rules": "Use standard Gujarati script orthography.",
    },
    "Punjabi": {
        "bcp47": "pa-IN",
        "script": "Gurmukhi",
        # Tonal language - tones are phonemically distinct.
        "script_rules": (
            "Use Gurmukhi script. Ensure correct representation of "
            "tonal markers where applicable in the script."
        ),
    },
    "Bengali": {
        "bcp47": "bn-IN",
        "script": "Bengali",
        # Inherent vowel /ɔ/ not /a/ (differs from Hindi Devanagari).
        "script_rules": (
            "Inherent vowel in Bengali script is /ɔ/ not /a/. "
            "Preserve Chandrabindu for nasalization where spoken."
        ),
    },
    "Assamese": {
        "bcp47": "as-IN",
        "script": "Assamese",
        # Critical: Assamese-specific ৰ (ra) and ৱ (wa) are NOT Bengali chars.
        "script_rules": (
            "Use Assamese-specific characters: ৰ (ra) and ৱ (wa), NOT their "
            "Bengali equivalents. The script resembles Bengali but these "
            "characters are distinct and must be correct."
        ),
    },
    "Odia": {
        "bcp47": "or-IN",
        "script": "Odia",
        # No known error-prone cases yet; generic orthography rule only.
        "script_rules": "Use standard Odia script orthography.",
    },
    "English": {
        "bcp47": "en-IN",
        "script": "Latin",
        # Already Latin-script; the rule targets accent normalization only.
        "script_rules": (
            "Use standard English spelling regardless of accent. "
            "Do not write phonetic approximations of accented speech."
        ),
    },
}


class SpeakerMeta(BaseModel):
    """Per-segment speaker metadata used downstream for TTS training.

    Every field is a free-form string: the permitted vocabulary is conveyed
    to the model through the Field description rather than enforced as an
    enum, so an off-vocabulary value never fails validation.
    """

    # Dominant perceived emotion of the speaker in this segment.
    emotion: str = Field(
        description="neutral, happy, sad, angry, excited, surprised",
        default="neutral",
    )
    # Overall delivery style of the speech.
    speaking_style: str = Field(
        description="conversational, narrative, excited, calm, emphatic, sarcastic, formal",
        default="conversational",
    )
    # Rough speaking rate.
    pace: str = Field(description="slow, normal, fast", default="normal")
    # Regional accent; stays empty rather than guessed when undetectable.
    accent: str = Field(
        description="Regional accent/dialect if detectable, empty string if unknown",
        default="",
    )


class TranscriptionOutput(BaseModel):
    """The four parallel transcription renderings plus speaker metadata.

    Per the prompt design, "transcription" is the authoritative field; the
    other text fields re-render the same words in different scripts.
    """

    # 1. Primary output: everything in the native script.
    transcription: str = Field(
        description=(
            "Native script transcription with minimal punctuation. "
            "English words transliterated phonetically in native script."
        )
    )
    # 2. Native script with English words kept in Latin script.
    code_switch: str = Field(
        description=(
            "Mixed script: native language in native script, "
            "English words in Latin script. Minimal punctuation."
        )
    )
    # 3. Everything romanized.
    romanized: str = Field(
        description="Full Roman/Latin script transliteration as pronounced."
    )
    # 4. code_switch plus inline audio event tags.
    tagged: str = Field(
        description=(
            "Code-switch transcription with audio event tags "
            "([laugh], [cough], [sigh], [breath], [singing], "
            "[noise], [music], [applause]) at positions where they occur."
        )
    )
    # 5. Optional prosody metadata; None when the model omits it.
    speaker: Optional[SpeakerMeta] = Field(
        description="Speaker metadata: emotion, style, pace, accent.",
        default=None,
    )


class TranscriptionResult(BaseModel):
    """Full record for one transcribed segment: outputs plus run metadata."""

    # --- Segment identity ---
    segment_id: str = Field(description="Identifier for the audio segment")
    chunk_index: int = Field(description="Chunk index if segment was split", default=0)
    total_chunks: int = Field(description="Total chunks for this segment", default=1)
    duration_sec: float = Field(description="Duration of audio in seconds")
    language: str = Field(description="Primary language of the audio")

    # --- Model output ---
    transcription: TranscriptionOutput = Field(description="The transcription outputs")

    # --- Run metadata ---
    model_used: str = Field(description="Gemini model used for transcription")
    thinking_level: Optional[str] = Field(description="Thinking level used", default=None)
    processing_time_sec: Optional[float] = Field(description="API call time", default=None)

    # --- Validation fields (filled by CTC validator, not Gemini) ---
    validation_status: Optional[str] = Field(description="accept/review/reject", default=None)
    validation_score: Optional[float] = Field(description="Alignment score 0-1", default=None)

    @property
    def native(self) -> str:
        """Convenience accessor for the authoritative native-script text."""
        return self.transcription.transcription


# === Prompt v3 ===
# Master template: one prompt for all 12 languages, with dynamic language pack.
# Evolution: v1 (original) -> v2 (strict, fixed segment 3 translation) -> v3 (below)
# v3 changes: translate/transliterate fix, field derivation ordering, [UNK] tokens,
#   numbers-as-words, acoustic punctuation emphasis, language-specific script rules,
#   "not assistant" negation, negative constraints, additionalProperties in schema.

def get_transcription_prompt(language: str) -> str:
    """Build the Gemini system instruction for verbatim transcription.

    Uses one master template for all languages with a dynamic language pack
    (script name, BCP-47 tag, script-specific rules) injected per language.
    Key design: the "transcription" field is authoritative; all other output
    fields are derived from it.

    Args:
        language: Language name, e.g. "Hindi". Matched against
            LANGUAGE_CONFIGS case-insensitively; unknown languages fall back
            to a generic pack (no BCP-47 tag, no script rules).

    Returns:
        The complete system-instruction string.
    """
    # Language pack lookup: exact match first, then case-insensitive so that
    # "hindi"/"ENGLISH" resolve to the same pack as "Hindi"/"English". (The
    # English special-case below already compares case-insensitively; the
    # lookup previously did not, so lowercase input silently lost the pack.)
    lang_config = LANGUAGE_CONFIGS.get(language)
    if lang_config is None:
        lowered = language.lower()
        lang_config = next(
            (cfg for name, cfg in LANGUAGE_CONFIGS.items()
             if name.lower() == lowered),
            {},  # unknown language -> generic fallbacks below
        )
    script_name = lang_config.get("script", f"{language} native")
    bcp47 = lang_config.get("bcp47", "")
    script_rules = lang_config.get("script_rules", "")

    # Build optional script rules section
    script_section = ""
    if script_rules:
        script_section = (
            f"\nSCRIPT RULES FOR {language.upper()}:\n{script_rules}\n"
        )

    # English has different field logic (no transliteration needed)
    if language.lower() == "english":
        native_field_rule = (
            "Write in standard English spelling."
        )
        code_switch_rule = (
            "Same as transcription. If the speaker uses words from another "
            "language, write them in Latin script as heard."
        )
    else:
        # The transliterate vs translate distinction is critical here.
        # v2 prompt had a contradiction: "don't convert English to {language}"
        # + "write English phonetically in {language} script". The word "convert"
        # was ambiguous and caused segment 3 to be translated instead of transcribed.
        native_field_rule = (
            f"Write ONLY in {script_name} script. No Latin characters in this field.\n"
            f"   When the speaker uses English words, TRANSLITERATE them phonetically "
            f"into {script_name} (sound mapping, NOT meaning translation).\n"
            f"   Example: spoken 'computer' → phonetic form in {script_name}, "
            f"NOT the {language} word for computer."
        )
        code_switch_rule = (
            f"Preserve language switching exactly as spoken.\n"
            f"   {language} words stay in {script_name} script. "
            f"English words stay in Latin script.\n"
            f"   Do NOT translate in either direction."
        )

    # TARGET line shows the BCP-47 tag when the language pack provides one.
    lang_label = f"{language} ({bcp47})" if bcp47 else language

    return f"""You are a verbatim speech-to-text transcription system. You are NOT a conversational assistant. Your output must precisely match the audio content.

TARGET: {lang_label}

CRITICAL RULES (HIGHEST PRIORITY - violations cause rejection):
1. NEVER TRANSLATE. This is transcription (writing what is spoken), not translation (converting meaning). If the speaker says English words, those are English. If the speaker says {language} words, those are {language}.
2. VERBATIM FIDELITY: Transcribe every repetition, filler, stammer, false start, hesitation exactly as spoken. Do not clean up speech.
3. NO CORRECTION: Do not fix grammar, pronunciation, dialect, accent, or word choice. Write what was said, not what should have been said.
4. NO HALLUCINATION: Never add words, phrases, or pleasantries not present in the audio. If audio cuts off mid-sentence, STOP where the audio stops. Do not complete words or sentences.
5. NUMBERS: Write all numbers as spoken words in the appropriate script, never as digits.
6. UNCERTAINTY: If a word is unclear but speech is present, write [UNK] in its place. Reserve [INAUDIBLE] only when most or all speech in the segment cannot be understood.
7. BOUNDARY HANDLING: Audio segments are VAD-cut and may start/end mid-speech. Only transcribe clearly audible content. Do not guess at partial words at segment boundaries.

PUNCTUATION (acoustic prosody, not grammar):
- Allowed: comma, period, question mark, exclamation mark. No other punctuation.
- Insert based on AUDIBLE PROSODY only: comma for a distinct short pause, period for a full-stop pause with pitch drop, ? for clear question intonation, ! for strong emphasis.
- If the speaker runs sentences together without pausing, do NOT insert punctuation.
- Do NOT add closing punctuation if the audio ends without a clear prosodic stop.
{script_section}
FIELD DERIVATION ORDER:
The "transcription" field is the PRIMARY authoritative output. All other text fields must contain the SAME words in the SAME order - they differ ONLY in script representation and optional tags. Do not independently re-listen or re-interpret the audio for each field.

OUTPUT FIELDS:

1. transcription (AUTHORITATIVE - native script)
   {native_field_rule}
   Punctuation: period, comma, ? and ! only, based on audible prosodic cues.

2. code_switch (derived from transcription - mixed script)
   {code_switch_rule}

3. romanized (derived from transcription - Latin script)
   Full Roman/Latin transliteration of the entire audio as pronounced.
   Preserve pronunciation faithfully even if spelling looks non-standard.

4. tagged (derived from code_switch - mixed script + event tags)
   Same text as code_switch with audio event tags at positions where they occur.
   ONLY these tags, ONLY if clearly and prominently audible (never hallucinate):
   [laugh] [cough] [sigh] [breath] [singing] [noise] [music] [applause]

5. speaker (metadata from audio prosody)
   emotion: neutral | happy | sad | angry | excited | surprised
   speaking_style: conversational | narrative | excited | calm | emphatic | sarcastic | formal
   pace: slow | normal | fast
   accent: regional dialect/accent if confidently detectable, empty string otherwise. Use defaults if unsure."""


def get_user_prompt() -> str:
    """Return the fixed user-turn prompt sent alongside the audio payload."""
    sentences = [
        "Transcribe this audio segment following the system instructions.",
        "Return a valid JSON object with all required fields.",
    ]
    return " ".join(sentences)


# === JSON Schema for Gemini structured output ===
# additionalProperties: False prevents model from inventing extra JSON keys.
# Gemini structured output enforces schema shape but NOT semantic correctness -
# downstream CTC validation still required.

# Speaker metadata sub-schema (mirrors the SpeakerMeta model). "accent" is
# deliberately absent from "required" so the model may omit it.
_SPEAKER_SCHEMA = {
    "type": "object",
    "description": "Speaker metadata",
    "properties": {
        "emotion": {
            "type": "string",
            "description": "neutral, happy, sad, angry, excited, surprised",
        },
        "speaking_style": {
            "type": "string",
            "description": "conversational, narrative, excited, calm, emphatic, sarcastic, formal",
        },
        "pace": {"type": "string", "description": "slow, normal, fast"},
        "accent": {
            "type": "string",
            "description": "Regional accent/dialect or empty string",
        },
    },
    "required": ["emotion", "speaking_style", "pace"],
    "additionalProperties": False,
}

# Top-level structured-output schema handed to Gemini. All four text fields
# plus "speaker" are required; additionalProperties=False stops the model
# from inventing extra JSON keys.
TRANSCRIPTION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "transcription": {
            "type": "string",
            "description": "Native script transcription with minimal punctuation",
        },
        "code_switch": {
            "type": "string",
            "description": "Mixed script preserving language switching",
        },
        "romanized": {
            "type": "string",
            "description": "Complete Roman script transliteration",
        },
        "tagged": {
            "type": "string",
            "description": "Code-switch with audio event tags",
        },
        "speaker": _SPEAKER_SCHEMA,
    },
    "required": ["transcription", "code_switch", "romanized", "tagged", "speaker"],
    "additionalProperties": False,
}
