"""
Pydantic schemas for structured transcription output.

Output fields (what Gemini returns per segment):
  1. transcription   - Native script with punctuation (PRIMARY, authoritative)
  2. code_switch     - Mixed script: native + English in Latin (derived from 1)
  3. romanized       - Full Latin transliteration (derived from 1)
  4. tagged          - Code-switch + audio event tags [laugh] etc. (derived from 2)
  5. speaker         - Metadata: emotion, style, pace, accent

Prompt v3 design (evolved from v1 original -> v2 strict -> v3 current):
  - Single master template, dynamic language pack injection for 12 languages
  - "Verbatim transcription system" persona (explicit NOT assistant negation)
  - Acoustic punctuation (prosody-based pauses, not grammar rules)
  - Anti-hallucination: [UNK] for uncertain words, no sentence completion
  - Field derivation ordering: transcription is authoritative, others derive from it
  - Transliterate != Translate distinction (fixed v2 contradiction that caused
    English speech to be translated into Telugu on segment 3 test)
  - Per-language script tips (1-2 practical tips, not grammar rules)
"""
from typing import Optional, Dict
from pydantic import BaseModel, Field


# === Language Pack Configs ===
# Per-language metadata + script-specific rules injected into the master prompt.
# Only rules that prevent ACTUAL transcription errors, not generic grammar.
# v1: 12 languages (11 Indic + English). Add more as needed.

LANGUAGE_CONFIGS: Dict[str, Dict[str, str]] = {
    # Each entry provides:
    #   bcp47        - BCP-47 language tag, shown on the prompt's TARGET line
    #   script       - script name injected into per-field prompt instructions
    #   script_rules - optional practical tips; an empty string means no
    #                  "SCRIPT RULES" section is added to the prompt
    #                  (see get_transcription_prompt).
    # 1-2 practical script tips per language. Not grammar rules.
    "Hindi": {
        "bcp47": "hi-IN",
        "script": "Devanagari",
        "script_rules": "Preserve Nukta (ज़ vs ज, फ़ vs फ) when clearly pronounced.",
    },
    "Marathi": {
        "bcp47": "mr-IN",
        "script": "Devanagari",
        "script_rules": "Preserve ळ (retroflex lateral) accurately.",
    },
    "Telugu": {
        "bcp47": "te-IN",
        "script": "Telugu",
        "script_rules": "Don't over-split words. Preserve Sandhi/combined forms as spoken.",
    },
    "Tamil": {
        "bcp47": "ta-IN",
        "script": "Tamil",
        "script_rules": "Distinguish short and long vowels accurately.",
    },
    "Kannada": {
        "bcp47": "kn-IN",
        "script": "Kannada",
        "script_rules": "Preserve agglutinated/combined forms as spoken.",
    },
    "Malayalam": {
        "bcp47": "ml-IN",
        "script": "Malayalam",
        "script_rules": "Don't split agglutinated words. Preserve chillu letters.",
    },
    "Gujarati": {
        "bcp47": "gu-IN",
        "script": "Gujarati",
        "script_rules": "",
    },
    "Punjabi": {
        "bcp47": "pa-IN",
        "script": "Gurmukhi",
        "script_rules": "",
    },
    "Bengali": {
        "bcp47": "bn-IN",
        "script": "Bengali",
        "script_rules": "Preserve Chandrabindu for nasalization where spoken.",
    },
    "Assamese": {
        "bcp47": "as-IN",
        "script": "Assamese",
        # Critical: these chars look like Bengali but are Assamese-specific.
        "script_rules": "Use Assamese-specific ৰ (ra) and ৱ (wa), NOT Bengali equivalents.",
    },
    "Odia": {
        "bcp47": "or-IN",
        "script": "Odia",
        "script_rules": "",
    },
    "English": {
        "bcp47": "en-IN",
        "script": "Latin",
        "script_rules": "Standard English spelling. Don't phonetically approximate accents.",
    },
}


class SpeakerMeta(BaseModel):
    """Speaker metadata for TTS training. Only what matters."""
    # The allowed values listed in each description are also locked by enums
    # in TRANSCRIPTION_JSON_SCHEMA, so Gemini cannot return values outside
    # these sets at the API level.
    # Perceived emotion of the speaker.
    emotion: str = Field(
        default="neutral",
        description="neutral, happy, sad, angry, excited, surprised"
    )
    # Delivery style of the speech.
    speaking_style: str = Field(
        default="conversational",
        description="conversational, narrative, excited, calm, emphatic, sarcastic, formal"
    )
    # Speaking rate.
    pace: str = Field(
        default="normal",
        description="slow, normal, fast"
    )
    # Free-form string (not enum-locked): regional accent when detectable.
    accent: str = Field(
        default="",
        description="Regional accent/dialect if detectable, empty string if unknown"
    )


class TranscriptionOutput(BaseModel):
    """Structured output: 4 transcription formats + speaker metadata.

    Field derivation ordering (see module docstring and the FIELD DERIVATION
    section of the prompt): ``transcription`` is authoritative; code_switch,
    romanized and tagged must contain the same words, differing only in
    script / inserted event tags.
    """

    # PRIMARY, authoritative output — all other text fields derive from it.
    transcription: str = Field(
        description="Native script transcription with minimal punctuation. "
                    "English words transliterated phonetically in native script."
    )

    # Derived from transcription: native script + English kept in Latin.
    code_switch: str = Field(
        description="Mixed script: native language in native script, "
                    "English words in Latin script. Minimal punctuation."
    )

    # Derived from transcription: everything in Latin script.
    romanized: str = Field(
        description="Full Roman/Latin script transliteration as pronounced."
    )

    # Derived from code_switch: same text plus inline audio event tags.
    tagged: str = Field(
        description="Code-switch transcription with audio event tags "
                    "([laugh], [cough], [sigh], [breath], [singing], "
                    "[noise], [music], [applause]) at positions where they occur."
    )

    # Optional prosody-based metadata; None when the model omits it.
    speaker: Optional[SpeakerMeta] = Field(
        default=None,
        description="Speaker metadata: emotion, style, pace, accent."
    )


class TranscriptionResult(BaseModel):
    """Complete result for a transcribed segment including metadata."""

    # Segment identity and chunking (a long segment may be split into chunks).
    segment_id: str = Field(description="Identifier for the audio segment")
    chunk_index: int = Field(default=0, description="Chunk index if segment was split")
    total_chunks: int = Field(default=1, description="Total chunks for this segment")
    duration_sec: float = Field(description="Duration of audio in seconds")
    language: str = Field(description="Primary language of the audio")

    # The structured Gemini output for this segment/chunk.
    transcription: TranscriptionOutput = Field(description="The transcription outputs")

    # Provenance / timing of the API call.
    model_used: str = Field(description="Gemini model used for transcription")
    thinking_level: Optional[str] = Field(default=None, description="Thinking level used")
    processing_time_sec: Optional[float] = Field(default=None, description="API call time")

    # Validation fields (filled by CTC validator, not Gemini)
    validation_status: Optional[str] = Field(default=None, description="accept/review/reject")
    validation_score: Optional[float] = Field(default=None, description="Alignment score 0-1")

    @property
    def native(self) -> str:
        """Shortcut to primary native transcription (the authoritative field)."""
        return self.transcription.transcription


# === Prompt v3 ===
# Master template: one prompt for all 12 languages, with dynamic language pack.
# Evolution: v1 (original) -> v2 (strict, fixed segment 3 translation) -> v3 (below)
# v3 changes: translate/transliterate fix, field derivation ordering, [UNK] tokens,
#   prosody-based punctuation, simplified language script tips, no-preamble constraint,
#   "not assistant" negation, additionalProperties in schema.

def get_transcription_prompt(language: str) -> str:
    """
    Build the system instruction for Gemini transcription.

    Uses the master template with dynamic language pack injection.
    Key design: the "transcription" field is authoritative, all others
    derive from it.

    Args:
        language: Primary language name (e.g. "Hindi"). Matched against
            LANGUAGE_CONFIGS case-insensitively; an unknown language falls
            back to a generic pack (no bcp47, no script rules).

    Returns:
        Complete system prompt string for the target language.
    """
    # Case-insensitive language pack lookup (fallback to generic if unknown).
    # Fix: the dict lookup used to be case-sensitive while the English
    # special-case below used .lower(), so e.g. "telugu" silently got the
    # generic fallback pack. Resolve to the canonical key first.
    canonical = next(
        (key for key in LANGUAGE_CONFIGS if key.lower() == language.lower()),
        language,
    )
    language = canonical  # canonical casing for all prompt text below
    lang_config = LANGUAGE_CONFIGS.get(canonical, {})
    script_name = lang_config.get("script", f"{language} native")
    bcp47 = lang_config.get("bcp47", "")
    script_rules = lang_config.get("script_rules", "")

    # Build optional script rules section (omitted when the pack has none)
    script_section = ""
    if script_rules:
        script_section = (
            f"\nSCRIPT RULES FOR {language.upper()}:\n{script_rules}\n"
        )

    # English has different field logic (no transliteration needed)
    if language.lower() == "english":
        native_field_rule = (
            "Write in standard English spelling."
        )
        code_switch_rule = (
            "Same as transcription. If the speaker uses words from another "
            "language, write them in Latin script as heard."
        )
    else:
        # The transliterate vs translate distinction is critical here.
        # v2 prompt had a contradiction: "don't convert English to {language}"
        # + "write English phonetically in {language} script". The word "convert"
        # was ambiguous and caused segment 3 to be translated instead of transcribed.
        native_field_rule = (
            f"Write ONLY in {script_name} script. No Latin characters in this field.\n"
            f"   When the speaker uses English words, TRANSLITERATE them phonetically "
            f"into {script_name} (sound mapping, NOT meaning translation).\n"
            f"   Example: spoken 'computer' → phonetic form in {script_name}, "
            f"NOT the {language} word for computer."
        )
        code_switch_rule = (
            f"Preserve language switching exactly as spoken.\n"
            f"   {language} words stay in {script_name} script. "
            f"English words stay in Latin script.\n"
            f"   Do NOT translate in either direction."
        )

    lang_label = f"{language} ({bcp47})" if bcp47 else language

    return f"""You are a verbatim speech-to-text transcription system. You are NOT a conversational assistant. Your output must precisely match the audio content.

TARGET: {lang_label}

CRITICAL RULES (violations cause rejection):
1. NEVER TRANSLATE. This is transcription, not translation. If the speaker says English words, those are English. If the speaker says {language} words, those are {language}.
2. VERBATIM FIDELITY: Every repetition, filler, stammer, false start, hesitation - exactly as spoken.
3. NO CORRECTION: Do not fix grammar, pronunciation, dialect, or word choice.
4. NO HALLUCINATION: Never add words or phrases not in the audio. If audio cuts off mid-sentence, STOP where the audio stops. Do not complete anything. Do not add conversational preamble like "Sure, here is..." - output ONLY the JSON.
5. UNCERTAINTY: If a word is unclear, write [UNK]. Use [INAUDIBLE] when speech exists but is unintelligible. Use [NO_SPEECH] when the segment contains no speech at all (silence, noise, music only).
6. BOUNDARY HANDLING: Audio is VAD-cut and may start/end mid-speech. Transcribe everything you can confidently hear, including speech at boundaries. Only omit what is truly inaudible.

PUNCTUATION (prosody-based, not grammar):
- Only: comma, period, ? and !
- Insert from audible pauses/intonation only. No pause = no punctuation.
{script_section}
FIELD DERIVATION:
"transcription" is the PRIMARY output. code_switch and romanized must contain the same words in the same order, differing only in script. "tagged" is code_switch with audio event markers inserted at their positions. Do not re-interpret the audio independently for each field.

OUTPUT FIELDS:

1. transcription (AUTHORITATIVE - native script)
   {native_field_rule}
   Punctuation: period, comma, ? and ! only, based on audible prosodic cues.

2. code_switch (derived from transcription - mixed script)
   {code_switch_rule}

3. romanized (derived from transcription - Latin script)
   Full Roman/Latin transliteration of the entire audio as pronounced.
   Preserve pronunciation faithfully even if spelling looks non-standard.

4. tagged (derived from code_switch - mixed script + event tags)
   Same text as code_switch with audio event tags at positions where they occur.
   ONLY these tags, ONLY if clearly and prominently audible (never hallucinate):
   [laugh] [cough] [sigh] [breath] [singing] [noise] [music] [applause]

5. speaker (metadata from audio prosody)
   emotion: neutral | happy | sad | angry | excited | surprised
   speaking_style: conversational | narrative | excited | calm | emphatic | sarcastic | formal
   pace: slow | normal | fast
   accent: regional dialect/accent if confidently detectable, empty string otherwise. Use defaults if unsure."""


def get_user_prompt() -> str:
    """Build the per-request user message that accompanies the audio bytes."""
    instruction = (
        "Transcribe this audio segment following the system instructions. "
        "Return a valid JSON object with all required fields."
    )
    return instruction


# === JSON Schema for Gemini structured output ===
# Uses response_json_schema (JSON Schema path), NOT response_schema (OpenAPI path).
# additionalProperties: False is supported on the JSON Schema path.
# Enums mechanically lock speaker values at the API level (not just prompt-level).
# Gemini structured output enforces schema shape but NOT semantic correctness -
# downstream CTC validation still required.

# Speaker sub-schema, composed into the top-level schema below.
# Closed-set fields (emotion/style/pace) are enum-locked; accent stays free-form.
_SPEAKER_SCHEMA = {
    "type": "object",
    "description": "Speaker metadata",
    "properties": {
        "emotion": {
            "type": "string",
            "enum": [
                "neutral", "happy", "sad",
                "angry", "excited", "surprised",
            ],
        },
        "speaking_style": {
            "type": "string",
            "enum": [
                "conversational", "narrative", "excited",
                "calm", "emphatic", "sarcastic", "formal",
            ],
        },
        "pace": {
            "type": "string",
            "enum": ["slow", "normal", "fast"],
        },
        "accent": {
            "type": "string",
            "description": "Regional accent/dialect or empty string",
        },
    },
    "required": ["emotion", "speaking_style", "pace"],
    "additionalProperties": False,
}

# Top-level schema: four string transcription fields + speaker metadata.
TRANSCRIPTION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "transcription": {
            "type": "string",
            "description": "Native script transcription with minimal punctuation",
        },
        "code_switch": {
            "type": "string",
            "description": "Mixed script preserving language switching",
        },
        "romanized": {
            "type": "string",
            "description": "Complete Roman script transliteration",
        },
        "tagged": {
            "type": "string",
            "description": "Code-switch with audio event tags",
        },
        "speaker": _SPEAKER_SCHEMA,
    },
    "required": ["transcription", "code_switch", "romanized", "tagged", "speaker"],
    "additionalProperties": False,
}
