"""
Pydantic schemas for structured transcription output.

Architecture change (v4): Gemini outputs fewer fields, code derives the rest.
  Gemini outputs:
    1. transcription   - Native script with punctuation (PRIMARY, authoritative)
    2. tagged          - Code-mixed + audio event tags [laugh] etc.
    3. speaker         - Metadata: emotion, style, pace, accent
    4. detected_language

  Code derives (deterministic, not Gemini):
    5. romanized       - uroman-based Latin transliteration (from transcription)
    6. code_switch     - dropped (tagged subsumes it)

Prompt v4 design (evolved from v1 -> v2 strict -> v3 field derivation -> v4 simplified):
  - Reduced Gemini output from 4 text fields to 2 (transcription + tagged)
  - Less cognitive load = better adherence, more deterministic output
  - romanized derived deterministically via uroman = stable, reproducible MMS alignment
"""
from typing import Optional, Dict
from pydantic import BaseModel, Field


# Per-language prompt configuration, keyed by English language name.
# Consumed by get_transcription_prompt(); keys per entry:
#   bcp47        - BCP-47 language tag, rendered into the prompt's TARGET line.
#   script       - human-readable script name substituted into the field rules.
#   script_rules - optional extra guidance; when non-empty it is rendered into
#                  a "SCRIPT RULES FOR <LANGUAGE>" section of the prompt
#                  (empty string = no such section for that language).
LANGUAGE_CONFIGS: Dict[str, Dict[str, str]] = {
    "Hindi": {
        "bcp47": "hi-IN",
        "script": "Devanagari",
        "script_rules": "Preserve Nukta when clearly pronounced.",
    },
    "Marathi": {
        "bcp47": "mr-IN",
        "script": "Devanagari",
        "script_rules": "Preserve retroflex lateral accurately.",
    },
    "Telugu": {
        "bcp47": "te-IN",
        "script": "Telugu",
        "script_rules": "Don't over-split words. Preserve Sandhi/combined forms as spoken.",
    },
    "Tamil": {
        "bcp47": "ta-IN",
        "script": "Tamil",
        "script_rules": "Distinguish short and long vowels accurately.",
    },
    "Kannada": {
        "bcp47": "kn-IN",
        "script": "Kannada",
        "script_rules": "Preserve agglutinated/combined forms as spoken.",
    },
    "Malayalam": {
        "bcp47": "ml-IN",
        "script": "Malayalam",
        "script_rules": "Don't split agglutinated words. Preserve chillu letters.",
    },
    "Gujarati": {
        "bcp47": "gu-IN",
        "script": "Gujarati",
        "script_rules": "",
    },
    "Punjabi": {
        "bcp47": "pa-IN",
        "script": "Gurmukhi",
        "script_rules": "",
    },
    "Bengali": {
        "bcp47": "bn-IN",
        "script": "Bengali",
        "script_rules": "Preserve Chandrabindu for nasalization where spoken.",
    },
    # Assamese shares Unicode ranges with Bengali; the rule below guards
    # against the model emitting the Bengali letter forms.
    "Assamese": {
        "bcp47": "as-IN",
        "script": "Assamese",
        "script_rules": "Use Assamese-specific characters, NOT Bengali equivalents.",
    },
    "Odia": {
        "bcp47": "or-IN",
        "script": "Odia",
        "script_rules": "",
    },
    "English": {
        "bcp47": "en-IN",
        "script": "Latin",
        "script_rules": "Standard English spelling. Don't phonetically approximate accents.",
    },
}


class SpeakerMeta(BaseModel):
    """Speaker metadata for TTS training.

    All fields are free-form strings here; the vocabularies in each field's
    description are suggestions, not enforced enums. TRANSCRIPTION_JSON_SCHEMA
    declares matching ``enum`` lists for the request-side schema.
    """
    # Perceived emotion of the speaker, inferred from audio prosody.
    emotion: str = Field(
        default="neutral",
        description="neutral, happy, sad, angry, excited, surprised"
    )
    # Delivery style of the speech.
    speaking_style: str = Field(
        default="conversational",
        description="conversational, narrative, excited, calm, emphatic, sarcastic, formal"
    )
    # Speaking rate, coarse three-way bucket.
    pace: str = Field(
        default="normal",
        description="slow, normal, fast"
    )
    # Regional accent/dialect; empty string (not None) when undetectable.
    accent: str = Field(
        default="",
        description="Regional accent/dialect if detectable, empty string if unknown"
    )


class TranscriptionOutput(BaseModel):
    """Structured output: 2 text fields from Gemini + derived fields.

    Gemini fills ``transcription``, ``tagged``, ``speaker`` and
    ``detected_language``; ``code_switch`` and ``romanized`` default to ""
    and (per their descriptions) are populated by downstream post-processing
    code, not by the model.
    """

    # PRIMARY authoritative field — required (no default), always from Gemini.
    transcription: str = Field(
        description="Native script transcription with minimal punctuation. "
                    "English words transliterated phonetically in native script."
    )

    # Kept for backward compatibility; v4 derives it from `tagged` in code.
    code_switch: str = Field(
        default="",
        description="Mixed script: native + English in Latin. "
                    "v4: derived from tagged by stripping event tags, or empty."
    )

    # Derived deterministically (uroman) from `transcription`, not model output.
    romanized: str = Field(
        default="",
        description="Full Roman/Latin script transliteration. "
                    "v4: derived via uroman from transcription, not Gemini output."
    )

    # Second required Gemini field: code-mixed text plus audio event tags.
    tagged: str = Field(
        description="Code-mixed transcription with audio event tags. "
                    "Native script for primary language, Latin for English words, "
                    "plus [laugh] [cough] [sigh] etc. at positions where they occur."
    )

    # Optional: None when the model omitted speaker metadata entirely.
    speaker: Optional[SpeakerMeta] = Field(
        default=None,
        description="Speaker metadata: emotion, style, pace, accent."
    )

    # May differ from the requested language (see prompt rule 7).
    detected_language: str = Field(
        default="",
        description="The language actually spoken in the audio."
    )


class TranscriptionResult(BaseModel):
    """Complete result for a transcribed segment including metadata.

    Wraps a TranscriptionOutput together with segment provenance
    (id/chunking/duration), the model settings used, and optional
    post-hoc validation results.
    """

    # --- segment provenance ---
    segment_id: str = Field(description="Identifier for the audio segment")
    chunk_index: int = Field(default=0, description="Chunk index if segment was split")
    total_chunks: int = Field(default=1, description="Total chunks for this segment")
    duration_sec: float = Field(description="Duration of audio in seconds")
    language: str = Field(description="Primary language of the audio")

    # --- payload ---
    transcription: TranscriptionOutput = Field(description="The transcription outputs")

    # --- model/run metadata ---
    model_used: str = Field(description="Gemini model used for transcription")
    thinking_level: Optional[str] = Field(default=None, description="Thinking level used")
    processing_time_sec: Optional[float] = Field(default=None, description="API call time")

    # --- validation (None until a validation pass has run) ---
    validation_status: Optional[str] = Field(default=None, description="accept/review/reject")
    validation_score: Optional[float] = Field(default=None, description="Alignment score 0-1")

    @property
    def native(self) -> str:
        """Shortcut to primary native transcription."""
        return self.transcription.transcription


def get_transcription_prompt(language: str) -> str:
    """Build the Gemini system instruction for verbatim transcription.

    Looks up per-language script/BCP-47 settings in LANGUAGE_CONFIGS
    (falling back to generic placeholders for unknown languages) and
    renders them into the v4 prompt template.

    v4: 2 text fields (transcription + tagged) instead of 4.
    """
    cfg = LANGUAGE_CONFIGS.get(language, {})
    script = cfg.get("script", f"{language} native")
    bcp47 = cfg.get("bcp47", "")
    rules = cfg.get("script_rules", "")

    # Optional per-language script guidance; empty string drops the section.
    script_section = (
        f"\nSCRIPT RULES FOR {language.upper()}:\n{rules}\n" if rules else ""
    )

    if language.lower() == "english":
        # English target: native script IS Latin, so the rules collapse.
        native_field_rule = "Write in standard English spelling."
        tagged_rule = "Same as transcription with audio event tags at their positions."
    else:
        native_field_rule = "\n".join([
            f"Write ONLY in {script} script. No Latin characters in this field.",
            f"   When the speaker uses English words, TRANSLITERATE them phonetically into {script} (sound mapping, NOT meaning translation).",
            f"   Example: spoken 'computer' -> phonetic form in {script}, NOT the {language} word for computer.",
        ])
        tagged_rule = "\n".join([
            "Preserve language switching exactly as spoken.",
            f"   {language} words stay in {script} script. English words stay in Latin script.",
            "   Do NOT translate in either direction.",
            "   Insert audio event tags at positions where they occur.",
        ])

    label = f"{language} ({bcp47})" if bcp47 else language

    return f"""You are a verbatim speech-to-text transcription system. You are NOT a conversational assistant. Your output must precisely match the audio content.

TARGET: {label}

CRITICAL RULES (violations cause rejection):
1. NEVER TRANSLATE. This is transcription, not translation. If the speaker says English words, those are English. If the speaker says {language} words, those are {language}. Write what you HEAR, not what you think it means in another language.
2. VERBATIM FIDELITY: Every repetition, filler, stammer, false start, hesitation - exactly as spoken.
3. NO CORRECTION: Do not fix grammar, pronunciation, dialect, or word choice.
4. NO HALLUCINATION: Never add words or phrases not in the audio. If audio cuts off mid-sentence, STOP where the audio stops. Do not complete anything. Output ONLY the JSON.
5. UNCERTAINTY: If a word is unclear, write [UNK]. Use [INAUDIBLE] for unintelligible speech. Use [NO_SPEECH] for no speech (silence, noise, music only).
6. BOUNDARY HANDLING: Audio is VAD-cut and may start/end mid-speech. Transcribe everything you can confidently hear. Only omit what is truly inaudible.
7. LANGUAGE MISMATCH: Trust what you hear. If audio is clearly different from {language}, transcribe in that language's script and set detected_language accordingly.

PUNCTUATION (prosody-based, not grammar):
- Only: comma, period, ? and !
- Insert from audible pauses/intonation only. No pause = no punctuation.
{script_section}
FIELD DERIVATION:
"transcription" is the PRIMARY authoritative output in native script.
"tagged" is the same content with language switching preserved (native script for {language}, Latin for English) and audio event markers at positions. Derive from the same hearing, not independently.

OUTPUT FIELDS:

1. transcription (AUTHORITATIVE - native script)
   {native_field_rule}
   Punctuation: period, comma, ? and ! only, from audible prosodic cues.

2. tagged (derived from transcription - code-mixed + event tags)
   {tagged_rule}
   ONLY these tags, ONLY if clearly and prominently audible:
   [laugh] [cough] [sigh] [breath] [singing] [noise] [music] [applause]

3. speaker (metadata from audio prosody)
   emotion: neutral | happy | sad | angry | excited | surprised
   speaking_style: conversational | narrative | excited | calm | emphatic | sarcastic | formal
   pace: slow | normal | fast
   accent: regional dialect/accent if confidently detectable, empty string otherwise.

4. detected_language
   The language you actually hear spoken. If code-mixed, write the dominant language."""


def get_user_prompt() -> str:
    """Return the fixed user-turn prompt that accompanies the audio clip."""
    prompt = (
        "Transcribe this audio segment following the system instructions. "
        "Return a valid JSON object with all required fields."
    )
    return prompt


# v4 JSON Schema: only 4 fields required from Gemini (down from 6)
# Request-side schema mirroring TranscriptionOutput's Gemini-supplied fields
# (transcription, tagged, speaker, detected_language). The derived fields
# (romanized, code_switch) are intentionally absent here — they are filled in
# by code, not by the model. Unlike the Pydantic model, speaker sub-fields are
# constrained with closed enums at the API level.
TRANSCRIPTION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "transcription": {
            "type": "string",
            "description": "Native script transcription with minimal punctuation"
        },
        "tagged": {
            "type": "string",
            "description": "Code-mixed transcription with audio event tags"
        },
        "speaker": {
            "type": "object",
            "description": "Speaker metadata",
            "properties": {
                "emotion": {
                    "type": "string",
                    "enum": [
                        "neutral", "happy", "sad",
                        "angry", "excited", "surprised"
                    ]
                },
                "speaking_style": {
                    "type": "string",
                    "enum": [
                        "conversational", "narrative", "excited",
                        "calm", "emphatic", "sarcastic", "formal"
                    ]
                },
                "pace": {
                    "type": "string",
                    "enum": ["slow", "normal", "fast"]
                },
                "accent": {
                    "type": "string",
                    "description": "Regional accent/dialect or empty string"
                }
            },
            # accent is deliberately optional — the model may omit it.
            "required": ["emotion", "speaking_style", "pace"],
            "additionalProperties": False
        },
        "detected_language": {
            "type": "string",
            "description": "Language actually spoken in the audio"
        }
    },
    # Strict: all four top-level fields must be present, nothing extra allowed.
    "required": ["transcription", "tagged", "speaker", "detected_language"],
    "additionalProperties": False
}
