"""
Pydantic schemas for structured transcription output.

Architecture change (v4): Gemini outputs fewer fields, code derives the rest.
  Gemini outputs:
    1. transcription   - Code-mixed, each language in its own script, with punctuation (PRIMARY, authoritative)
    2. tagged          - Code-mixed + audio event tags [laugh] etc.
    3. speaker         - Metadata: emotion, style, pace, accent
    4. detected_language

  Code derives (deterministic, not Gemini):
    5. romanized       - uroman-based Latin transliteration (from transcription)
    6. code_switch     - dropped (tagged subsumes it)

Prompt v4 design (evolved from v1 -> v2 strict -> v3 field derivation -> v4 simplified):
  - Reduced Gemini output from 4 text fields to 2 (transcription + tagged)
  - Less cognitive load = better adherence, more deterministic output
  - romanized derived deterministically via uroman = stable, reproducible MMS alignment
"""
from typing import Optional, Dict
from pydantic import BaseModel, Field


# Per-language configuration consumed by get_transcription_prompt():
#   bcp47        - BCP-47 language tag shown on the prompt's TARGET line
#   script       - Script name substituted into the native-script field rules
#   script_rules - Optional language-specific transcription guidance; an empty
#                  string means no SCRIPT RULES section is added to the prompt.
# Keys are canonical (title-case) language names.
LANGUAGE_CONFIGS: Dict[str, Dict[str, str]] = {
    "Hindi": {
        "bcp47": "hi-IN",
        "script": "Devanagari",
        "script_rules": "Preserve Nukta when clearly pronounced.",
    },
    "Marathi": {
        "bcp47": "mr-IN",
        "script": "Devanagari",
        "script_rules": "Preserve retroflex lateral accurately.",
    },
    "Telugu": {
        "bcp47": "te-IN",
        "script": "Telugu",
        "script_rules": "Don't over-split words. Preserve Sandhi/combined forms as spoken.",
    },
    "Tamil": {
        "bcp47": "ta-IN",
        "script": "Tamil",
        "script_rules": "Distinguish short and long vowels accurately.",
    },
    "Kannada": {
        "bcp47": "kn-IN",
        "script": "Kannada",
        "script_rules": "Preserve agglutinated/combined forms as spoken.",
    },
    "Malayalam": {
        "bcp47": "ml-IN",
        "script": "Malayalam",
        "script_rules": "Don't split agglutinated words. Preserve chillu letters.",
    },
    "Gujarati": {
        "bcp47": "gu-IN",
        "script": "Gujarati",
        "script_rules": "",
    },
    "Punjabi": {
        "bcp47": "pa-IN",
        "script": "Gurmukhi",
        "script_rules": "",
    },
    "Bengali": {
        "bcp47": "bn-IN",
        "script": "Bengali",
        "script_rules": "Preserve Chandrabindu for nasalization where spoken.",
    },
    "Assamese": {
        "bcp47": "as-IN",
        "script": "Assamese",
        "script_rules": "Use Assamese-specific characters, NOT Bengali equivalents.",
    },
    "Odia": {
        "bcp47": "or-IN",
        "script": "Odia",
        "script_rules": "",
    },
    "English": {
        "bcp47": "en-IN",
        "script": "Latin",
        "script_rules": "Standard English spelling. Don't phonetically approximate accents.",
    },
}


class SpeakerMeta(BaseModel):
    """Speaker metadata for TTS training.

    All values are plain strings; the closed vocabularies listed in each
    field description are enforced by the enums in TRANSCRIPTION_JSON_SCHEMA
    (on the Gemini side), not by this model.
    """
    # Perceived emotion of the speaker.
    emotion: str = Field(
        default="neutral",
        description="neutral, happy, sad, angry, excited, surprised"
    )
    # Overall delivery style inferred from prosody.
    speaking_style: str = Field(
        default="conversational",
        description="conversational, narrative, excited, calm, emphatic, sarcastic, formal"
    )
    # Speaking rate.
    pace: str = Field(
        default="normal",
        description="slow, normal, fast"
    )
    # Free-form; empty string (not None) when no accent is detectable.
    accent: str = Field(
        default="",
        description="Regional accent/dialect if detectable, empty string if unknown"
    )


class TranscriptionOutput(BaseModel):
    """Structured output: 2 text fields from Gemini + derived fields.

    Gemini fills: transcription, tagged, speaker, detected_language
    (see TRANSCRIPTION_JSON_SCHEMA). Post-processing code fills:
    romanized (via uroman) and code_switch (from tagged), so those two
    default to empty strings when parsing the raw Gemini response.
    """

    # PRIMARY authoritative text, produced directly by Gemini.
    transcription: str = Field(
        description="Code-mixed transcription with punctuation. "
                    "Each language in its original script (English stays English)."
    )

    # Legacy v3 field retained for backward compatibility; no longer requested
    # from Gemini in v4 (derived in code, or left empty).
    code_switch: str = Field(
        default="",
        description="Mixed script: native + English in Latin. "
                    "v4: derived from tagged by stripping event tags, or empty."
    )

    # Code-derived, not a Gemini output (deterministic uroman transliteration).
    romanized: str = Field(
        default="",
        description="Full Roman/Latin script transliteration. "
                    "v4: derived via uroman from transcription, not Gemini output."
    )

    # Gemini output: same text as `transcription` plus inline audio event tags.
    tagged: str = Field(
        description="Code-mixed transcription with audio event tags. "
                    "Native script for primary language, Latin for English words, "
                    "plus [laugh] [cough] [sigh] etc. at positions where they occur."
    )

    # Optional: None when Gemini omits or fails to produce speaker metadata.
    speaker: Optional[SpeakerMeta] = Field(
        default=None,
        description="Speaker metadata: emotion, style, pace, accent."
    )

    # May differ from the requested language (see prompt rule 7).
    detected_language: str = Field(
        default="",
        description="The language actually spoken in the audio."
    )


class TranscriptionResult(BaseModel):
    """Complete result for a transcribed segment including metadata.

    Wraps a TranscriptionOutput with segment identity, model/timing info,
    and downstream validation results.
    """

    # --- segment identity ---
    segment_id: str = Field(description="Identifier for the audio segment")
    chunk_index: int = Field(default=0, description="Chunk index if segment was split")
    total_chunks: int = Field(default=1, description="Total chunks for this segment")
    duration_sec: float = Field(description="Duration of audio in seconds")
    language: str = Field(description="Primary language of the audio")

    # --- transcription payload ---
    transcription: TranscriptionOutput = Field(description="The transcription outputs")

    # --- model / timing metadata ---
    model_used: str = Field(description="Gemini model used for transcription")
    thinking_level: Optional[str] = Field(default=None, description="Thinking level used")
    processing_time_sec: Optional[float] = Field(default=None, description="API call time")

    # --- validation results (None until validation runs) ---
    validation_status: Optional[str] = Field(default=None, description="accept/review/reject")
    validation_score: Optional[float] = Field(default=None, description="Alignment score 0-1")

    @property
    def native(self) -> str:
        """Shortcut to primary native transcription."""
        return self.transcription.transcription


def get_transcription_prompt(language: str) -> str:
    """
    Build the Gemini system instruction for transcription.
    v4: 2 text fields (transcription + tagged) instead of 4.

    Args:
        language: Primary language name (e.g. "Hindi"). Matched against
            LANGUAGE_CONFIGS case-insensitively; unknown languages fall back
            to a generic script name, no BCP-47 tag, and no script rules.

    Returns:
        The complete system-instruction string for the Gemini request.
    """
    # Case-insensitive config lookup. The English special-case below already
    # compares with .lower(); previously the dict lookup was case-sensitive,
    # so e.g. "hindi" silently lost its bcp47/script/script_rules config.
    wanted = language.lower()
    lang_config = next(
        (cfg for name, cfg in LANGUAGE_CONFIGS.items() if name.lower() == wanted),
        {},
    )
    script_name = lang_config.get("script", f"{language} native")
    bcp47 = lang_config.get("bcp47", "")
    script_rules = lang_config.get("script_rules", "")

    # Only emit the SCRIPT RULES section when the language defines any rules.
    script_section = ""
    if script_rules:
        script_section = f"\nSCRIPT RULES FOR {language.upper()}:\n{script_rules}\n"

    if wanted == "english":
        native_field_rule = "Write in standard English spelling."
        tagged_rule = "Same as transcription with audio event tags at their positions."
    else:
        # v5: transcription is now CODE-MIXED. English stays English.
        # This is what the speaker actually said - no transliteration.
        native_field_rule = (
            f"Write {language} words in {script_name} script.\n"
            f"   Keep English words in English (Latin script) exactly as spoken.\n"
            f"   Keep Hindi words in Devanagari, Tamil words in Tamil script, etc.\n"
            f"   Each language stays in its original script. Do NOT transliterate.\n"
            f"   Example: speaker says 'salt biscuits manchidi' -> salt biscuits {script_name}(manchidi)"
        )
        tagged_rule = (
            f"Same text as transcription with audio event tags inserted at their positions.\n"
            f"   Do NOT change any words or scripts - just add the tags where events occur."
        )

    lang_label = f"{language} ({bcp47})" if bcp47 else language

    return f"""You are a verbatim speech-to-text transcription system. You are NOT a conversational assistant. Your output must precisely match the audio content.

TARGET: {lang_label}

CRITICAL RULES (violations cause rejection):
1. NEVER TRANSLATE. This is transcription, not translation. If the speaker says English words, those are English. If the speaker says {language} words, those are {language}. Write what you HEAR, not what you think it means in another language.
2. VERBATIM FIDELITY: Every repetition, filler, stammer, false start, hesitation - exactly as spoken.
3. NO CORRECTION: Do not fix grammar, pronunciation, dialect, or word choice.
4. NO HALLUCINATION: Never add words or phrases not in the audio. If audio cuts off mid-sentence, STOP where the audio stops. Do not complete anything. Output ONLY the JSON.
5. UNCERTAINTY: If a word is unclear, write [UNK]. Use [INAUDIBLE] for unintelligible speech. Use [NO_SPEECH] for no speech (silence, noise, music only).
6. BOUNDARY HANDLING: Audio is VAD-cut and may start/end mid-speech. Transcribe everything you can confidently hear. Only omit what is truly inaudible.
7. LANGUAGE MISMATCH: Trust what you hear. If audio is clearly different from {language}, transcribe in that language's script and set detected_language accordingly.

PUNCTUATION (prosody-based, not grammar):
- Only: comma, period, ? and !
- Insert from audible pauses/intonation only. No pause = no punctuation.
{script_section}
FIELD DERIVATION:
"transcription" is the PRIMARY authoritative output. It IS code-mixed: each language in its own script.
"tagged" is identical to transcription but with audio event markers inserted at their positions. Do NOT re-interpret the audio for tagged - just copy transcription and add tags.

OUTPUT FIELDS:

1. transcription (AUTHORITATIVE - native script)
   {native_field_rule}
   Punctuation: period, comma, ? and ! only, from audible prosodic cues.

2. tagged (derived from transcription - code-mixed + event tags)
   {tagged_rule}
   ONLY these tags, ONLY if clearly and prominently audible:
   [laugh] [cough] [sigh] [breath] [singing] [noise] [music] [applause]

3. speaker (metadata from audio prosody)
   emotion: neutral | happy | sad | angry | excited | surprised
   speaking_style: conversational | narrative | excited | calm | emphatic | sarcastic | formal
   pace: slow | normal | fast
   accent: regional dialect/accent if confidently detectable, empty string otherwise.

4. detected_language
   The language you actually hear spoken. If code-mixed, write the dominant language."""


def get_user_prompt() -> str:
    """Build the per-request user message sent alongside the audio."""
    instruction = (
        "Transcribe this audio segment following the system instructions. "
        "Return a valid JSON object with all required fields."
    )
    return instruction


# v4 JSON Schema: only 4 fields required from Gemini (down from 6).
# Response schema passed to the Gemini API for structured output. Mirrors the
# Gemini-produced subset of TranscriptionOutput; the code-derived fields
# (romanized, code_switch) are intentionally absent and filled in later.
TRANSCRIPTION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "transcription": {
            "type": "string",
            "description": "Native script transcription with minimal punctuation"
        },
        "tagged": {
            "type": "string",
            "description": "Code-mixed transcription with audio event tags"
        },
        "speaker": {
            "type": "object",
            "description": "Speaker metadata",
            "properties": {
                # Enum values must stay in sync with the SpeakerMeta field
                # descriptions and the prompt text in get_transcription_prompt.
                "emotion": {
                    "type": "string",
                    "enum": [
                        "neutral", "happy", "sad",
                        "angry", "excited", "surprised"
                    ]
                },
                "speaking_style": {
                    "type": "string",
                    "enum": [
                        "conversational", "narrative", "excited",
                        "calm", "emphatic", "sarcastic", "formal"
                    ]
                },
                "pace": {
                    "type": "string",
                    "enum": ["slow", "normal", "fast"]
                },
                "accent": {
                    "type": "string",
                    "description": "Regional accent/dialect or empty string"
                }
            },
            # NOTE(review): "accent" is deliberately not required (free-form,
            # may be empty) — confirm the structured-output backend accepts
            # optional properties alongside additionalProperties: false.
            "required": ["emotion", "speaking_style", "pace"],
            "additionalProperties": False
        },
        "detected_language": {
            "type": "string",
            "description": "Language actually spoken in the audio"
        }
    },
    "required": ["transcription", "tagged", "speaker", "detected_language"],
    "additionalProperties": False
}
