"""
Pydantic schemas for structured transcription output.

Output fields (what Gemini returns per segment):
  1. transcription   - Native script with punctuation (primary)
  2. code_switch     - Mixed script: native + English in Latin
  3. romanized       - Full Latin transliteration
  4. tagged          - Code-switch + audio event tags [laugh] etc.
  5. speaker         - Metadata: emotion, style, pace, accent
"""
from typing import Optional
from pydantic import BaseModel, Field


class SpeakerMeta(BaseModel):
    """Speaker metadata for TTS training. Only what matters."""

    # Every attribute is a plain ``str`` with a default, so an empty payload
    # still yields a valid object. The descriptions enumerate the expected
    # values, but nothing in this model enforces them.

    # Overall emotional tone of the speaker.
    emotion: str = Field(
        "neutral",
        description="neutral, happy, sad, angry, excited, surprised",
    )

    # Manner of delivery.
    speaking_style: str = Field(
        "conversational",
        description=(
            "conversational, narrative, excited, calm, "
            "emphatic, sarcastic, formal"
        ),
    )

    # Speaking rate.
    pace: str = Field("normal", description="slow, normal, fast")

    # Free-form accent/dialect label; the empty string stands for "unknown".
    accent: str = Field(
        "",
        description=(
            "Regional accent/dialect if detectable, "
            "empty string if unknown"
        ),
    )


class TranscriptionOutput(BaseModel):
    """Structured output: 4 transcription formats + speaker metadata."""

    # The four text renderings below are required (``...`` = no default);
    # only ``speaker`` may be omitted.

    # Primary rendering: everything in the native script.
    transcription: str = Field(
        ...,
        description=(
            "Native script transcription with minimal punctuation. "
            "English words written phonetically in native script."
        ),
    )

    # Same utterance with each language kept in its own script.
    code_switch: str = Field(
        ...,
        description=(
            "Mixed script: native language in native script, "
            "English words in Latin script. "
            "Minimal punctuation."
        ),
    )

    # Latin-script transliteration of the whole utterance.
    romanized: str = Field(
        ...,
        description="Full Roman/Latin script transliteration as pronounced.",
    )

    # Code-switch text annotated with bracketed audio event tags.
    tagged: str = Field(
        ...,
        description=(
            "Code-switch transcription with audio event tags "
            "([laugh], [cough], [sigh], [breath], "
            "[singing], [noise], [music], [applause]) "
            "at positions where they occur."
        ),
    )

    # Optional speaker descriptors; ``None`` when not supplied.
    speaker: Optional[SpeakerMeta] = Field(
        None,
        description="Speaker metadata: emotion, style, pace, accent.",
    )


class TranscriptionResult(BaseModel):
    """Complete result for a transcribed segment including metadata."""

    # --- Segment identity and audio properties ---
    segment_id: str = Field(..., description="Identifier for the audio segment")
    chunk_index: int = Field(0, description="Chunk index if segment was split")
    total_chunks: int = Field(1, description="Total chunks for this segment")
    duration_sec: float = Field(..., description="Duration of audio in seconds")
    language: str = Field(..., description="Primary language of the audio")

    # --- Model output (nested structured transcription) ---
    transcription: TranscriptionOutput = Field(
        ...,
        description="The transcription outputs",
    )

    # --- Provenance of the API call ---
    model_used: str = Field(..., description="Gemini model used for transcription")
    thinking_level: Optional[str] = Field(None, description="Thinking level used")
    processing_time_sec: Optional[float] = Field(None, description="API call time")

    # --- Validation fields (filled by CTC validator, not Gemini) ---
    # Both stay ``None`` until a value is assigned.
    validation_status: Optional[str] = Field(None, description="accept/review/reject")
    validation_score: Optional[float] = Field(None, description="Alignment score 0-1")

    @property
    def native(self) -> str:
        """Return the native-script text of the nested transcription output."""
        output = self.transcription
        return output.transcription


# === Prompt ===

def get_transcription_prompt(language: str) -> str:
    """Build the system instruction for transcribing audio in ``language``.

    Args:
        language: Name of the spoken language; it is substituted verbatim
            into every ``{language}`` placeholder of the prompt text.

    Returns:
        The full prompt text: the verbatim-transcription rules followed by
        the description of the five output fields.
    """
    # Plain (non-f) templates; all placeholders are filled in one pass below.
    rules_template = '''You are a strict verbatim transcription engine for {language} audio.

CRITICAL RULES - VIOLATIONS WILL CAUSE FAILURE:
- This is TRANSCRIPTION, never translation. Write exactly what is spoken in the language it is spoken.
- If the speaker says English words, transcribe them as English. Do NOT convert English speech into {language}.
- If the speaker says {language} words, write them in {language} script. Do NOT convert {language} speech into English.
- Transcribe EXACTLY as spoken: preserve all repetitions, fillers, stammers, false starts, hesitations.
- Do NOT correct grammar, pronunciation, or normalize dialect/accent/spelling in any field.
- Audio may start/end mid-speech (VAD-cut segments) - transcribe only what is clearly audible, do not guess words at boundaries.
- If completely inaudible, set all text fields to "[INAUDIBLE]"'''

    fields_template = '''OUTPUT FIELDS:

1. transcription
   Write in native {language} script only. Minimal punctuation (period, comma, ? and ! only from audible cues).
   Even English words must be written phonetically in {language} script in this field.

2. code_switch
   Preserve language switching exactly as spoken.
   {language} parts stay in {language} script. English parts stay in English (Latin script).
   Do NOT translate either direction. Minimal punctuation from audible cues.

3. romanized
   Everything in Roman/Latin script exactly as pronounced.
   Preserve pronunciation even if spelling looks wrong. No standardization. No translation.

4. tagged
   Same content as code_switch, but insert audio event tags at the exact position where they occur.
   ONLY these tags, ONLY if clearly audible (never hallucinate tags):
   [laugh] [cough] [sigh] [breath] [singing] [noise] [music] [applause]

5. speaker
   emotion: neutral | happy | sad | angry | excited | surprised
   speaking_style: conversational | narrative | excited | calm | emphatic | sarcastic | formal
   pace: slow | normal | fast
   accent: regional dialect if detectable, empty string if unknown'''

    # One blank line separates the rules from the field descriptions.
    full_template = "\n\n".join((rules_template, fields_template))
    return full_template.format(language=language)


def get_user_prompt() -> str:
    """Return the fixed user-turn message that accompanies the audio."""
    sentences = (
        "Transcribe this audio segment following the system instructions.",
        "Return a valid JSON object with all required fields.",
    )
    # The two sentences are separated by exactly one space.
    return " ".join(sentences)


# === JSON Schema for Gemini structured output ===

TRANSCRIPTION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        # The four text renderings are all plain strings, so they are
        # generated from (name, description) pairs; order is preserved.
        **{
            field_name: {"type": "string", "description": field_text}
            for field_name, field_text in (
                ("transcription", "Native script transcription with minimal punctuation"),
                ("code_switch", "Mixed script preserving language switching"),
                ("romanized", "Complete Roman script transliteration"),
                ("tagged", "Code-switch with audio event tags"),
            )
        },
        # Nested object with string-typed speaker attributes.
        "speaker": {
            "type": "object",
            "description": "Speaker metadata",
            "properties": {
                attr_name: {"type": "string", "description": attr_text}
                for attr_name, attr_text in (
                    ("emotion", "neutral, happy, sad, angry, excited, surprised"),
                    (
                        "speaking_style",
                        "conversational, narrative, excited, calm, emphatic, sarcastic, formal",
                    ),
                    ("pace", "slow, normal, fast"),
                    ("accent", "Regional accent/dialect or empty string"),
                )
            },
            # "accent" is described above but not listed as required.
            "required": ["emotion", "speaking_style", "pace"],
        },
    },
    "required": ["transcription", "code_switch", "romanized", "tagged", "speaker"],
}
