"""
Pydantic schemas for structured transcription output.

Output fields (what Gemini returns per segment):
  1. transcription   - Native script with punctuation (primary)
  2. code_switch     - Mixed script: native + English in Latin
  3. romanized       - Full Latin transliteration
  4. tagged          - Code-switch + audio event tags [laugh] etc.
  5. speaker         - Metadata: emotion, style, pace, accent
"""
from typing import Optional
from pydantic import BaseModel, Field


class SpeakerMeta(BaseModel):
    """Speaker metadata for TTS training. Only what matters."""
    # Perceived emotion label. Kept as a free-form str (not an enum/Literal)
    # so an off-list value from the model never fails validation.
    emotion: str = Field(default="neutral", description="neutral, happy, sad, angry, excited, surprised")
    # Overall delivery style of the segment.
    speaking_style: str = Field(
        default="conversational",
        description="conversational, narrative, excited, calm, emphatic, sarcastic, formal",
    )
    # Rough speech rate.
    pace: str = Field(default="normal", description="slow, normal, fast")
    # Regional accent/dialect; empty string when not detectable.
    accent: str = Field(default="", description="Regional accent/dialect if detectable, empty string if unknown")


class TranscriptionOutput(BaseModel):
    """Structured output: 4 transcription formats + speaker metadata."""

    # 1. Primary output: everything (including English) in the native script.
    transcription: str = Field(
        description=(
            "Native script transcription with minimal punctuation. "
            "English words written phonetically in native script."
        )
    )

    # 2. Code-switched view: native script + English kept in Latin script.
    code_switch: str = Field(
        description=(
            "Mixed script: native language in native script, "
            "English words in Latin script. Minimal punctuation."
        )
    )

    # 3. Full Latin transliteration (pronunciation-faithful).
    romanized: str = Field(description="Full Roman/Latin script transliteration as pronounced.")

    # 4. Code-switched view annotated with inline audio-event tags.
    tagged: str = Field(
        description=(
            "Code-switch transcription with audio event tags "
            "([laugh], [cough], [sigh], [breath], [singing], "
            "[noise], [music], [applause]) at positions where they occur."
        )
    )

    # 5. Optional speaker metadata; None when the model omits it.
    speaker: Optional[SpeakerMeta] = Field(
        default=None,
        description="Speaker metadata: emotion, style, pace, accent.",
    )


class TranscriptionResult(BaseModel):
    """Complete result for a transcribed segment including metadata."""

    # --- Segment identity / audio facts ---
    segment_id: str = Field(description="Identifier for the audio segment")
    chunk_index: int = Field(
        default=0,
        description="Chunk index if segment was split",
    )
    total_chunks: int = Field(
        default=1,
        description="Total chunks for this segment",
    )
    duration_sec: float = Field(description="Duration of audio in seconds")
    language: str = Field(description="Primary language of the audio")

    # --- The actual transcription payload ---
    transcription: TranscriptionOutput = Field(description="The transcription outputs")

    # --- Provenance of the API call ---
    model_used: str = Field(description="Gemini model used for transcription")
    thinking_level: Optional[str] = Field(
        default=None,
        description="Thinking level used",
    )
    processing_time_sec: Optional[float] = Field(
        default=None,
        description="API call time",
    )

    # Validation fields (filled by CTC validator, not Gemini)
    validation_status: Optional[str] = Field(
        default=None,
        description="accept/review/reject",
    )
    validation_score: Optional[float] = Field(
        default=None,
        description="Alignment score 0-1",
    )

    @property
    def native(self) -> str:
        """Convenience accessor for the primary native-script text."""
        return self.transcription.transcription


# === Prompt ===

def get_transcription_prompt(language: str) -> str:
    """Build the Gemini system instruction for verbatim transcription.

    Args:
        language: Primary language of the audio; interpolated into the
            prompt so the model writes native script for that language.

    Returns:
        The full system-instruction string describing the five output
        fields (transcription, code_switch, romanized, tagged, speaker)
        matching ``TranscriptionOutput``. The prompt text is behavior:
        do not edit its wording without re-validating model output.
    """
    return f"""You are a strict verbatim transcription engine for {language} audio.

GLOBAL RULES:
- Transcribe EXACTLY as spoken: all repetitions, fillers, stammers, false starts
- Do NOT correct grammar, pronunciation, or normalize dialect/accent
- Audio may start/end mid-speech (VAD-cut segments) - transcribe only what is clearly audible, do not guess incomplete words at boundaries
- If completely inaudible, set all fields to "[INAUDIBLE]"

OUTPUT FIELDS:

1. transcription
   Native script of {language} with minimal punctuation.
   English words written phonetically in {language} script.
   Punctuation: period for sentence ends, comma for audible pauses, ? for questions, ! for emphasis. No other punctuation.

2. code_switch
   Preserve language switching exactly as spoken.
   {language} parts in native script, English words/phrases in Latin script.
   Minimal punctuation based on audible cues.

3. romanized
   Entire transcription in Roman/Latin script as pronounced.
   Preserve pronunciation even if spelling looks wrong. No standardization.

4. tagged
   Same as code_switch but with audio event tags inserted at the position where they occur.
   ALLOWED TAGS (only use if clearly audible, never hallucinate):
   [laugh] - laughter
   [cough] - coughing
   [sigh] - audible sigh
   [breath] - heavy/audible breathing
   [singing] - humming, singing, melodic vocalization
   [noise] - non-speech noise burst
   [music] - background music
   [applause] - clapping

5. speaker (metadata object)
   - emotion: neutral | happy | sad | angry | excited | surprised
   - speaking_style: conversational | narrative | excited | calm | emphatic | sarcastic | formal
   - pace: slow | normal | fast
   - accent: regional dialect/accent if detectable, empty string if unknown"""


def get_user_prompt() -> str:
    """Fixed user-turn prompt that accompanies the audio attachment."""
    prompt = (
        "Transcribe this audio segment following the system instructions."
        " Return a valid JSON object with all required fields."
    )
    return prompt


# === JSON Schema for Gemini structured output ===

def _string_field(description: str) -> dict:
    """Build a JSON-schema string property with the given description."""
    return {"type": "string", "description": description}


# JSON schema handed to Gemini's structured-output mode. Mirrors
# TranscriptionOutput: four string fields plus a speaker metadata object.
TRANSCRIPTION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "transcription": _string_field(
            "Native script transcription with minimal punctuation"
        ),
        "code_switch": _string_field(
            "Mixed script preserving language switching"
        ),
        "romanized": _string_field(
            "Complete Roman script transliteration"
        ),
        "tagged": _string_field(
            "Code-switch with audio event tags"
        ),
        "speaker": {
            "type": "object",
            "description": "Speaker metadata",
            "properties": {
                "emotion": _string_field(
                    "neutral, happy, sad, angry, excited, surprised"
                ),
                "speaking_style": _string_field(
                    "conversational, narrative, excited, calm, emphatic, sarcastic, formal"
                ),
                "pace": _string_field("slow, normal, fast"),
                "accent": _string_field(
                    "Regional accent/dialect or empty string"
                ),
            },
            # accent is intentionally optional in the schema.
            "required": ["emotion", "speaking_style", "pace"],
        },
    },
    "required": ["transcription", "code_switch", "romanized", "tagged", "speaker"],
}
