"""
Pydantic schemas for structured transcription output.
Defines the four output formats required for Indian language transcription.
"""
from typing import Optional
from pydantic import BaseModel, Field


class TranscriptionOutput(BaseModel):
    """
    Structured output for a single audio segment transcription.
    Contains four different transcription formats as per requirements.
    """
    # NOTE: the class docstring above is left untouched on purpose; pydantic
    # copies it into the generated JSON schema as the model description.

    # --- Required: the four renderings of the same utterance ---------------

    # Native script only, no punctuation at all.
    native_transcription: str = Field(
        ...,
        description=(
            "Complete native script transcription without any punctuation."
            " English words written phonetically in native script."
            " All repetitions, fillers, stammers preserved exactly as spoken."
        ),
    )

    # Native script only, with a small fixed set of punctuation marks.
    native_with_punctuation: str = Field(
        ...,
        description=(
            "Native script transcription with minimal punctuation based on speech flow."
            " Uses period, comma, question mark, exclamation only."
            " No grammar correction or sentence restructuring."
        ),
    )

    # Mixed script: each language stays in its own script.
    code_switch: str = Field(
        ...,
        description=(
            "Mixed script transcription preserving original language switching."
            " Indian language parts in native script, English words in Latin script."
            " Minimal punctuation following primary language convention."
        ),
    )

    # Latin-script transliteration of everything that was said.
    romanized: str = Field(
        ...,
        description=(
            "Complete Roman/Latin script transcription (transliteration)."
            " All speech converted to romanized form as pronounced."
            " Preserves pronunciation exactly even if spelling looks incorrect."
        ),
    )

    # --- Optional extras; both default to None when omitted ----------------

    # The 0-1 range is stated in the description only; it is not enforced here.
    confidence: Optional[float] = Field(
        None,
        description="Optional confidence score for the transcription (0-1)",
    )

    notes: Optional[str] = Field(
        None,
        description="Any notes about audio quality, unclear portions, or special observations",
    )


class TranscriptionResult(BaseModel):
    """Complete result for a transcribed segment including metadata."""
    # NOTE: the class docstring above is left untouched on purpose; pydantic
    # copies it into the generated JSON schema as the model description.

    # --- Segment identity and audio metadata --------------------------------
    segment_id: str = Field(..., description="Identifier for the audio segment")
    # Defaults (index 0 of 1 chunk) describe a segment that was not split.
    chunk_index: int = Field(0, description="Chunk index if segment was split")
    total_chunks: int = Field(1, description="Total chunks for this segment")
    duration_sec: float = Field(..., description="Duration of audio in seconds")
    language: str = Field(..., description="Primary language of the audio")

    # --- Model output -------------------------------------------------------
    transcription: TranscriptionOutput = Field(
        ...,
        description="The transcription outputs",
    )

    # --- How the transcription was produced ---------------------------------
    model_used: str = Field(..., description="Gemini model used for transcription")
    thinking_level: Optional[str] = Field(None, description="Thinking level used")
    processing_time_sec: Optional[float] = Field(
        None,
        description="API call time in seconds",
    )

    # --- Validation outcome; None until a value is supplied ------------------
    validation_status: Optional[str] = Field(None, description="accept/review/reject")
    validation_score: Optional[float] = Field(None, description="Alignment score 0-1")

    @property
    def native(self) -> str:
        """Return ``transcription.native_transcription`` for convenient access."""
        output = self.transcription
        return output.native_transcription


def get_transcription_prompt(language: str) -> str:
    """
    Generate the system instruction prompt for transcription.

    The prompt names the primary language, states the global verbatim rules,
    and then defines the four output fields (native_transcription,
    native_with_punctuation, code_switch, romanized) one by one.

    The sentence-ending rule is language-aware: scripts such as Devanagari
    (Hindi) or Bengali end sentences with the danda (।), whereas several other
    Indian languages (e.g. Tamil, Telugu, Kannada, Malayalam) use the period.
    The prompt therefore asks for whichever mark is standard in ``language``
    rather than forcing the danda everywhere.

    Args:
        language: The primary language of the audio. It is interpolated
            verbatim into the prompt text.

    Returns:
        System instruction string
    """
    return f"""You are a strict, verbatim transcription engine for Indian languages.

Primary audio language: {language}

TASK:
Listen to the audio carefully and transcribe the speech exactly as spoken.
Produce a JSON object with four transcription outputs.

STRICT GLOBAL RULES (apply to all outputs):
1. VERBATIM ONLY: Include all repetitions, filler words, stammers, false starts, hesitations, and colloquial expressions exactly as spoken
2. NO NORMALIZATION: Do not correct grammar, pronunciation, spellings, or clean up dialect/accent/mixed language usage
3. NO INFERENCE: Do not add meaning, structure, or emphasis that is not clearly audible
4. SCRIPT FIDELITY: Follow the rules of each output strictly

OUTPUT DEFINITIONS:

native_transcription:
- Write everything ONLY in the native script of {language}
- NO punctuation marks at all
- Even if English words are spoken, write them phonetically in {language} script
- Preserve ALL repetitions, fillers, stammers exactly

native_with_punctuation:
- Write everything ONLY in the native script of {language}
- Use MINIMAL punctuation strictly based on speech flow:
  * Full stop (use । or . whichever is the standard sentence-ending mark in {language}) for clear sentence endings
  * Comma (,) only for clearly audible pauses
  * Question mark (?) only when question intonation is clearly audible
  * Exclamation mark (!) only when strong emphasis is clearly audible
- NO ellipses, quotation marks, brackets, colons, semicolons, or multiple punctuation
- Do NOT correct grammar or restructure sentences

code_switch:
- Preserve the original language switching exactly as spoken
- Write Indian language parts in their native script
- Write English words and phrases in English (Latin script) if spoken that way
- Do NOT translate, normalize, or rewrite across languages
- Use minimal punctuation based on audible cues

romanized:
- Write the ENTIRE transcription in Roman/Latin script only
- Convert all {language} speech into Romanized form as it sounds
- Preserve pronunciation exactly as spoken even if spelling looks incorrect
- Preserve ALL repetitions, fillers, stammers, false starts
- Do NOT standardize spellings or make them grammatically correct
- Use minimal punctuation only if clearly audible

If audio is unclear, transcribe what you can hear. Use the 'notes' field for quality issues.
If completely inaudible, set all transcription fields to "[INAUDIBLE]"."""


def get_user_prompt() -> str:
    """Return the fixed user-turn message that accompanies the audio."""
    instruction = "Transcribe this audio segment following the system instructions."
    output_request = "Return a valid JSON object with the four transcription outputs."
    # The two sentences are joined by exactly one space.
    return f"{instruction} {output_request}"


# JSON Schema for Gemini's structured output.
# It lists the same six field names as TranscriptionOutput: the four
# transcription strings are required, while "confidence" and "notes" are not.
# Each row below is (field name, JSON type, description); row order determines
# the key order of "properties".
TRANSCRIPTION_JSON_SCHEMA = {
    "type": "object",
    "properties": {
        field_name: {"type": json_type, "description": summary}
        for field_name, json_type, summary in (
            ("native_transcription", "string",
             "Native script transcription without punctuation"),
            ("native_with_punctuation", "string",
             "Native script transcription with minimal punctuation"),
            ("code_switch", "string",
             "Mixed script preserving language switching"),
            ("romanized", "string",
             "Complete Roman script transliteration"),
            ("confidence", "number",
             "Confidence score 0-1 (optional)"),
            ("notes", "string",
             "Notes about audio quality or unclear portions (optional)"),
        )
    },
    "required": [
        "native_transcription",
        "native_with_punctuation",
        "code_switch",
        "romanized",
    ],
}
