o
    5i.                     @   s|  U d Z ddlmZmZ ddlmZmZ ddddddd	dd
dddddddddddddddddddddddddddddd dd!d"ddd#d$d%dd&Zeeeeef f e	d'< G d(d) d)eZ
G d*d+ d+eZG d,d- d-eZd.ed/efd0d1Zd/efd2d3Zd4d5d6d7d5d8d7d4d9d5g d:d;d5g d<d;d5g d=d;d5d>d7d?g d@dAdBd5dCd7dDg dDdAdEZdFS )Ga}  
Pydantic schemas for structured transcription output.

Architecture change (v4): Gemini outputs fewer fields, code derives the rest.
  Gemini outputs:
    1. transcription   - Native script with punctuation (PRIMARY, authoritative)
    2. tagged          - Code-mixed + audio event tags [laugh] etc.
    3. speaker         - Metadata: emotion, style, pace, accent
    4. detected_language

  Code derives (deterministic, not Gemini):
    5. romanized       - uroman-based Latin transliteration (from transcription)
    6. code_switch     - dropped (tagged subsumes it)

Prompt v4 design (evolved from v1 -> v2 strict -> v3 field derivation -> v4 simplified):
  - Reduced Gemini output from 4 text fields to 2 (transcription + tagged)
  - Less cognitive load = better adherence, more deterministic output
  - romanized derived deterministically via uroman = stable, reproducible MMS alignment
    )OptionalDict)	BaseModelFieldzhi-IN
Devanagariz'Preserve Nukta when clearly pronounced.)bcp47scriptscript_ruleszmr-INz&Preserve retroflex lateral accurately.zte-INTeluguzADon't over-split words. Preserve Sandhi/combined forms as spoken.zta-INTamilz-Distinguish short and long vowels accurately.zkn-INKannadaz/Preserve agglutinated/combined forms as spoken.zml-IN	Malayalamz8Don't split agglutinated words. Preserve chillu letters.zgu-INGujarati zpa-INGurmukhizbn-INBengaliz4Preserve Chandrabindu for nasalization where spoken.zas-INAssamesez:Use Assamese-specific characters, NOT Bengali equivalents.zor-INOdiazen-INLatinzBStandard English spelling. Don't phonetically approximate accents.)HindiMarathir
   r   r   r   r   Punjabir   r   r   EnglishLANGUAGE_CONFIGSc                   @   sb   e Zd ZU dZedddZeed< edddZeed< ed	d
dZ	eed< edddZ
eed< dS )SpeakerMetaz"Speaker metadata for TTS training.neutralz.neutral, happy, sad, angry, excited, surpriseddefaultdescriptionemotionconversationalzEconversational, narrative, excited, calm, emphatic, sarcastic, formalspeaking_stylenormalzslow, normal, fastpacer   z>Regional accent/dialect if detectable, empty string if unknownaccentN)__name__
__module____qualname____doc__r   r   str__annotations__r!   r#   r$    r+   r+   A/home/ubuntu/maya3_transcribe/src/backend/transcription_schema.pyr   X   s$   
 r   c                   @   s   e Zd ZU dZeddZeed< edddZeed< edd	dZ	eed
< eddZ
eed< edddZee ed< edddZeed< dS )TranscriptionOutputz>Structured output: 2 text fields from Gemini + derived fields.zhCode-mixed transcription with punctuation. Each language in its original script (English stays English).r   transcriptionr   zcMixed script: native + English in Latin. v4: derived from tagged by stripping event tags, or empty.r   code_switchzfFull Roman/Latin script transliteration. v4: derived via uroman from transcription, not Gemini output.	romanizedzCode-mixed transcription with audio event tags. Native script for primary language, Latin for English words, plus [laugh] [cough] [sigh] etc. at positions where they occur.taggedNz/Speaker metadata: emotion, style, pace, accent.speakerz*The language actually spoken in the audio.detected_language)r%   r&   r'   r(   r   r/   r)   r*   r0   r1   r2   r3   r   r   r4   r+   r+   r+   r,   r-   l   s0   
 r-   c                   @   s  e Zd ZU dZeddZeed< edddZe	ed< ed	d
dZ
e	ed< eddZeed< eddZeed< eddZeed< eddZeed< edddZee ed< edddZee ed< edddZee ed< edddZee ed< edefddZdS ) TranscriptionResultz=Complete result for a transcribed segment including metadata.z Identifier for the audio segmentr.   
segment_idr   z Chunk index if segment was splitr   chunk_index   zTotal chunks for this segmenttotal_chunkszDuration of audio in secondsduration_seczPrimary language of the audiolanguagezThe transcription outputsr/   z#Gemini model used for transcription
model_usedNzThinking level usedthinking_levelzAPI call timeprocessing_time_seczaccept/review/rejectvalidation_statuszAlignment score 0-1validation_scorereturnc                 C   s   | j j S )z)Shortcut to primary native transcription.)r/   )selfr+   r+   r,   native   s   zTranscriptionResult.native)r%   r&   r'   r(   r   r6   r)   r*   r7   intr9   r:   floatr;   r/   r-   r<   r=   r   r>   r?   r@   propertyrC   r+   r+   r+   r,   r5      s   
 r5   r;   rA   c           	      C   s   t | i }|d|  d}|dd}|dd}d}|r*d|   d| d}|  d	kr5d
}d}nd|  d| d| d}d}|rM|  d| dn| }d| d|  d|  d|  d| d| d| dS )zs
    System instruction for Gemini transcription.
    v4: 2 text fields (transcription + tagged) instead of 4.
    r   z nativer   r   r	   z
SCRIPT RULES FOR z:

englishz#Write in standard English spelling.z?Same as transcription with audio event tags at their positions.zWrite z
 words in a   script.
   Keep English words in English (Latin script) exactly as spoken.
   Keep Hindi words in Devanagari, Tamil words in Tamil script, etc.
   Each language stays in its original script. Do NOT transliterate.
   Example: speaker says 'salt biscuits manchidi' -> salt biscuits z
(manchidi)zSame text as transcription with audio event tags inserted at their positions.
   Do NOT change any words or scripts - just add the tags where events occur.z ()zYou are a verbatim speech-to-text transcription system. You are NOT a conversational assistant. Your output must precisely match the audio content.

TARGET: z

CRITICAL RULES (violations cause rejection):
1. NEVER TRANSLATE. This is transcription, not translation. If the speaker says English words, those are English. If the speaker says z words, those are a*  . Write what you HEAR, not what you think it means in another language.
2. VERBATIM FIDELITY: Every repetition, filler, stammer, false start, hesitation - exactly as spoken.
3. NO CORRECTION: Do not fix grammar, pronunciation, dialect, or word choice.
4. NO HALLUCINATION: Never add words or phrases not in the audio. If audio cuts off mid-sentence, STOP where the audio stops. Do not complete anything. Output ONLY the JSON.
5. UNCERTAINTY: If a word is unclear, write [UNK]. Use [INAUDIBLE] for unintelligible speech. Use [NO_SPEECH] for no speech (silence, noise, music only).
6. BOUNDARY HANDLING: Audio is VAD-cut and may start/end mid-speech. Transcribe everything you can confidently hear. Only omit what is truly inaudible.
7. LANGUAGE MISMATCH: Trust what you hear. If audio is clearly different from z, transcribe in that language's script and set detected_language accordingly.

PUNCTUATION (prosody-based, not grammar):
- Only: comma, period, ? and !
- Insert from audible pauses/intonation only. No pause = no punctuation.
ar  
FIELD DERIVATION:
"transcription" is the PRIMARY authoritative output. It IS code-mixed: each language in its own script.
"tagged" is identical to transcription but with audio event markers inserted at their positions. Do NOT re-interpret the audio for tagged - just copy transcription and add tags.

OUTPUT FIELDS:

1. transcription (AUTHORITATIVE - native script)
   z
   Punctuation: period, comma, ? and ! only, from audible prosodic cues.

2. tagged (derived from transcription - code-mixed + event tags)
   a.  
   ONLY these tags, ONLY if clearly and prominently audible:
   [laugh] [cough] [sigh] [breath] [singing] [noise] [music] [applause]

3. speaker (metadata from audio prosody)
   emotion: neutral | happy | sad | angry | excited | surprised
   speaking_style: conversational | narrative | excited | calm | emphatic | sarcastic | formal
   pace: slow | normal | fast
   accent: regional dialect/accent if confidently detectable, empty string otherwise.

4. detected_language
   The language you actually hear spoken. If code-mixed, write the dominant language.)r   getupperlower)	r;   lang_configscript_namer   r	   script_sectionnative_field_ruletagged_rule
lang_labelr+   r+   r,   get_transcription_prompt   s@   rS   c                   C   s   dS )z#User prompt to accompany the audio.zuTranscribe this audio segment following the system instructions. Return a valid JSON object with all required fields.r+   r+   r+   r+   r,   get_user_prompt   s   rT   objectstringz4Native script transcription with minimal punctuation)typer   z.Code-mixed transcription with audio event tagszSpeaker metadata)r   happysadangryexcited	surprised)rW   enum)r    	narrativer[   calmemphatic	sarcasticformal)slowr"   fastz'Regional accent/dialect or empty string)r   r!   r#   r$   )r   r!   r#   F)rW   r   
propertiesrequiredadditionalPropertiesz%Language actually spoken in the audio)r/   r2   r3   r4   )rW   re   rf   rg   N)r(   typingr   r   pydanticr   r   r   r)   r*   r   r-   r5   rS   rT   TRANSCRIPTION_JSON_SCHEMAr+   r+   r+   r,   <module>   s    @%L,
