o
    î:£i-  ã                   @  sô   d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	 ddl
mZmZmZ G dd	„ d	eeƒZG d
d„ deeƒZG dd„ deeƒZG dd„ deƒZG dd„ deƒZd*dd„Zd+d,dd„ZdZdZdZdZd-d"d#„Zd.d$d%„Zd-d&d'„Zd.d(d)„ZdS )/z
Parameterized system prompt + Pydantic JSON schema.
Prompt is ~400 tokens, language-specific, no schema in prompt text (API-only enforcement).
é    )Úannotations)ÚEnum)ÚOptional)Ú	BaseModelÚFieldé   )ÚLANGUAGE_MAPÚAUDIO_EVENT_TAGSÚSUPPORTED_LANGUAGESc                   @  s$   e Zd ZdZdZdZdZdZdZdS )ÚSpeakerEmotionÚneutralÚhappyÚsadÚangryÚexcitedÚ	surprisedN)	Ú__name__Ú
__module__Ú__qualname__r   r   r   r   r   r   © r   r   ú./home/ubuntu/transcripts/src/prompt_builder.pyr      s    r   c                   @  s(   e Zd ZdZdZdZdZdZdZdZ	dS )	ÚSpeakingStyleÚconversationalÚ	narrativer   ÚcalmÚemphaticÚ	sarcasticÚformalN)
r   r   r   r   r   r   r   r   r   r   r   r   r   r   r      s    r   c                   @  s   e Zd ZdZdZdZdS )ÚSpeakerPaceÚslowÚnormalÚfastN)r   r   r   r   r    r!   r   r   r   r   r   $   s    r   c                   @  sX   e Zd ZU eddZded< eddZded< eddZd	ed
< edddZded< dS )ÚSpeakerMetadataz+Speaker emotion detected from audio prosody©Údescriptionr   Úemotionz"Speaking style detected from audior   Úspeaking_stylez$Speaking pace: slow, normal, or fastr   ÚpaceÚ z{Sub-regional dialect only (e.g. 'Hyderabadi', 'Chennai', 'Bhojpuri'). Do NOT put the language name. Empty string if unsure.)Údefaultr$   ÚstrÚaccentN)	r   r   r   r   r%   Ú__annotations__r&   r'   r+   r   r   r   r   r"   *   s
   
 r"   c                   @  sZ   e Zd ZU dZeddZded< eddZded< eddZd	ed
< eddZ	ded< dS )ÚTranscriptionSchemaz5The output schema enforced via API structured output.zbNative script transcription with minimal punctuation. Code-mixed: each language in its own script.r#   r*   ÚtranscriptionzLSame as transcription but with audio event tags inserted at their positions.Útaggedz+Speaker metadata derived from audio prosodyr"   ÚspeakerzUISO 639-1 code of the language actually spoken. If code-mixed, the dominant language.Údetected_languageN)
r   r   r   Ú__doc__r   r.   r,   r/   r0   r1   r   r   r   r   r-   1   s   
 r-   ÚreturnÚdictc                  C  s   t  ¡ } t| ƒS )z0Return the JSON schema dict for API enforcement.)r-   Úmodel_json_schemaÚ_resolve_refs)Úschemar   r   r   Úget_json_schema9   s   r8   Nr7   Údefsúdict | Nonec                   s°   ˆ du r
|   di ¡‰ d| v r$| d  d¡d }ˆ  |i ¡}tt|ƒˆ ƒS i }|  ¡ D ]+\}}|dkr3q*t|tƒr@t|ˆ ƒ||< q*t|tƒrQ‡ fdd„|D ƒ||< q*|||< q*|S )zBRecursively resolve $ref pointers so the schema is self-contained.Nz$defsz$refú/éÿÿÿÿc                   s$   g | ]}t |tƒrt|ˆ ƒn|‘qS r   )Ú
isinstancer4   r6   )Ú.0Úitem©r9   r   r   Ú
<listcomp>Q   s   $ z!_resolve_refs.<locals>.<listcomp>)ÚpopÚsplitÚgetr6   r4   Úitemsr=   Úlist)r7   r9   Úref_nameÚresolvedÚresultÚkÚvr   r@   r   r6   @   s    


r6   uƒ  You are a verbatim speech-to-text transcription system. You are NOT a conversational assistant. Your output must precisely match the audio content.

TARGET: {lang_name} ({lang_code})

CRITICAL RULES (violations cause rejection):
1. NEVER TRANSLATE. This is transcription, not translation. If the speaker says English words, those are English. If the speaker says {lang_name} words, those are {lang_name}. Write what you HEAR.
2. VERBATIM FIDELITY: Every repetition, filler, stammer, false start, hesitation - exactly as spoken.
3. NO CORRECTION: Do not fix grammar, pronunciation, dialect, or word choice.
4. NO HALLUCINATION: Never add words not in the audio. If audio cuts off mid-sentence, STOP where the audio stops. Output ONLY the JSON.
5. UNCERTAINTY: If a word is unclear, write [UNK]. Use [INAUDIBLE] for unintelligible speech. Use [NO_SPEECH] for no speech.
6. BOUNDARY HANDLING: Audio is VAD-cut and may start/end mid-speech. Transcribe everything you can confidently hear.
7. LANGUAGE MISMATCH: Trust what you hear. If audio is clearly different from {lang_name}, transcribe in that language's script and set detected_language accordingly.

PUNCTUATION (prosody-based only): comma, period, ? and ! â€” insert from audible pauses/intonation only.

SCRIPT RULES FOR {script_name}: {script_hint}

NUMBERS: Write all numbers as digits (1, 2, 100, 1000) not words.

FIELD DERIVATION:
"transcription" is the PRIMARY output. Code-mixed: each language in its own script.
"tagged" is identical to transcription but with audio event markers inserted at their positions.

ALLOWED EVENT TAGS (only if clearly audible): {event_tags}

"speaker" metadata: detect emotion, speaking_style, pace from audio prosody. For accent: only a specific sub-regional dialect (e.g. "Hyderabadi", "Chennai"). Do NOT put the language name. Default to empty string.
"detected_language": ISO 639-1 code of the dominant language actually spoken.uÊ  # Role

You are a verbatim speech-to-text transcription system. You are NOT a conversational assistant. Your output must precisely match the audio content. Output ONLY the JSON.

# Critical Rules

1. NEVER TRANSLATE. This is transcription, not translation. Write what you HEAR in the script it was spoken in.
2. VERBATIM FIDELITY. Every repetition, filler, stammer, false start, hesitation â€” exactly as spoken. Do not clean up speech.
3. NO CORRECTION. Do not fix grammar, pronunciation, dialect, or word choice.
4. NO HALLUCINATION. Never add words not in the audio. If audio cuts off mid-sentence, STOP where the audio stops.
5. UNCERTAINTY. If a word is unclear, write [UNK]. Use [INAUDIBLE] for unintelligible speech.
6. BOUNDARY HANDLING. Audio is VAD-cut and may start or end mid-speech. Transcribe everything you can confidently hear. Do not guess what came before or after.
7. LANGUAGE MISMATCH. Trust what you hear. The expected language hint is just a hint. If audio is clearly in a different language, transcribe in that language's script and set detected_language accordingly.

# Code-Mixed Transcription

Audio may contain multiple languages. Each language stays in its native script. Do NOT transliterate.
- Indic words: write in their native script (Devanagari, Telugu, Tamil, etc.)
- English words spoken in an Indic sentence: keep in Latin script
- Hindi words in a Telugu sentence: keep in Devanagari
- Preserve Sandhi and combined forms as spoken. Do not over-split words.

# Punctuation

Insert punctuation from audible prosodic cues only. No pause heard = no punctuation.
- Only: comma, period, ? and !
- Do not add punctuation for grammatical correctness

# No Speech

If the audio contains no speech (only silence, noise, or music), set transcription to [NO_SPEECH].

# Field Rules

- "transcription": the PRIMARY authoritative field. Verbatim, code-mixed, native script.
- "tagged": identical to transcription, with event tags inserted at their audio positions. Do NOT re-interpret the audio for this field â€” copy transcription and insert tags.
- "speaker": emotion, speaking_style, pace â€” derived from audio prosody only. For "accent": only fill if you can identify a specific sub-regional dialect (e.g. "Hyderabadi", "Chennai", "Bhojpuri"). Do NOT put the language name. Default to empty string.
- "detected_language": ISO 639-1 code of the dominant language actually spoken.

# Event Tags

Insert ONLY if clearly and prominently audible. Do not guess.
- [laugh] â€” audible laughter
- [cough] â€” actual cough sound
- [sigh] â€” audible exhale/sigh
- [breath] â€” heavy or prominent breathing
- [singing] â€” speaker is singing, not speaking
- [noise] â€” environmental noise disrupting speech
- [music] â€” background music audible during speech or if humming
- [applause] â€” clapping from audience or speaker
- [snort] â€” nasal snort sound
- [cry] â€” audible crying or sobbing

# Reference Examples

## Example: Code-mixed (Telugu + English)
Input context: Telugu podcast, speaker casually mixing English
transcription: "à°¨à°¾à°•à± à°ˆ phone à°šà°¾à°²à°¾ à°¬à°¾à°—à±à°‚à°¦à°¿, like really good quality à°…à°¨à±à°¨à°®à°¾à°Ÿ"
tagged: "à°¨à°¾à°•à± à°ˆ phone à°šà°¾à°²à°¾ à°¬à°¾à°—à±à°‚à°¦à°¿, like really good quality à°…à°¨à±à°¨à°®à°¾à°Ÿ"
detected_language: "te"

## Example: Code-mixed (Hindi + English)
Input context: Hindi interview with English technical terms
transcription: "à¤¤à¥‹ basically à¤¹à¤®à¤¨à¥‡ machine learning model à¤•à¥‹ train à¤•à¤¿à¤¯à¤¾ à¤”à¤° results à¤•à¤¾à¤«à¤¼à¥€ à¤…à¤šà¥à¤›à¥‡ à¤†à¤"
tagged: "à¤¤à¥‹ basically à¤¹à¤®à¤¨à¥‡ machine learning model à¤•à¥‹ train à¤•à¤¿à¤¯à¤¾ à¤”à¤° results à¤•à¤¾à¤«à¤¼à¥€ à¤…à¤šà¥à¤›à¥‡ à¤†à¤"
detected_language: "hi"

## Example: No speech
Input context: Segment contains only background noise
transcription: "[NO_SPEECH]"
tagged: "[NO_SPEECH]"
detected_language: (same as expected hint)

## Example: Abrupt cutoff
Input context: Audio ends mid-word due to VAD boundary
transcription: "à°…à°ªà±à°ªà±à°¡à± à°µà°¾à°³à±à°³à± à°µà°šà±à°šà°¿ à°šà±†à°ªà±à°ªà°¾à°°à± à°•à°¦à°¾, à°† à°¤à°°à±à°µà°¾à°¤ à°®à°¨"
tagged: "à°…à°ªà±à°ªà±à°¡à± à°µà°¾à°³à±à°³à± à°µà°šà±à°šà°¿ à°šà±†à°ªà±à°ªà°¾à°°à± à°•à°¦à°¾, à°† à°¤à°°à±à°µà°¾à°¤ à°®à°¨"
Note: audio cuts mid-word at "à°®à°¨" â€” transcribe only what is heard, do not complete the word.

## Example: Event tags
Input context: Speaker laughs while talking
transcription: "à°…à°¦à°¿ à°šà°¾à°²à°¾ funny moment"
tagged: "à°…à°¦à°¿ à°šà°¾à°²à°¾ [laugh] funny moment"
detected_language: "te"

## Example: Language mismatch
Input context: Expected Hindi but speaker is actually speaking English
transcription: "so the main thing about this product is the packaging"
tagged: "so the main thing about this product is the packaging"
detected_language: "en"
z~TARGET LANGUAGE: {lang_name} ({lang_code})
Transcribe this audio segment. Return a valid JSON object with all required fields.zuTranscribe this audio segment following the system instructions. Return a valid JSON object with all required fields.Ú	lang_coder*   c                 C  sB   | t vrd} t |  \}}}d dd„ tD ƒ¡}tj|| |||dS )z-Build the V1 language-specific system prompt.Úenz, c                 s  s    | ]	}d |› dV  qdS )ú[ú]Nr   )r>   Útr   r   r   Ú	<genexpr>â   s   € z&build_system_prompt.<locals>.<genexpr>)Ú	lang_namerL   Úscript_nameÚscript_hintÚ
event_tags)r   Újoinr	   ÚSYSTEM_PROMPT_TEMPLATEÚformat)rL   rR   rS   rT   rU   r   r   r   Úbuild_system_promptÜ   s   ûrY   c                   C  ó   t S )zBReturn the V2 uniform cacheable system prompt (language-agnostic).)ÚCACHEABLE_SYSTEM_PROMPTr   r   r   r   Úget_cacheable_system_promptí   ó   r\   c                 C  s(   | t vrd} t |  \}}}tj|| dS )zFBuild per-request user prompt with language hint (for V2 cached mode).rM   )rR   rL   )r   ÚUSER_PROMPT_TEMPLATErX   )rL   rR   Ú_r   r   r   Úbuild_user_promptò   s   r`   c                   C  rZ   )z>Return the V1 user prompt (language already in system prompt).)ÚUSER_PROMPTr   r   r   r   Úget_user_promptú   r]   rb   )r3   r4   )N)r7   r4   r9   r:   r3   r4   )rL   r*   r3   r*   )r3   r*   )r2   Ú
__future__r   Úenumr   Útypingr   Úpydanticr   r   Úconfigr   r	   r
   r*   r   r   r   r"   r-   r8   r6   rW   r[   r^   ra   rY   r\   r`   rb   r   r   r   r   Ú<module>   s*    	

]


