o
    lQi-                     @  sh  U d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZ G d	d
 d
eeZG dd deZG dd deZedejZedZddddddddddd
Zded< dddd d!d"d#d$d%d&d'd(ZdRdSd.d/ZdTd0d1ZdUd4d5ZdUd6d7ZdVd:d;ZdWd<d=ZdXdBdCZ dYdEdFZ!dZdGdHZ"d[dIdJZ#d\dMdNZ$dOZ%d]dPdQZ&dS )^a  
Prompt + schema helpers for transcript script normalization.

This path is separate from the audio transcription prompt because the task here
starts from existing transcript text, not audio. We pre-classify fully Roman
items locally and only send native/mixed-script items to Gemini.
    )annotationsN)Enum)Iterable)	BaseModelField   LANGUAGE_MAPc                   @  s   e Zd ZdZdZdZdZdS )InputScriptProfilefully_romanfully_nativemixed_native_latinotherN)__name__
__module____qualname__r   r   r   r    r   r    src/transcript_variant_prompt.pyr
      s
    r
   c                   @  sD   e Zd ZU eddZded< eddZded< eddZded< d	S )
TranscriptVariantResultz(Caller-provided stable input identifier.)descriptionstridzSame utterance rendered in the target native script. Preserve protected spans exactly: numerics, emails, URLs, handles, hashtags, and file-like tokens.native_script_textzhSame utterance rendered in ASCII Romanization for the target language. Preserve protected spans exactly.romanized_textN)r   r   r   r   r   __annotations__r   r   r   r   r   r   r      s   
 r   c                   @  s   e Zd ZU ded< dS )TranscriptVariantBatchResultzlist[TranscriptVariantResult]resultsN)r   r   r   r   r   r   r   r   r   ,   s   
 r   zZ(?:https?://\S+|www\.\S+|[\w.+-]+@[\w-]+(?:\.[\w-]+)+|[@#][\w._-]+|\b\d+(?:[.,:/-]\d+)*\b)z0^[A-Za-z0-9\s.,!?;:'\"(){}\[\]/\\@#&%+*=_<>|-]*$))A   iO  ))i 	  i	  ))i	  i	  ))i 
  i
  ))i
  i
  ))i   i  ))i  i  ))i   i  ))i  i  ))i   i  )
Latin
DevanagariBengaliGurmukhiGujaratiOdiaTamilTeluguKannada	Malayalamz&dict[str, tuple[tuple[int, int], ...]]SCRIPT_RANGESr   r    r!   r"   r#   r$   r%   r&   r'   r   )r   Assameser    r!   r"   r#   r$   r%   r&   r'   r   schemadictdefsdict | Nonereturnc                   s    d u r
|  di  d| v r$| d dd } |i }tt| S i }|  D ]+\}}|dkr3q*t|tr@t| ||< q*t|trQ fdd|D ||< q*|||< q*|S )Nz$defsz$ref/c                   s$   g | ]}t |trt| n|qS r   )
isinstancer+   _resolve_refs).0itemr,   r   r   
<listcomp>c   s    z!_resolve_refs.<locals>.<listcomp>)popsplitgetr2   r+   itemsr1   list)r*   r,   ref_nameresolvedresultkeyvaluer   r5   r   r2   S   s$   



r2   c                  C  s4   t  } t| }d|d< |d d d }d|d< |S )NFadditionalProperties
propertiesr   r:   )r   model_json_schemar2   )r*   r=   item_schemar   r   r   "get_transcript_variant_json_schemal   s   rE   language_coder   c                 C  s   | t vrd} t |  \}}}|S )Nenr   )rF   _script_namer   r   r   get_target_script_nameu   s   rJ   c                 C  s   t | }t|dS )Nr   )rJ   SCRIPT_BLOCK_BY_NAMEr9   )rF   rI   r   r   r   get_target_script_block|   s   rL   text	list[str]c                 C  s
   t | S N)PROTECTED_SPAN_REfindallrM   r   r   r   extract_protected_spans   s   
rS   c                 C  s   t d| S )N )rP   subrR   r   r   r   _strip_protected_spans   s   rV   charrangesIterable[tuple[int, int]]boolc                   s   t |  t fdd|D S )Nc                 3  s,    | ]\}}|   ko|kn  V  qd S rO   r   )r3   startend	codepointr   r   	<genexpr>   s   * z"_char_in_ranges.<locals>.<genexpr>)ordany)rW   rX   r   r]   r   _char_in_ranges   s   rb   dict[str, int]c                 C  s~   t | }dd tD }|D ]/}t|}|d d dvrqd}t D ]\}}t||r8||  d7  < d} nq#|r<qq|S )Nc                 S  s   i | ]}|d qS )r   r   )r3   namer   r   r   
<dictcomp>   s    z(detect_script_counts.<locals>.<dictcomp>r   >   LMFT)rV   r(   unicodedatacategoryr:   rb   )rM   scrubbedcountsrW   ri   matchedrI   rX   r   r   r   detect_script_counts   s"   

rm   c                   s   |dkrt jS t| }t| | d}|dd}t fdd| D }|dkr7|dkr7|dkr7t jS |dkrF|dkrF|dkrFt jS |dkrU|dkrU|dkrUt jS t j	S )NrG   r   r   c                 3  s$    | ]\}}| d hvr|V  qdS )r   Nr   )r3   rI   countnative_blockr   r   r_      s    z(classify_input_script.<locals>.<genexpr>)
r
   r   rm   rL   r9   sumr:   r   r   r   )rM   rF   rk   native_countlatin_countother_countr   ro   r   classify_input_script   s    ru   c                 C  s   t | }t|d uS rO   )rV   ASCII_ROMAN_RE	fullmatch)rM   rj   r   r   r   romanized_text_is_ascii   s   rx   r:   
list[dict]c                 C  sN   g }| D ]}| |d |d |d |d d qtj|ddd}d	| d
S )Nr   rF   input_script_profilerM   )r   rF   rz   rM   F   )ensure_asciiindentzgProduce both variants for every input item. Return one JSON object matching the schema exactly.
INPUT:

)appendjsondumps)r:   normalized_itemsr4   payloadr   r   r   $build_transcript_variant_user_prompt   s   	r   u  ROLE
Convert transcript text into two deterministic script variants for the same spoken utterance.
Output only the JSON object required by the schema.

INPUT
Each item has: id, language_code, input_script_profile, text.
The caller already skipped fully_roman items, so every item here must be processed.

TASK
For each item return:
1. native_script_text
2. romanized_text

GLOBAL RULES
- Same utterance only. No translation, paraphrase, cleanup, grammar fixing, expansion, or added context.
- Preserve word order, clause order, repetitions, fillers, and punctuation as closely as possible.
- Never infer omitted words before or after the visible text.
- Return one output item per input item in the same order.
- Copy id exactly from the input.
- No explanations, notes, confidence, or extra keys.

NATIVE_SCRIPT_TEXT
- Render the utterance in the target language's native script.
- If input_script_profile is fully_native, native_script_text may be identical to the input.
- If input_script_profile is mixed_native_latin, convert ordinary spoken Latin-script words into the target native script.
- Do not change protected spans.
- Common spoken borrowings like Google, WhatsApp, click, video, website, support, form, upload, download, meeting may be written in native script when they are ordinary spoken words, not literals.

ROMANIZED_TEXT
- Render the same utterance in plain ASCII Roman letters.
- No diacritics, IPA, or scholarly transliteration.
- Use everyday user-style Romanization.
- Keep Romanization internally consistent within an item.
- Well-known brand names may remain in their natural Roman form when that is the most faithful rendering.

PROTECTED SPANS
Preserve these byte-for-byte unchanged in both outputs:
- email addresses
- URLs and web domains
- @handles and #hashtags
- numbers, dates, times, amounts, version-like numbers
- file names and literal identifier-like tokens

If unsure whether a token is protected, preserve it exactly.

PUNCTUATION
- Keep punctuation close to the input.
- Do not invent new clauses.
- Do not add surrounding quotes.

EXAMPLES

Example 1
input:
या Google पर जाकर whatsapp@support.com पर click करके भी अपने WhatsApp का access ले सकते हैं
native_script_text:
या गूगल पर जाकर whatsapp@support.com पर क्लिक करके भी अपने वॉट्सऐप का एक्सेस ले सकते हैं
romanized_text:
Ya Google par jaakar whatsapp@support.com par click karke bhi apne WhatsApp ka access le sakte hain

Example 2
input:
अनुराग ठाकुर जी तमिलनाडु में आकर आठ नौ जगहों पर प्रचार किए हैं
native_script_text:
अनुराग ठाकुर जी तमिलनाडु में आकर आठ नौ जगहों पर प्रचार किए हैं
romanized_text:
Anurag Thakur ji Tamilnadu mein aakar aath nau jagahon par prachaar kiye hain

Example 3
input:
আমাৰ যিটো website www.brainexcellencelabs.com তাত আপুনি search কৰিলে agenda খিনি জানিব পাৰিব
native_script_text:
আমাৰ যিটো ৱেবছাইট www.brainexcellencelabs.com তাত আপুনি চাৰ্চ কৰিলে এজেণ্ডা খিনি জানিব পাৰিব
romanized_text:
Amar jito website www.brainexcellencelabs.com tat apuni search korile agenda khini janib parib

Example 4
input:
ഈ മുഖം അങ്ങനെയല്ല എന്ന് വെച്ചാൽ they can change it. ഒന്നും കഷ്ടമേ കിട്ടത്തില്ല.
native_script_text:
ഈ മുഖം അങ്ങനെയല്ല എന്ന് വെച്ചാൽ ദേ കാൻ ചേഞ്ച് ഇറ്റ്. ഒന്നും കഷ്ടമേ കിട്ടത്തില്ല.
romanized_text:
Ee mukham anganeyalla ennu vechaal they can change it. Onnum kashtame kittathilla.

Example 5
input:
అందువల్ల డిపార్ట్మెంట్ టు డిపార్ట్మెంట్ యాక్షన్ తీసుకోవడం వల్ల
native_script_text:
అందువల్ల డిపార్ట్మెంట్ టు డిపార్ట్మెంట్ యాక్షన్ తీసుకోవడం వల్ల
romanized_text:
Anduvalla department tu department action teesukovadam valla

Example 6
input:
આ meeting 3 વાગ્યે start થશે
native_script_text:
આ મીટિંગ 3 વાગ્યે સ્ટાર્ટ થશે
romanized_text:
A meeting 3 vaagye start thashe

FINAL
Return a single JSON object matching the schema exactly. Never translate. Preserve protected spans exactly.
c                   C  s   t S rO   )#CACHEABLE_TRANSCRIPT_VARIANT_PROMPTr   r   r   r   'get_cacheable_transcript_variant_prompt;  s   r   rO   )r*   r+   r,   r-   r.   r+   )r.   r+   )rF   r   r.   r   )rM   r   r.   rN   )rM   r   r.   r   )rW   r   rX   rY   r.   rZ   )rM   r   r.   rc   )rM   r   rF   r   r.   r
   )rM   r   r.   rZ   )r:   ry   r.   r   )r.   r   )'__doc__
__future__r   r   rerh   enumr   typingr   pydanticr   r   configr	   r   r
   r   r   compile
IGNORECASErP   rv   r(   r   rK   r2   rE   rJ   rL   rS   rV   rb   rm   ru   rx   r   r   r   r   r   r   r   <module>   sj    


	







i