o
    Iy±iE'  ã                   @  s~  d Z ddlmZ ddlZddlZddlZddlZddlmZm	Z	 ddl
mZ ddlmZmZ edƒZedƒZed	ƒZed
ƒZdZe dd¡ZdZdZdZdZesredƒZe ¡ rre ¡  ¡ D ]Ze d¡rqe  dd¡d  !¡ Z nq^G dd„ deƒZ"d<d=dd„Z#d>dd „Z$d!Z%d"Z&d#d$„ Z'd?d'd(„Z(d>d)d*„Z)d@d,d-„Z*dAd/d0„Z+dBd2d3„Z,dCd5d6„Z-d7d8„ Z.d@d9d:„Z/e0d;kr½e.ƒ  dS dS )Dz„
Transcribe Modi audio segments using Gemini 3.1 Pro.
Hindi-only, clean text output for VibeVoice fine-tuning, resumable, parallel.
é    )ÚannotationsN)ÚThreadPoolExecutorÚas_completed)ÚPath)Ú	BaseModelÚFieldz$/home/ubuntu/modi_processed/segmentsz'/home/ubuntu/modi_processed/transcriptsz4/home/ubuntu/modi_processed/transcribe_progress.jsonz)/home/ubuntu/modi_processed/dataset.jsonlzgemini-3.1-pro-previewÚ
GEMINI_KEYÚ é
   é   g      @é2   z/home/ubuntu/transcripts/.envzGEMINI_KEY=ú=é   c                   @  s    e Zd ZU eddZded< dS )ÚTranscriptionSchemazFVerbatim Devanagari transcription. English words stay in Latin script.)ÚdescriptionÚstrÚtranscriptionN)Ú__name__Ú
__module__Ú__qualname__r   r   Ú__annotations__© r   r   ú)/home/ubuntu/modi_processed/transcribe.pyr   +   s   
 r   ÚschemaÚdictÚdefsúdict | NoneÚreturnc                   s¬   ˆ d u r
|   di ¡‰ d| v r"| d  d¡d }ttˆ  |i ¡ƒˆ ƒS i }|  ¡ D ]+\}}|dkr1q(t|tƒr>t|ˆ ƒ||< q(t|tƒrO‡ fdd„|D ƒ||< q(|||< q(|S )Nz$defsz$refú/éÿÿÿÿc                   s$   g | ]}t |tƒrt|ˆ ƒn|‘qS r   )Ú
isinstancer   Ú_resolve_refs)Ú.0Úitem©r   r   r   Ú
<listcomp><   s   $ z!_resolve_refs.<locals>.<listcomp>)ÚpopÚsplitr!   r   ÚgetÚitemsr    Úlist)r   r   Úref_nameÚresultÚkÚvr   r$   r   r!   /   s   


r!   c                  C  s   t  ¡ } t| ƒS ©N)r   Úmodel_json_schemar!   )r   r   r   r   Úget_json_schemaB   s   r1   uc  You are a verbatim speech-to-text transcription system for Hindi. Output ONLY the JSON.

This is Narendra Modi's speech audio â€” expect formal Hindi with occasional English terms.

RULES:
1. NEVER TRANSLATE. Write what you HEAR. English words stay in Latin script, Hindi in Devanagari.
2. VERBATIM FIDELITY. Every repetition, filler, stammer, false start â€” exactly as spoken.
3. NO CORRECTION. Do not fix grammar, pronunciation, or word choice.
4. NO HALLUCINATION. Never add words not in the audio. If audio cuts off, STOP where it stops.
5. If a word is unclear, write [UNK]. If no speech at all, write [NO_SPEECH].
6. Audio is VAD-cut and may start/end mid-speech. Transcribe only what you hear.

SCRIPT: Devanagari. Preserve Sandhi and combined forms as spoken.
PUNCTUATION: Only comma, period, ? and ! â€” from audible pauses/intonation only.
NUMBERS: Write as digits (1, 2, 100) not words.

Do NOT add any audio event tags like [breath], [applause], [music] etc. Output clean text only.

EXAMPLES:
transcription: "à¤­à¤¾à¤‡à¤¯à¥‹à¤‚ à¤”à¤° à¤¬à¤¹à¤¨à¥‹à¤‚, à¤†à¤œ à¤¹à¤® à¤à¤• à¤¨à¤ India à¤•à¥€ à¤¬à¤¾à¤¤ à¤•à¤°à¤¤à¥‡ à¤¹à¥ˆà¤‚"
transcription: "à¤¤à¥‹ basically à¤¹à¤®à¤¨à¥‡ Digital India à¤•à¥‹ à¤†à¤—à¥‡ à¤¬à¤¢à¤¼à¤¾à¤¯à¤¾"
transcription: "à¤”à¤° à¤‡à¤¸à¤²à¤¿à¤ à¤®à¥ˆà¤‚ à¤•à¤¹à¤¤à¤¾ à¤¹à¥‚à¤ à¤•à¤¿ à¤¹à¤®à¤¾à¤°à¥‡ à¤¦à¥‡à¤¶" (cutoff â€” stop where audio stops)zSTranscribe this Hindi audio segment. Return JSON with the transcription field only.c                  C  s   ddl m}  | jtdS )Nr   ©Úgenai)Úapi_key)Úgoogler3   ÚClientr   r2   r   r   r   Úmake_cliente   s   r7   Úwav_pathr   c                 C  sr   ddl m} | ¡ }|j|j|ddd}|jd|jdddtƒ td	}| j	j
t|tg|d
}|j ¡ }t |¡S )Nr   )Útypesz	audio/wav)ÚdataÚ	mime_type)Úinline_datai   )Úthinking_budgetzapplication/json)ÚtemperatureÚthinking_configÚresponse_mime_typeÚresponse_schemaÚsystem_instruction)ÚmodelÚcontentsÚconfig)Úgoogle.genair9   Ú
read_bytesÚPartÚBlobÚGenerateContentConfigÚThinkingConfigr1   ÚSYSTEM_PROMPTÚmodelsÚgenerate_contentÚGEMINI_MODELÚUSER_PROMPTÚtextÚstripÚjsonÚloads)Úclientr8   r9   Úaudio_bytesÚ
audio_partrE   ÚresponserQ   r   r   r   Útranscribe_segmentj   s"   
ûý

rY   c                   C  s   t  ¡ rt t  ¡ ¡S i S r/   )ÚPROGRESS_FILEÚexistsrS   rT   Ú	read_textr   r   r   r   Úload_progress„   s   r]   Úprogressc                 C  s   t  tj| dd¡ d S )Né   )Úindent)rZ   Ú
write_textrS   Údumps)r^   r   r   r   Úsave_progressŠ   s   rc   ú
list[Path]c                   C  s   t t d¡ƒS )Nz*.wav)ÚsortedÚSEGMENTS_DIRÚrglobr   r   r   r   Úcollect_segments   s   rh   r   c                 C  s   | j j› d| j› S )Nr   )ÚparentÚname)r8   r   r   r   Úsegment_key”   s   rk   ú#tuple[str, dict | None, str | None]c                 C  s  t |ƒ}ttƒD ]{}zt| |ƒ}||d fW   S  tjy4 } z|d d|› fW  Y d }~  S d }~w tyƒ } zDt|ƒ}d|v sGd|v rXtd|  }t	 
|¡ W Y d }~qd|v s`d|v rot	 
t|d  ¡ W Y d }~q|d d|› fW  Y d }~  S d }~ww |d d	fS )
NzJSON parse error: Ú429ÚRESOURCE_EXHAUSTEDr_   Ú500Ú503r   zError: z#Max retries exceeded (rate limited))rk   ÚrangeÚ	RETRY_MAXrY   rS   ÚJSONDecodeErrorÚ	Exceptionr   ÚRETRY_BACKOFFÚtimeÚsleep)rU   r8   ÚkeyÚattemptr,   ÚeÚerr_strÚwaitr   r   r   Úprocess_one˜   s*   
 €
 €÷

r}   c                    s²  t stdƒ t d¡ tjddd tƒ ‰tƒ } tƒ }dd„ |  	¡ D ƒ‰‡fdd„|D ƒ}t
|ƒ}t
ˆƒ}td	|› ƒ td
|› ƒ tdt
|ƒ› ƒ tdt› ƒ tdt› ƒ tƒ  |sitdƒ t| ƒ d S d}d}t ¡ }‡fdd„‰ ttd´‰‡ ‡fdd„|D ƒ}t|ƒD ]}	|	 ¡ \}
}}|d7 }|rÃd|dœ| |
< tt|
ƒj }|jddd t|
 dd¡ }| tj|ddd¡ n|d7 }d|dœ| |
< |t dkrØt| ƒ || }t ¡ | }|dkrê|| nd}|dkrøt
|ƒ| | nd}|d }|rdnd |d d… › }td!|› d"|› d#|
› d$|› d%|d&›d'|d(›d)|› d*ƒ q‹W d   ƒ n	1 s4w   Y  t| ƒ t ¡ | }td+|› d,|d d&›d-|› ƒ t| ƒ d S ).NzNERROR: No GEMINI_KEY found. Set env var or check /home/ubuntu/transcripts/.envr   T)ÚparentsÚexist_okc                 S  s"   h | ]\}}|  d ¡dkr|’qS )ÚstatusÚdone)r(   )r"   r-   r.   r   r   r   Ú	<setcomp>¸   s   " zmain.<locals>.<setcomp>c                   s   g | ]
}t |ƒˆ vr|‘qS r   )rk   )r"   Ús)Ú	done_keysr   r   r%   ¹   s    zmain.<locals>.<listcomp>zTotal segments: zAlready transcribed: zRemaining: z	Workers: zModel: z!All segments already transcribed!r   c                   s
   t ˆ | ƒS r/   )r}   )Úseg_path)rU   r   r   Ú_do_oneÍ   s   
zmain.<locals>._do_one)Úmax_workersc                   s   i | ]	}ˆ  ˆ |¡|“qS r   )Úsubmit)r"   Úseg)r†   Úpoolr   r   Ú
<dictcomp>Ñ   s    zmain.<locals>.<dictcomp>r   )r€   r,   z.wavz.jsonFr_   )Úensure_asciir`   Úerror)r€   r   é<   ÚOKzERR: ú[r   z] u    â€” z (z.1fz/s, ETA z.0fzm, errs=ú)z
Done. z processed in zm. Errors: )r   ÚprintÚsysÚexitÚ
OUTPUT_DIRÚmkdirr7   r]   rh   r)   ÚlenÚWORKERSrO   Úbuild_datasetrv   r   r   r,   r   ri   Úreplacera   rS   rb   Ú
SAVE_EVERYrc   )r^   Úall_segmentsÚpendingÚtotalÚalready_doneÚcompleted_countÚerrors_countÚt0ÚfuturesÚfuturerx   r,   r   Ú
parent_dirÚout_fileÚ
done_totalÚelapsedÚrateÚeta_sÚeta_mr€   Úelapsed_totalr   )r†   rU   r„   rŠ   r   Úmain­   sz   
ÿÿÿÿçý! r­   c           	      C  sä   t dt› ƒ d}ttdddN}t|  ¡ ƒD ]?}| | }| d¡dkr%q|d }| d	d
¡ ¡ }|r7|dkr8qtt| ƒ}t	j
d|› |dœdd}| |d ¡ |d7 }qW d  ƒ n1 saw   Y  t d|› dt› ƒ dS )z8Build the final JSONL dataset for VibeVoice fine-tuning.u   
Building dataset JSONL â†’ r   Úwzutf-8)Úencodingr€   r   r,   r   r	   z[NO_SPEECH]zSpeaker 0: )rQ   ÚaudioF)rŒ   Ú
r   Nz	Dataset: z entries written to )r’   ÚDATASET_FILEÚopenre   Úkeysr(   rR   r   rf   rS   rb   Úwrite)	r^   ÚcountÚfrx   Úentryr,   rQ   Ú
audio_pathÚliner   r   r   r™   ø   s*   þ
ñÿr™   Ú__main__r/   )r   r   r   r   r   r   )r   r   )r8   r   r   r   )r^   r   )r   rd   )r8   r   r   r   )r8   r   r   rl   )1Ú__doc__Ú
__future__r   rS   Úosr“   rv   Úconcurrent.futuresr   r   Úpathlibr   Úpydanticr   r   rf   r•   rZ   r²   rO   Úgetenvr   r˜   rr   ru   r›   Úenv_filer[   r\   Ú
splitlinesrº   Ú
startswithr'   rR   r   r!   r1   rL   rP   r7   rY   r]   rc   rh   rk   r}   r­   r™   r   r   r   r   r   Ú<module>   sX   
þ







K
ÿ