o
    ±§°iT  ã                   @  s&  U d Z ddlmZ ddlZddlZddlmZmZ ddlm	Z	 ddl
mZ ee	eƒ ¡ jjd ƒ dZd	Zd
ZddddddddœZded< dZddddddddddd d!d"œZd#ed$< ee ¡ ƒZd%d&d'd(d)d*d+d,d-d.d/d0d"œZded1< d2d3„ e ¡ D ƒZded4< i d5d6“d7d8“d9d:“d;d<“d=d>“d?d@“dAdB“dCdD“dEdF“dGdH“dIdJ“dKdL“dMdN“dOdP“dQdR“dSdT“ZdedU< h dV£ZdWZdXZdYZ e!e "dZd[¡ƒZ#e!e "d\d]¡ƒZ$e!e "d^d[¡ƒZ%e!e "d_d[¡ƒZ&e!e "d`da¡ƒZ'e!e "dbdc¡ƒZ(ddZ)e!e "dedf¡ƒZ*e!e "dgdh¡ƒZ+drdsdndo„Z,eG dpdq„ dqƒƒZ-dS )tz’
Validation pipeline config: model IDs, language maps, scoring thresholds, env vars.
All tunables live here so Docker ENV overrides work cleanly.
é    )ÚannotationsN)Ú	dataclassÚfield)ÚPath)Úload_dotenvz.envzfacebook/mms-lid-256z&speechbrain/lang-id-voxlingua107-ecapaz+ai4bharat/indic-conformer-600m-multilingualzai4bharat/indicwav2vec-hindiz!ai4bharat/indicwav2vec_v1_bengaliz"ai4bharat/indicwav2vec_v1_gujaratizai4bharat/indicwav2vec_v1_tamilz ai4bharat/indicwav2vec_v1_teluguz!ai4bharat/indicwav2vec_v1_marathizai4bharat/indicwav2vec_v1_odia)ÚhiÚbnÚguÚtaÚteÚmrÚorzdict[str, str]ÚINDICWAV2VEC_MODELSz9ai4bharat/indicconformer_stt_{lang}_hybrid_ctc_rnnt_large)ÚHindiÚ
Devanagari)ÚMarathir   )ÚTelugur   )ÚTamilr   )ÚKannadar   )Ú	Malayalamr   )ÚGujaratir   )ÚPunjabiÚGurmukhi)ÚBengalir   )ÚAssameser   )ÚOdiar   )ÚEnglishÚLatin)r   r   r   r
   ÚknÚmlr	   Úpar   Úasr   Úenzdict[str, tuple[str, str]]ÚLANGUAGE_MAPÚhinÚmarÚtelÚtamÚkanÚmalÚgujÚpanÚbenÚasmÚoryÚengÚLANG_TO_ISO3c                 C  s   i | ]\}}||“qS © r1   )Ú.0ÚkÚvr1   r1   ú./home/ubuntu/transcripts/validations/config.pyÚ
<dictcomp>;   s    r6   ÚISO3_TO_LANGz	hi: Hindir   zmr: Marathir   z
te: Telugur   z	ta: Tamilr
   zkn: Kannadar   zml: Malayalamr   zgu: Gujaratir	   zpa: Punjabir    zbn: Bengalir   zas: Assameser!   zor: Odiar   zen: Englishr"   zur: UrduÚurz
sd: SindhiÚsdz
ne: NepaliÚnezsa: SanskritÚsaÚVOXLINGUA_LABEL_MAP>   r!   r   r	   r   r   Úksr   r   r:   r   r    r;   r9   r
   r   r8   ÚbrxÚdoiÚkokÚmaiÚmniÚsati€>  g      >@g      à?ÚMMS_BATCH_SIZEÚ8ÚVOX_BATCH_SIZEÚ16ÚCONFORMER_BATCH_SIZEÚWAV2VEC_BATCH_SIZEÚPREFETCH_QUEUE_SIZEÚ3ÚPARQUET_SHARD_SIZEÚ50é<   Ú
MAX_VIDEOSÚ0ÚLRU_MODEL_CACHE_SIZEÚ5Ú ÚkeyÚstrÚdefaultÚreturnc                 C  s   t  | |¡S )N)ÚosÚgetenv)rT   rV   r1   r1   r5   Ú_envc   s   rZ   c                   @  s€  e Zd ZU dZdZded< dZded< dZded< dZded	< dZ	ded
< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZd ed!< d"Zded#< d$Zded%< dZded&< dZded'< d(Zded)< dZded*< d+Zd ed,< d-Zded.< d-Zded/< d-Z ded0< d-Z!ded1< d2d3„ Z"d9d6d7„Z#d8S ):ÚValidationConfigz(Resolved config for validation pipeline.rS   rU   Úr2_endpoint_urlÚtranscribedÚr2_bucket_sourcezvalidation-resultsÚr2_model_bucketÚr2_bucket_outputÚr2_reference_bucketÚr2_access_key_idÚr2_secret_access_keyFÚboolÚr2_skip_uploadÚdatabase_urlÚsupabase_urlÚsupabase_admin_keyÚvalidation_recover_queueÚrecover_queue_tableÚdatabaseÚrecover_reference_modez4reference-data/transcription_results_recover.parquetÚrecover_tx_parquet_keyz2reference-data/transcription_flags_recover.parquetÚrecover_flags_parquet_keyz,reference-data/validated_segment_ids.parquetÚrecover_validated_parquet_keyz.reference-data/recover_reference_manifest.jsonÚrecover_reference_manifest_keyé   ÚintÚ&recover_reference_download_concurrencyzrecover-replay-ledgersÚrecover_replay_ledger_prefixÚshardsÚr2_shard_prefixÚhf_tokenÚ	worker_idÚunknownÚgpu_typeÚ	mock_moder   Ú
max_videosTÚenable_mms_lidÚenable_voxlinguaÚenable_conformer_multiÚenable_wav2vec_langc                 C  s°  | j ptdƒ| _ td| jƒ| _td| jƒ| _td| jƒ| _td| jƒ| _| jp*tdƒ| _| jp2tdƒ| _tdd	ƒ ¡ d
k| _	| j
pDtdƒ| _
| jpLtdƒ| _| jpTtdƒ| _td| jƒ| _td| jƒ ¡ | _td| jƒ| _td| jƒ| _td| jƒ| _td| jƒ| _ttdt| jƒƒƒ| _td| jƒ| _td| jƒ d¡| _| jp¤tdƒ| _| jpµtdtt ¡ ƒd d… ƒ| _td| jƒ| _| jsËtdd	ƒ ¡ d
k| _ttdt| jƒƒƒ| _d S )NÚR2_ENDPOINT_URLÚR2_VALIDATION_SOURCEÚR2_VALIDATION_MODEL_BUCKETÚR2_VALIDATION_OUTPUTÚR2_VALIDATION_REFERENCE_BUCKETÚR2_ACCESS_KEY_IDÚR2_SECRET_ACCESS_KEYÚR2_SKIP_UPLOADÚfalseÚtrueÚDATABASE_URLÚURLÚSUPABASE_ADMINÚVALIDATION_RECOVER_QUEUE_TABLEÚRECOVER_REFERENCE_MODEÚRECOVER_TX_PARQUET_KEYÚRECOVER_FLAGS_PARQUET_KEYÚRECOVER_VALIDATED_PARQUET_KEYÚRECOVER_REFERENCE_MANIFEST_KEYÚ&RECOVER_REFERENCE_DOWNLOAD_CONCURRENCYÚRECOVER_REPLAY_LEDGER_PREFIXÚR2_SHARD_PREFIXú/ÚHF_TOKENÚ	WORKER_IDé   ÚGPU_TYPEÚ	MOCK_MODErO   ) r\   rZ   r^   r_   r`   ra   rb   rc   Úlowerre   rf   rg   rh   rj   rl   rm   rn   ro   rp   rr   rU   rs   rt   rv   Ústriprw   rx   ÚuuidÚuuid4rz   r{   r|   )Úselfr1   r1   r5   Ú__post_init__‘   sJ   þþÿþ"zValidationConfig.__post_init__rW   ú	list[str]c                 C  s2   g }| j s| js| d¡ | jdvr| d¡ |S )NzR2_ENDPOINT_URL required>   Úparquetrk   z6RECOVER_REFERENCE_MODE must be 'database' or 'parquet')r{   r\   Úappendrl   )r¡   Úerrorsr1   r1   r5   Úvalidate¸   s   


zValidationConfig.validateN)rW   r£   )$Ú__name__Ú
__module__Ú__qualname__Ú__doc__r\   Ú__annotations__r^   r_   r`   ra   rb   rc   re   rf   rg   rh   rj   rl   rm   rn   ro   rp   rs   rt   rv   rw   rx   rz   r{   r|   r}   r~   r   r€   r¢   r§   r1   r1   r1   r5   r[   g   sB   
 'r[   )rS   )rT   rU   rV   rU   rW   rU   ).r«   Ú
__future__r   rX   rŸ   Údataclassesr   r   Úpathlibr   Údotenvr   Ú__file__ÚresolveÚparentÚMMS_LID_MODELÚVOXLINGUA_MODELÚCONFORMER_MULTI_MODELr   r¬   ÚCONFORMER_LANG_PATTERNr#   ÚsetÚkeysÚTARGET_LANGUAGESr0   Úitemsr7   r<   ÚCONFORMER_LANG_CODESÚAUDIO_SAMPLE_RATEÚMAX_AUDIO_DURATION_SÚMIN_AUDIO_DURATION_Srr   rY   rD   ÚVOXLINGUA_BATCH_SIZErH   rI   rJ   rL   ÚHEARTBEAT_INTERVAL_SrO   rQ   rZ   r[   r1   r1   r1   r5   Ú<module>   s°    ùôýÿÿÿþþþýýýüüüûûûú
