o
    Oʦi%                     @  s   d Z ddlmZ ddlZddlZddlmZmZ ddlm	Z	 ddl
Z
ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ eeZeG dd dZG dd dZdS )u   
Per-video validation pipeline: orchestrates all models over a video's segments.
Downloads tar → loads audio → runs LID → runs CTC → returns rich segment data.
Maximum GPU utilization via batched inference and parallel model execution.
    )annotationsN)	dataclassfield)Optional   )ValidationConfigTARGET_LANGUAGESCONFORMER_LANG_CODES)SegmentData)MMSLID)VoxLinguaLID)IndicConformerMulti)
EnglishCTCc                   @  sJ  e Zd ZU dZded< ded< ded< dZded< dZded	< dZded
< dZded< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded < dZded!< d"Zd#ed$< d%Zd&ed'< dZded(< dS ))SegmentResultz'Full validation result for one segment.strvideo_idsegment_filefloat
duration_s gemini_langgemini_transcriptiongemini_tagged        gemini_quality_scorespeaker_infomms_lang_iso3mms_lang_iso1mms_confidencemms_top3vox_langvox_lang_iso1vox_confidencevox_top3    bytesvox_speaker_embeddingconformer_multi_transcriptionNzOptional[float]conformer_multi_ctc_rawconformer_multi_ctc_normalizedwav2vec_transcriptionwav2vec_ctc_rawwav2vec_ctc_normalizedwav2vec_model_usedFboollid_consensusr   intlid_agree_countconsensus_lang)__name__
__module____qualname____doc____annotations__r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r&   r'   r(   r)   r*   r+   r,   r-   r/   r1   r2    r8   r8   0/home/ubuntu/transcripts/validations/pipeline.pyr      s:   
 r   c                   @  s<   e Zd ZdZdddZdd Zdd	 ZdddZdd ZdS )ValidationPipelinez:Orchestrates model inference over all segments of a video.configr   c                 C  s(   || _ d | _d | _d | _d | _d| _d S NF)r;   mmsvox	conformerwav2vec_loaded)selfr;   r8   r8   r9   __init__H   s   
zValidationPipeline.__init__c                 C  s   t j rdnd}| jj}t }| jjr!t|d| _| j	| | jj
r1t|d| _| j	| | jjrAt|d| _| j	| | jjrQt|d| _| j	| d| _t | }|   td|dd dS )	zLoad all enabled models to GPU.cudacpu)deviceTzAll models loaded in .1fsN)torchrD   is_availabler;   hf_tokentimeenable_mms_lidr   r=   loadenable_voxlinguar   r>   enable_conformer_multir   r?   enable_wav2vec_langr   r@   rA   _log_gpu_usageloggerinfo)rB   rF   hft0elapsedr8   r8   r9   load_modelsP   s&   zValidationPipeline.load_modelsc                 C  s0   | j | j| j| jfD ]}|r|  q
d| _d S r<   )r=   r>   r?   r@   unloadrA   )rB   modelr8   r8   r9   unload_modelsk   s
   
z ValidationPipeline.unload_modelsr   r   segmentslist[SegmentData]returnlist[SegmentResult]c           .        s  |sg S t   }t|}td| d| d dd |D i g| }i g| }| jrJt   }| j}td| d| dt   | dd	 | jrkt   }| j}td| d
| dt   | dd	 i g| }	i g| }
dd |D  dd |D dd  D }dd t|D }| jr|rt   }fdd|D } fdd|D }fdd|D }| j|||}t|D ]
\}}|| |	|< qtd| dt| dt   | dd	 | j	rN| j	j
fdd D }dd t|D }|rNt   }fdd|D } fdd|D }fdd|D }| j	|||}t|D ]\}}|| |
|< q,td| dt| dt   | dd	 ddl}g }t|D ]\}}|| }|| } |	| }!|
| }"|dd}#| dd}$|j}%|%|#|$g}&dd |&D }&|&rdd lm}' |'|&}(|(d!d \})}*|*d"k}+nd#\})}*}+|tdEi d$|d%|jd&t|jd'd(|%d)|jd*|jd+|jd,|jr||jndd-|d-dd|#d.|d.d/d0||d0g d1| d1dd|$d2| d2d/d3|| d3g d4| d4d5d6|!d6dd7|!d7d8|!d8d9|"d9dd:|"d:d;|"d;d<|"d<dd=|+d>|*d?|) qXt   | },|,dkr]||, nd}-td| d@| dA|,ddB|-dCdD	 |S )Fz
        Run all validation models over a video's segments.
        Returns one SegmentResult per segment with all metrics populated.
        [z] Processing z	 segmentsc                 S     g | ]}|j qS r8   )waveform.0rH   r8   r8   r9   
<listcomp>       z4ValidationPipeline.process_video.<locals>.<listcomp>z] MMS LID: z	 segs in rG   rH   z] VoxLingua: c                 S  ra   r8   )r   rc   r8   r8   r9   re      rf   c                 S  ra   r8   )r   rc   r8   r8   r9   re      rf   c                 S  s   g | ]}|t v qS r8   )r	   rd   langr8   r8   r9   re          c                 S     g | ]\}}|r|qS r8   r8   rd   imr8   r8   r9   re          c                      g | ]} | qS r8   r8   rd   rl   	waveformsr8   r9   re      ri   c                   ro   r8   r8   rp   
lang_codesr8   r9   re      ri   c                   ro   r8   r8   rp   	ref_textsr8   r9   re      ri   z] Conformer CTC: z Indic segs in c                   s   g | ]}| v qS r8   r8   rg   )wav2vec_langsr8   r9   re      ri   c                 S  rj   r8   r8   rk   r8   r8   r9   re      rn   c                   ro   r8   r8   rp   rq   r8   r9   re      ri   c                   ro   r8   r8   rp   rs   r8   r9   re      ri   c                   ro   r8   r8   rp   ru   r8   r9   re      ri   z] Wav2Vec CTC: r   Nr   r   r!   c                 S  s   g | ]}|r|qS r8   r8   )rd   lr8   r8   r9   re      ri   )Counterr      )r   r   Fr   r   r      r   r   r   r   r   r   r   r   r   r    r"   r#   r&   r$   r'   r(   r)   r*   r+   r,   r-   r/   r1   r2   z] Done: z segments in zs (z.0fz segs/s)r8   )rL   lenrS   rT   r=   predict_batchr>   	enumerater?   r@   available_languagesjsongetr   collectionsry   most_commonappendr   r   roundr   r   r   r   r   dumps).rB   r   r\   rV   nmms_resultsvox_resultst_lidt_voxconformer_resultswav2vec_results
indic_maskindic_indicest_conf
indic_wavsindic_langs
indic_refs
conf_batchjidxwav2vec_maskwav2vec_indicest_w2vw2v_wavs	w2v_langsw2v_refs	w2v_batchr   resultsrl   segmrvrcrwrmms_iso1vox_iso1r   langsry   countsr2   agree_countr/   rW   segs_per_secr8   )rt   rv   rw   rr   r9   process_videoq   s  

((





	





z ValidationPipeline.process_videoc                 C  sJ   t j r#t j d }t j d }td|dd|dd d S d S )Ng    eAz
GPU VRAM: rG   zGB allocated, zGB reserved)rI   rD   rJ   memory_allocatedmemory_reservedrS   rT   )rB   	allocatedreservedr8   r8   r9   rR      s
   
 z!ValidationPipeline._log_gpu_usageN)r;   r   )r   r   r\   r]   r^   r_   )	r3   r4   r5   r6   rC   rX   r[   r   rR   r8   r8   r8   r9   r:   E   s    

 r:   )r6   
__future__r   loggingrL   dataclassesr   r   typingr   rI   r;   r   r   r	   audio_loaderr
   models.mms_lidr   models.voxlinguar   models.conformer_multir   models.wav2vec_langr   	getLoggerr3   rS   r   r:   r8   r8   r8   r9   <module>   s"    
+