o
    i                     @  s   d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ ddlZddlZddlmZ d	d
lmZ eeZedZd0ddZd1ddZe	G dd dZ		d2d3d#d$Zd4d'd(Zd5d,d-Z d5d.d/Z!dS )6a  
Recovery loader: rebuild validation-ready segments from raw `1-cleaned-data`
audio and historical transcription rows.

This is the core primitive for the recover path:
  - replay `audio_polish` from raw parent FLACs
  - regenerate deterministic child segment IDs
  - intersect them with historical `transcription_results`
  - return SegmentData objects ready for the validation models
    )annotationsN)defaultdict)	dataclassfield)Path)Optional)polish_all_segments   )SegmentDataz
_split\d+$
segment_idstrreturnc                 C  s   t d| S )z<Map a child segment ID back to its raw parent FLAC filename. )_SPLIT_SUFFIX_REsub)r    r   6/home/ubuntu/transcripts/validations/recover_loader.pyparent_segment_file!   s   r   original_file	was_splitboolsplit_indexintc                 C  s   |r	|  d| S | S )z8Rebuild the segment ID exactly like `src.pipeline` does._splitr   )r   r   r   r   r   r   replay_segment_id&   s   r   c                   @  s   e Zd ZU ded< eedZded< eedZded< eedZded< eedZ	ded	< eedZ
ded
< eedZded< dS )RecoveryLoadResultdictmetadata)default_factoryzlist[SegmentData]segmentsz	list[str]matched_tx_idsmissing_tx_idsreplayed_regen_idsextra_regen_idsmissing_parent_filesN)__name__
__module____qualname____annotations__r   listr   r    r!   r"   r#   r$   r   r   r   r   r   -   s   
 r   Fwork_dirr   video_idtx_rows
list[dict]target_segment_idsOptional[set[str]]replay_all_tx_parentsc                    s8  | | }|  s
| }t|d}i }|rt| }t|d}|s1td| d|  t|dS dd |D  |dur@t	|nt	 }	 fd	d
|	D }
 fdd
|	D }	t
t	}t
t	} D ]}t|}|| | q`|	D ]}t|}|| | qpdd |dD }t|d}t	 }t	 }t	 }g }g }|r|n|}t|D ]}||}|du r|j| q|| || qi }|rt|}|D ]}||jjg | q|D ]}||g }tdd |D r|j| q||t	 }g }|D ]r}|jjrqt|jj|jj|jj}|| || ||	vr"q | }|jt|tt |j!" t#|jj$d |ddpDd|ddpLd|ddp[|ddp[dt#|dpcdt%|d || q|D ]}||vr|| qsqt||_&t|	| |
B |_'t||_(t||_)|S )a  
    Rebuild validation-ready SegmentData from raw extracted tar + tx rows.

    `tx_rows` should come from `transcription_results` and include at least:
      - segment_file
      - transcription
      - tagged
      - detected_language
      - quality_score

    If `target_segment_ids` is provided, only those historical IDs are returned.
    If `replay_all_tx_parents` is true, we still replay every historical parent
    audio file for the video so replay-only extras can be discovered even when
    all already-validated segments are skipped from GPU inference.
    All replay-only IDs for the processed parents are surfaced via `extra_regen_ids`
    so the caller can log salvage candidates separately.
    zmetadata.jsonr   [z ] No raw segments/ dir found in )r   c                 S  s    i | ]}| d r|d  |qS )segment_file)get).0rowr   r   r   
<dictcomp>^   s     z)load_recover_segments.<locals>.<dictcomp>Nc                   s   h | ]}| vr|qS r   r   r4   seg_idtx_mapr   r   	<setcomp>b       z(load_recover_segments.<locals>.<setcomp>c                   s   h | ]}| v r|qS r   r   r7   r9   r   r   r;   c   r<   c                 S  s   i | ]}|j |qS r   )name)r4   pathr   r   r   r6   n   s    z*.flacc                 s  s&    | ]}|j jo|j jd V  qdS )zPolish error:N)	trim_meta	discardeddiscard_reason
startswith)r4   segr   r   r   	<genexpr>   s
    
z(load_recover_segments.<locals>.<genexpr>g     @@transcriptionr   taggeddetected_languageexpected_language_hintquality_scoreg        )r2   waveform
duration_sgemini_transcriptiongemini_taggedgemini_langgemini_quality_scorespeaker_info)*exists
_find_filejsonloads	read_text	_find_dirloggerwarningr   setr   r   addglobsortedr3   r$   appendr   
setdefaultr?   r   anyr@   r   r   r   r   r
   torch
from_numpynpascontiguousarrayaudiocopyfloatfinal_duration_ms_speaker_info_from_rowr    r!   r"   r#   ) r*   r+   r,   r.   r0   	video_dirmetadata_pathr   segments_dirrequested_idsunknown_target_idsparent_to_requested_idsparent_to_all_tx_idsr8   parent	raw_pathsresultmatchedreplayed_idsextrasreplay_pathsreplay_parent_filesreplay_parent_sourceparent_fileraw_pathpolished_by_parentreplayedrC   polishedparent_tx_idsregen_ids_for_parentr5   r   r9   r   load_recover_segments8   s   















r   r5   r   c                 C  sP   |  ddpd|  ddpd|  ddpd|  ddpdd}dd | D S )	Nspeaker_emotionr   speaker_stylespeaker_pacespeaker_accent)emotionspeaking_stylepaceaccentc                 S  s   i | ]	\}}|r||qS r   r   )r4   kvr   r   r   r6      s    z*_speaker_info_from_row.<locals>.<dictcomp>)r3   items)r5   speakerr   r   r   rh      s   rh   rootr=   Optional[Path]c                 C  $   |  |D ]
}| r|  S qd S N)rglobis_filer   r=   r>   r   r   r   rR      
   rR   c                 C  r   r   )r   is_dirr   r   r   r   rV      r   rV   )r   r   r   r   )r   r   r   r   r   r   r   r   )NF)r*   r   r+   r   r,   r-   r.   r/   r0   r   r   r   )r5   r   r   r   )r   r   r=   r   r   r   )"__doc__
__future__r   rS   loggingrecollectionsr   dataclassesr   r   pathlibr   typingr   numpyrb   r`   src.audio_polishr   audio_loaderr
   	getLoggerr%   rW   compiler   r   r   r   r   rh   rR   rV   r   r   r   r   <module>   s4    




 


