o
    i                     @  s   d Z ddlmZ ddlZddlZddlmZmZ ddlm	Z	 ddl
mZ ddlZddlZddlZddlmZmZmZ eeZeG d	d
 d
Zd'ddZefd(ddZd)ddZd*d#d$Zd*d%d&ZdS )+z
Audio loading, resampling, and batch collation.
Loads FLAC segments from extracted tars, resamples to 16kHz,
and provides padded batches for model inference.
    )annotationsN)	dataclassfield)Path)Optional   )AUDIO_SAMPLE_RATEMAX_AUDIO_DURATION_SMIN_AUDIO_DURATION_Sc                   @  sx   e Zd ZU dZded< ded< ded< eZded	< d
Zded< d
Zded< d
Z	ded< dZ
ded< eedZded< dS )SegmentDataz6One audio segment with its metadata and transcription.strsegment_fileztorch.Tensorwaveformfloat
duration_sintsample_rate gemini_transcriptiongemini_taggedgemini_lang        gemini_quality_score)default_factorydictspeaker_infoN)__name__
__module____qualname____doc____annotations__r   r   r   r   r   r   r   r   r    r!   r!   4/home/ubuntu/transcripts/validations/audio_loader.pyr      s   
 r   work_dirr   video_idr   returntuple[dict, list[SegmentData]]c                 C  s  | | }|  s
| }t|d}i }|rt| }t|d}t|d}|s2td|  |g fS t|	d}|sGtd|  |g fS i }|r|	dD ].}	zt|	 }
|
||	j
< W qP ty~ } ztd|	j d	|  W Y d
}~qPd
}~ww i }g }d}|D ]}z{tt|\}}|jd dkr|jddd}|d}|tkr||vrtj|t||< || |}|jd t }|tk s|tkr|d7 }W q|j
}||i }|t|j|||dd|dd|d|dd|dd|di d W q ty' } ztd|j d	|  |d7 }W Y d
}~qd
}~ww |r=tdt| d| d ||fS tdt| d ||fS )a  
    Load all segments from an extracted transcribed tar.
    Returns (metadata_dict, list_of_segments).
    
    Expected structure:
      work_dir/{video_id}/metadata.json
      work_dir/{video_id}/segments/*.flac
      work_dir/{video_id}/transcriptions/*.json
    zmetadata.jsonsegmentstranscriptionszNo segments/ dir found in z*.flaczNo FLAC files in z*.jsonzBad transcription JSON z: Nr   r   T)dimkeepdimtranscriptionr   taggeddetected_languagelanguagequality_scorer   speaker)r   r   r   r   r   r   r   r   zFailed to load zLoaded z segments, skipped z (too short/long/corrupt)z	 segments)exists
_find_filejsonloads	read_text	_find_dirloggerwarningsortedglobstem	Exceptionname
torchaudioloadr   shapemeansqueezer   
transformsResampler
   r	   getappendr   infolen)r#   r$   	video_dirmetadata_pathmetadatasegments_dirtranscriptions_dir
flac_pathstranscription_map	json_pathdataeresampler_cacher'   skipped	flac_pathr   srr   r;   txr!   r!   r"   load_video_segments&   s   


$





rX   	waveformslist[torch.Tensor]	target_srr   !tuple[torch.Tensor, torch.Tensor]c                 C  sl   t jdd | D t jd}|  }t jt| |t jd}t| D ]\}}|||d|j	d f< q"||fS )zr
    Pad variable-length waveforms into a batch tensor.
    Returns (padded_batch [B, max_len], lengths [B]).
    c                 S  s   g | ]}|j d  qS )r   )r@   ).0wr!   r!   r"   
<listcomp>   s    z%collate_waveforms.<locals>.<listcomp>)dtypeNr   )
torchtensorlongmaxitemzerosrH   float32	enumerater@   )rY   r[   lengthsmax_lenbatchir^   r!   r!   r"   collate_waveforms   s   rm   r'   list[SegmentData]
batch_sizelist[list[SegmentData]]c                   s     fddt dt D S )zSplit segments into batches.c                   s   g | ]
}||   qS r!   r!   )r]   rl   ro   r'   r!   r"   r_      s    z"batch_segments.<locals>.<listcomp>r   )rangerH   )r'   ro   r!   rq   r"   batch_segments   s    rs   rootr=   Optional[Path]c                 C  $   |  |D ]
}| r|  S qd S N)rglobis_filert   r=   pr!   r!   r"   r2      
   r2   c                 C  rv   rw   )rx   is_dirrz   r!   r!   r"   r6      r|   r6   )r#   r   r$   r   r%   r&   )rY   rZ   r[   r   r%   r\   )r'   rn   ro   r   r%   rp   )rt   r   r=   r   r%   ru   )r   
__future__r   r3   loggingdataclassesr   r   pathlibr   typingr   numpynpra   r>   configr   r	   r
   	getLoggerr   r7   r   rX   rm   rs   r2   r6   r!   r!   r!   r"   <module>   s(    

b

