o
    Si#                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZ ddlZddlmZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZ i dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5Zed6Zed7Z ed8Z!ed9Z"ed:Z#		;dHd<ed=ee d>e$d?ee%ee%eeef f f fd@dAZ&dBe%d?e%fdCdDZ'dEe	e d?e
e fdFdGZ(dS )Iz
BABEL is a collection of corpora created during the IARPA BABEL program:
https://www.iarpa.gov/index.php/research-programs/babel

It has about 25 languages with 40h - 160h of training recordings and ~20h
of development set recordings.
    N)defaultdict)Path)DictIterableListOptionalUnion)sliding_window)	RecordingRecordingSetSupervisionSegmentSupervisionSet$validate_recordings_and_supervisions)combine)*remove_missing_recordings_and_supervisionstrim_supervisions_to_recordings)Pathlike101	Cantonese102Assamese103Bengali104Pashto105Turkish106Tagalog107
Vietnamese201Haitian202Swahili203Lao204Tamil205Kurmanji206Zulu207z	Tok-Pisin301Cebuano302Kazakh303Telugu
LithuanianGuaraniIgboAmharic	MongolianJavaneseDholuoGeorgian)304305306307401402403404z-(\(\(\)\)|<foreign>|<prompt>|<overlap>|<hes>)z"<(limspack|lipsmack|breath|cough)>z<(click|ring|dtmf|int|sta)>z<no-speech>z"<(male-to-female|female-to-male)> F
corpus_dir
output_dir
no_eval_okreturnc                 C   sv  t t}| }t| } dd | dD } | std| dt| dkr2td| d| d	  d
 | d	 j} dD ]|}| d| d }t	
dd |dD }t	
dd |dD }t||}	t|	d	krx|dkrp|rpq9td|  g }
| d| d }t|dD ]}|jd^}}}}}}}}ddd|d}|  dg }dd td|D }|dg7 }tdt|ddd |ddd D ]q\\}}\}}zJt|dd }t|dd }|
t| d| d| d| d| dtd | d!|j|t|| d"d#d	t|t| | d| d| d$ W q ty> } ztd%t|  td&| d'd}~ww qt|
}
t|
d	krStd(|  t |
}
|dkret|
d	krent!|	|
\}	}
t"|	|
}
t#|	|
 |	|
d)||< |durt|}|j$d*d*d+ t| }|d,krd-n|}|	%|d.| d/| d0  |
%|d.| d1| d0  q9t|S )2a  
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than once, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored.json
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return:
    c                 S   s   g | ]}|  r|qS  )is_dir).0drH   rH   H/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/babel.py
<listcomp>]   s    z1prepare_single_babel_language.<locals>.<listcomp>conversationalz;Could not find 'conversational' directory anywhere inside 'z' - please check your path.   z=It seems there are multiple 'conversational' directories in 'z)' - we are selecting the first one only (r   zq). Please ensure that you provided the path to a single language's dir, and the root dir for all BABEL languages.)devevaltrainingzconversational/z/audioc                 s       | ]}t |V  qd S Nr
   	from_filerJ   prH   rH   rL   	<genexpr>n       

z0prepare_single_babel_language.<locals>.<genexpr>z*.sphc                 s   rS   rT   rU   rW   rH   rH   rL   rY   q   rZ   z*.wavrQ   z No SPHERE or WAV files found in z/transcription*_AB)inLineoutLine c                 S   s(   g | ]\}}| d r| d s|qS )[)
startswith)rJ   prev_llrH   rH   rL   rM      s       Nd   06   )ndigits)idrecording_idstartdurationchanneltextlanguagespeakerz&Error while parsing segment. Message: z/Too many errors while parsing segments (file: 'z5'). Please check your data or increase the threshold.zNo supervisions found in )
recordingssupervisionsT)parentsexist_okrR   trainzbabel-_recordings_z	.jsonl.gz_supervisions_)&r   dictr   rglob
ValueErrorlenloggingwarningparentr   from_recordingsglobr   tqdmstemsplitget	read_text
splitlinesr	   zipfloatappendr   introundnormalize_textBABELCODE2LANG	Exceptionstrdeduplicate_supervisionsr   from_segmentsr   r   r   mkdirto_file)rD   rE   rF   	manifestsorig_corpus_dirr   	audio_dirsph_recordingswav_recordingsrt   ru   text_dirrX   p0p1	lang_coders   datehourrp   r\   lines	timestamprq   next_timestamprn   enderr   
save_splitrH   rH   rL   prepare_single_babel_languageB   s   







.




r   rq   c                 C   s@   t d| } td| } td| } td| } td| } | S )Nz<unk>z	<v-noise>z<noise>z	<silence>ra   )OOV_PATTERNsubSPK_NOISE_PATTERNNOISE_PATTERNSIL_PATTERNREMOVE_PATTERN)rq   rH   rH   rL   r      s   r   ru   c                 C   s|   ddl m} |dd t| dd d}g }| D ]"\}}t|dkr4tdt| d|d j d	 ||d  q|S )
Nr   )groupbyc                 S   s   | j S rT   )rl   )srH   rH   rL   <lambda>   s    z*deduplicate_supervisions.<locals>.<lambda>)keyrO   zFound z$ supervisions with conflicting IDs (z) - keeping only the first one.)	cytoolzr   sorteditemsr~   r   r   rl   r   )ru   r   
duplicatesfilteredkvrH   rH   rL   r      s   r   )NF))__doc__r   recollectionsr   pathlibr   typingr   r   r   r   r   r   r   r	   lhotser
   r   r   r   r   lhotse.manipulationr   	lhotse.qar   r   lhotse.utilsr   r   compiler   r   r   r   r   boolr   r   r   r   rH   rH   rH   rL   <module>   s    	






 	