o
    2wi                     @   s.  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ dZ	ddedee de	fddZdeddfdedee deeee f de dedeeeeeeef f f fddZ!d dededefddZ"dS )!u  
The following are the original TED-LIUM 3 README contents.

This is the TED-LIUM corpus release 3,
licensed under Creative Commons BY-NC-ND 3.0 (http://creativecommons.org/licenses/by-nc-nd/3.0/deed.en).

All talks and text are property of TED Conferences LLC.

This new TED-LIUM release was made through a collaboration between the Ubiqus company and the LIUM (University of Le Mans, France)

---

Contents:

- 2351 audio talks in NIST sphere format (SPH), including talks from TED-LIUM 2: be careful, same talks but not same audio files (only these audio file must be used with the TED-LIUM 3 STM files)
--> 452 hours of audio
- 2351 aligned automatic transcripts in STM format

- TEDLIUM 2 dev and test data: 19 TED talks in SPH format with corresponding manual transcriptions (cf. 'legacy' distribution below).

- Dictionary with pronunciations (159848 entries), same file as the one included in TED-LIUM 2
- Selected monolingual data for language modeling from WMT12 publicly available corpora: these files come from the TED-LIUM 2 release, but have been modified to get a tokenization more relevant for English language

- Two corpus distributions:
-- the legacy one, on which the dev and test datasets are the same as in TED-LIUM 2 (and TED-LIUM 1).
-- the 'speaker adaptation' one, especially designed for experiments on speaker adaptation.

---

SPH format info:

Channels       : 1
Sample Rate    : 16000
Precision      : 16-bit
Bit Rate       : 256k
Sample Encoding: 16-bit Signed Integer PCM

---

François Hernandez, Vincent Nguyen, Sahar Ghannay, Natalia Tomashenko, and Yannick Estève, "TED-LIUM 3: twice as much data and corpus repartition for experiments on speaker adaptation", submitted to the 20th International Conference on Speech and Computer (SPECOM 2018), September 2018, Leipzig, Germany
A preprint version is available on arxiv (and in the doc/ directory):
https://arxiv.org/abs/1805.04699
    N)ThreadPoolExecutor)partial)Path)DictOptionalSequenceUnion)RecordingSetSupervisionSegmentSupervisionSet$validate_recordings_and_supervisions)fix_manifests)normalize_text_tedlium)Pathlikeresumable_downloadsafe_extract)traindevtest.F
target_dirforce_downloadreturnc                 C   s   t | } | jddd | d }| d }|d }| r*td|j d| d |S td	||d
 tj|dd t	
|}t|| d W d    n1 sNw   Y  |  |S )NT)parentsexist_okzTEDLIUM_release-3.tgzzTEDLIUM_release-3z
.completedz	Skipping z	 because z exists.z9http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz)filenamer   )ignore_errors)path)r   mkdiris_filelogginginfonamer   shutilrmtreetarfileopenr   touch)r   r   tar_path
corpus_dircompleted_detectortar r,   S/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/tedlium.pydownload_tedliumA   s&   r.      nonetedlium_root
output_dirdataset_partsnum_jobsnormalize_textc              	   C   s  t | } |durt |nd}i }t|tr|gn|}t|}|D ]}td| d | d | }tj|d d|d}	t|d 	d	}
t
|
t
|	ks\J d
t
|	 dt
|
 dg }tt|d}|
D ]}|||| qfg }|D ]	}||  qvt|}t|	|\}	}|	|d||< tdi ||  |dur|	|d| d  ||d| d  q!W d   |S 1 sw   Y  |S )ae  
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Path where the manifests should be written.
    :param dataset_parts: Which parts of the dataset to prepare.
        By default, all parts are prepared.
    :param num_jobs: Number of parallel jobs to use.
    :return: A dict with standard corpus splits containing the manifests.
    NzProcessing z	 split...legacysphz*.sph)patternr4   stmz*.stmzMismatch: found z sphere files and z9 STM files. You might be missing some parts of TEDLIUM...)r5   )
recordingssupervisionstedlium_recordings_z	.jsonl.gztedlium_supervisions_r,   )r   
isinstancestrr   r    r!   r	   from_dirlistgloblenr   _parse_stm_fileappendsubmitextendresultr   from_segmentsr   r   to_file)r1   r2   r3   r4   r5   corpusexsplitrootr:   stmsfutures_parse_stm_workerr9   segmentsfuturer;   r,   r,   r-   prepare_tedliumX   sN   



""rT   r9   c                 C   s   g }|   Q}t|D ]C\}}| ^}}}}}	}}
t|t|	}}	d|
dd}|dkr1q|t| d| ||t|	| dddt	||d	|d
 qW d   |S 1 sZw   Y  |S )z+Helper function to parse a single STM file. z{NOISE}z[NOISE]ignore_time_segment_in_scoring-   )ndigitsr   English)idrecording_idstartdurationchanneltextlanguagespeakerN)
r&   	enumeraterM   floatjoinreplacerE   r
   roundr   )r9   r5   rR   fidxlrec_id_r]   endwordsr`   r,   r,   r-   rD      s2   

rD   )r   F)r0   )#__doc__r    r#   r%   concurrent.futures.threadr   	functoolsr   pathlibr   typingr   r   r   r   lhotser	   r
   r   r   	lhotse.qar   lhotse.recipes.utilsr   lhotse.utilsr   r   r   TEDLIUM_PARTSboolr.   r?   intrT   rD   r,   r,   r,   r-   <module>   sN    +

?