o
    Si                     @   s  d Z ddlZddlmZmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ 	ddededededeeeeeeef f f f
ddZ dede
e	e! e	e f dede
ee	e f fddZ"dS )a*  
About the MuST-C corpus

MuST-C is a multilingual speech translation corpus whose size and quality will
facilitate the training of end-to-end systems for SLT from English into a set
of languages.

For each target language, MuST-C comprises several hundred hours of audio
recordings from English TED Talks, which are automatically aligned at the
sentence level with their manual transcriptions and translations.

We don't provide download_mustc().

Please refer to
https://ict.fbk.eu/must-c-releases/
for downloading.

If you have downloaded and extracted the dataset to the directory

/ceph-data3/fangjun/data/must-c/v2.0/en-de
/ceph-data3/fangjun/data/must-c/v2.0/en-zh

You can call lhotse with the following commands

(1) When the target language is German:

    lhotse prepare must-c       --tgt-lang de       -j 10       /ceph-data3/fangjun/data/must-c/v2.0/       ./data/manifests/v2.0

(2) When the target language is Chinese:

    lhotse prepare must-c       --tgt-lang zh       -j 10       /ceph-data3/fangjun/data/must-c/v2.0/       ./data/manifests/v2.0
    N)groupbyrepeat)Path)DictListTupleUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)parallel_map)	load_yaml)SupervisionSegmentSupervisionSet)PathlikeSeconds   
corpus_dir
output_dirtgt_langnum_jobsreturnc                 C   s  d}t | | d| d }| sJ |g d}t |}|jddd |D ]-}td|  || }| s@J || t|d | d	|  }	d
d |	D }
W d   n1 s_w   Y  t|d | d }t|
t|ksJ t|
t|fg }d}t|dd D ]#\}}t	|g}t|d }|| }|
|
||  |}|
| q|t|
ksJ |t|
fg }g }tttt|d |t||dd| dD ]\}}|
| || qtt|t|d\}}t||d t|d| d| d| d }|D ]}|| qW d   n	1 s#w   Y  t|d| d| d| d }|D ]}|| q<W d   n	1 sPw   Y  q'dS )a  Prepare manifests for the MUST-C corpus.

    :param: corpus_dir: We assume there is a folder {src_lang}-{tgt_lang} inside
        this directory.
    :param: output_dir: Directory where the manifests should be written.
    :param: tgt_lang: Target language, e.g., zh, de, etc.
    :param: src_lang: Source language, e.g., en.
    :param: num_jobs: Number of processes to use for parsing the data
    en-z/data)devz
tst-COMMONztst-HEtrainT)parentsexist_okzProcessing txt.c                 S   s   g | ]}|  qS  )strip).0liner"   r"   I/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/must_c.py
<listcomp>Z   s    z"prepare_must_c.<locals>.<listcomp>Nz.yamlr   c                 S   s   | d S )Nwavr"   )xr"   r"   r&   <lambda>d   s    z prepare_must_c.<locals>.<lambda>r(   )r   zProcessing must-c )desc)
recordingssupervisionsmust_c_recordings__z	.jsonl.gzmust_c_supervisions_)r   is_dirmkdirlogginginfoopenr   lenr   listappendr	   r   parse_utterancer   extendr
   r   from_recordingsr   from_segmentsr   open_writerwrite)r   r   r   r   src_langin_data_dirdatasetsddataset_dirftranscriptssegmentsgroupsstartr/   group	this_wavenum_sentencesendrecording_listsupervision_list	recordingsup_segmentsr,   r-   
rec_writerr
sup_writersupr"   r"   r&   prepare_must_c9   s   
$




rU   wave_dirrG   c                 C   s   |d }|d }t |t |ksJ t |t |f| |d d  }t|}g }tt||D ]*\}\}	}
|t|j d| |jt|	d t	t|	d ddd||	d	 |
d
 q0||fS )a  
    :param: wave_dir: The wave directory. It contains *.wav files.
    :param: groups: A tuple containing two lists. The first one is a list
        of dict, where each dict contains something like below:

          {duration: 3.500000, offset: 16.080000, rW: 9, uW: 0,
           speaker_id: spk.767, wav: ted_767.wav}

        The second one is a list of transcripts.
    :param: tgt_lang: The language of the transcript, e.g., zh, en, de, etc.
    r   r   r(   z-seg-offsetduration   )ndigits
speaker_id)idrecording_idrH   rX   channellanguagespeakertext)
r6   r   	from_file	enumeratezipr8   r   r\   r   round)rV   rG   r   wave_segmentsrE   	wave_filerO   rF   iwave_segment
transcriptr"   r"   r&   r9      s.   

r9   )r   )#__doc__r3   	itertoolsr   r   pathlibr   typingr   r   r   r   	tqdm.autor	   lhotser
   r   lhotse.audior   r   lhotse.parallelr   lhotse.serializationr   lhotse.supervisionr   r   lhotse.utilsr   r   strintrU   dictr9   r"   r"   r"   r&   <module>   sB    )
\