o
    SiI(                     @   s  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZ g dZdd eD ZeeddZ g dZ!dgZ"e!Z#g dZ$g dZ%dZ&		d.dedee' de	fddZ(						d/dedee de'd e'd!ee' d"ee' d#e)dee'ee'eeef f f fd$d%Z*G d&d' d'Z+	d0de	de	d e'd#e)dee'ee'eeef f f f
d(d)Z,de	d!e'd"e'deeef fd*d+Z-de	d e'deedf fd,d-Z.dS )1uI  
VoxPopuli provides

- 400K hours of unlabelled speech data for 23 languages
- 1.8K hours of transcribed speech data for 16 languages
- 17.3K hours of speech-to-speech interpretation data for 15x15 directions
- 29 hours of transcribed speech data of non-native English intended for research in ASR
for accented speech (15 L2 accents)

The raw data is collected from 2009-2020 European Parliament event recordings.
For details about the corpus, please refer to the website:
https://github.com/facebookresearch/voxpopuli

Reference:
Wang, Changhan et al. “VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation
Learning, Semi-Supervised Learning and Interpretation.” Annual Meeting of the Association
for Computational Linguistics (2021).

This script is based on code from the repository linked above.

NOTE: Our data preparation is slightly different from the original repository. In particular,
we only use the metadata to create manifests, i.e., we do not create segment-level wav files,
unlike the original repository. In this way, we can avoid duplicating the audio files.
    N)defaultdict)Path)DictOptionalTupleUnion)download_url_to_file)tqdm)RecordingSetSupervisionSegmentSupervisionSet)fix_manifests$validate_recordings_and_supervisions)Pathlikesafe_extract)endefresplitrohucsnlfihrsksletltptbgellvmtsvdac                 C      g | ]}| d qS )_v2 .0xr*   r*   L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/voxpopuli.py
<listcomp>.       r/   i  i  )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    en_accented)r   r   r   z(https://dl.fbaipublicfiles.com/voxpopuli.asr
target_dirsubsetreturnc              
   C   s\  t | } | jddd |tv r!|dd g}tdd tD  }n*|tv r+|g}t}n tttdgd|d	}td
d tD  tddgtd|d	}g }|D ]}|D ]}|t d| d| d qSqO| d }|jddd t	
t| d t|D ]-}|t |j }	t||	 t|	d}
t|
| W d	   n1 sw   Y  |	  q~| S )aw  
    Download and untar/unzip the VoxPopuli dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param subset: str, the subset of the dataset to download, can be one of "400k", "100k",
        "10k", "asr", or any of the languages in LANGUAGES or LANGUAGES_V2.
    :return: the path to downloaded and extracted directory with data.
    T)parentsexist_ok_r   c                 S   r(   _2r*   r,   yr*   r*   r.   r/   X   r0   z&download_voxpopuli.<locals>.<listcomp>original)400k100k10kr3   Nc                 S   r(   r:   r*   r<   r*   r*   r.   r/   d   r0   i  i  z/audios/z.tar
raw_audiosr8   r7   z files to download...r)r   mkdirLANGUAGES_V2splitYEARS	LANGUAGESgetappendDOWNLOAD_BASE_URLlogginginfolenr	   namer   tarfileopenr   unlink)r4   r5   	languagesyearsurl_listlr=   out_rooturltar_pathtar_filer*   r*   r.   download_voxpopuliG   sL   

r\   r      
corpus_dir
output_dirtasklangsource_langtarget_langnum_jobsc                 C   s  t | } |  sJ d|  |durt |}|jddd |dkr6|tv s-J d| t| |||d}n5|dkrW|tv sEJ d	| |tv sPJ d
| t| ||}n|dkrk|tv sfJ d| t| |}|	 D ]R\}}	t
di |	\}
}t|
|d |
|| d< ||| d< |dkr| d| n|}|dur|
|d| d| d| d  ||d| d| d| d  qo|S )aY  
    Prepares and returns the VoxPopuli manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param task: str, the task to prepare the manifests for, can be one of "asr", "s2s", "lm".
    :param lang: str, the language to prepare the manifests for, can be one of LANGUAGES
        or LANGUAGES_V2. This is used for "asr" and "lm" tasks.
    :param source_lang: str, the source language for the s2s task, can be one of S2S_SRC_LANGUAGES.
    :param target_lang: str, the target language for the s2s task, can be one of S2S_TGT_LANGUAGES.
    :param num_jobs: int, the number of parallel jobs to use for preparing the manifests.
    :return: Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]], the manifests.
    zNo such directory: NTrC   r3   zUnsupported language: )rd   s2szUnsupported source language: zUnsupported target language: lm)
recordingssupervisionsrg   rh   -z
voxpopuli-_recordings_z	.jsonl.gz_supervisions_r*   )r   is_dirrE   ASR_LANGUAGES_prepare_voxpopuli_asrS2S_SRC_LANGUAGESS2S_TGT_LANGUAGES_prepare_voxpopuli_s2s_prepare_voxpopuli_lmitemsr   r   to_file)r^   r_   r`   ra   rb   rc   rd   	manifestskvrg   rh   
lang_affixr*   r*   r.   prepare_voxpopuli|   sL   


ry   c                   @   s0   e Zd ZdZdefddZdedefddZd	S )
RecordingIdFnz
    This functor class avoids error in multiprocessing:
    `AttributeError: Can't pickle local object '_prepare_voxpopuli_asr.<locals>.<lambda>'`
    languagec                 C   s
   || _ d S Nr{   )selfr{   r*   r*   r.   __init__   s   
zRecordingIdFn.__init__pathr6   c                 C   s,   t d| j dd|j}t dd|}|S )Nr9   $ z
_original$)resubr{   stem)r~   r   recording_idr*   r*   r.   __call__   s   zRecordingIdFn.__call__N)__name__
__module____qualname____doc__strr   r   r   r*   r*   r*   r.   rz      s    rz   c                    s  t d | d | }tj|d|t|dd}t d| d}|t|j }| s;t d| d	|  t	|| nt d
|  t
|d}dd tj|ddD }	W d   n1 s`w   Y  tt}
tdd }t|	D ]Q}|d dvr~qs|d }t|d }t|d | }||  d7  < |
 t| d||  |t|ddt|ddd||d |d |d d |d! id"
 qstt |
 D ]\}ttd#d |D  < qtt}dD ]| fd$d| d%< t|
 | d&< q|S )'zG
    Download metadata TSV and prepare manifests for the ASR task.
    z5Preparing recordings (this may take a few minutes)...rB   z*.oggr}   )rd   r   z/annotations/asr/asr_z.tsv.gzzDownloading : z -> z!Using pre-downloaded annotations rtc                 S   s   g | ]}|qS r*   r*   r+   r*   r*   r.   r/      s    z*_prepare_voxpopuli_asr.<locals>.<listcomp>|)	delimiterNc                   S   s   dS )Nr   r*   r*   r*   r*   r.   <lambda>   s    z(_prepare_voxpopuli_asr.<locals>.<lambda>rG   )traindevtest
session_id
start_timeend_timer]   ri      )ndigitsr   
speaker_idgendernormed_text	orig_textoriginal_text)
idr   startdurationchannelr{   speakerr   textcustomc                 S   s   g | ]}|j qS r*   )r   )r,   sr*   r*   r.   r/     s    c                    s   | j   v S r|   )r   )rD   reco_idsrG   r*   r.   r     s    rg   rh   )rM   rN   r
   from_dirrz   rL   r   rP   existsr   gziprR   csv
DictReaderr   listr	   floatrK   r   roundrs   sortedsetdictfilterr   from_segments)r^   r_   ra   rd   in_rootrg   rY   tsv_pathfmetadatasegmentsnum_segmentsrD   reco_idr   r   segsru   r*   r   r.   rn      sh   



rn   c                 C      t )z1
    Prepare the manifests for the s2s task.
    NotImplementedError)r^   rb   rc   r*   r*   r.   rq     s   rq   c                 C   r   )z0
    Prepare the manifests for the lm task.
    r   )r^   ra   r*   r*   r.   rr   $  s   rr   )r2   r3   )Nr3   r   NNr]   )r]   )/r   r   r   rM   r   rQ   collectionsr   pathlibr   typingr   r   r   r   	torch.hubr   r	   lhotser
   r   r   	lhotse.qar   r   lhotse.utilsr   r   rI   rF   r   rangerH   rm   ASR_ACCENTED_LANGUAGESro   rp   *S2S_TGT_LANGUAGES_WITH_HUMAN_TRANSCRIPTIONrL   r   r\   intry   rz   rn   rq   rr   r*   r*   r*   r.   <module>   s    
7
B
N

"	