o
    Si|0                     @   s  d Z ddlZddlmZ ddlmZmZmZmZ ddl	m	Z	 ddl
mZmZmZmZmZmZmZmZ ddlmZ ddlmZmZ g d	Z	
		d dedeeeee f  dee defddZ	d!dedededefddZ			d"dedee deeeee f  dedeeeeeeeeef f f f f
ddZdd Z			d#dedee dededeeeeeeef f f f
ddZ dS )$a  
This recipe provides functionality for downloading and preparing the fleurs
corpus. The data is hosted on huggingface and to enable more control of the
download format, we use the streaming download interface and save each audio
file as it is streamed. The download can take quite some time.

The fleurs corpus consist of data in 102 languages spoken by multiple speakers.
There is about 10 hrs of trainign data in each language with smaller
accompanying dev and test sets. Full details can be found in

@inproceedings{conneau2023fleurs,
  title={Fleurs: Few-shot learning evaluation of universal representations of speech},
  author={Conneau, Alexis and Ma, Min and Khanuja, Simran and Zhang, Yu and Axelrod, Vera and Dalmia, Siddharth and Riesa, Jason and Rivera, Clara and Bapna, Ankur},
  booktitle={2022 IEEE Spoken Language Technology Workshop (SLT)},
  pages={798--805},
  year={2023},
  organization={IEEE}
}
    N)Path)DictOptionalSequenceUnion)tqdm)	RecordingRecordingSetSupervisionSegmentSupervisionSetaudiofix_manifests"get_ffmpeg_torchaudio_info_enabled"set_ffmpeg_torchaudio_info_enabled)parallel_map)Pathlikeis_module_available)faf_zaam_etar_egas_inast_esaz_azbe_bybg_bgbn_inbs_baca_esceb_phckb_iqcmn_hans_cncs_czcy_gbda_dkde_deel_gren_uses_419et_eefa_irff_snfi_fifil_phfr_frga_iegl_esgu_inha_nghe_ilhi_inhr_hrhu_huhy_amid_idig_ngis_isit_itja_jpjv_idka_gekam_kekea_cvkk_kzkm_khkn_inko_krky_kglb_lulg_ugln_cdlo_lalt_ltluo_kelv_lvmi_nzmk_mkml_inmn_mnmr_inms_mymt_mtmy_mmnb_none_npnl_nlnso_zany_mwoc_from_etor_inpa_inpl_plps_afpt_brro_roru_rusd_insk_sksl_sisn_zwso_sosr_rssv_sesw_keta_inte_intg_tjth_thtr_truk_uaumb_aour_pkuz_uzvi_vnwo_snxh_zayo_ngyue_hant_hkzu_za.allF
target_dir	languagesforce_downloadreturnc                 C   s~   t | } | d }|d }|jddd t|tr|dks"|d dkr$t}t|tr,|g}t|D ]}|| }t||| q0|S )a  
    Download the specified fleurs datasets.

    :param target_dir: The path to which the corpus will be downloaded.
    :type target_dir: Pathlike
    :param languages: Optional list of str or str specifying which
        languages to download. The str specifier for a language has the
        ISOCODE_COUNTRYCODE format, and is all lower case. By default
        this is set to "all", which will download the entire set of
        languages.
    :type languages: Optional[Union[str, Sequence[str]]]
    :param force_download: Specifies whether to overwrite an existing
        archive.
    :type force_download: bool
    :return: The root path of the downloaded data
    :rtype: Path
    fleursmetadataTparentsexist_okrz   r   )r   mkdir
isinstancestrDEFAULT_LANGUAGESr   download_single_fleurs_language)r{   r|   r}   
corpus_dirmetadata_dirlanglang_dir r   I/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/fleurs.pydownload_fleurs   s    
r   languagec                 C   s  t dstdddlm} dd }t| } | jd d | }| jddd	 |jddd	 | d
| d }| rB|sBt	d | S t
g dD ]}|d|| d|d}g }	|dkr[dn|}
| |
 }|jddd	 t
|d| d|
 dD ]R}t| dt|d d j |d d |d d  t|d t|d d j|d |d dd|d  d t|d  |d! d"krd#nd$g}|	| qtt||
 d% d&}|	D ]}td'||d( qW d)   n1 sw   Y  qH|  | S )*a  
    Download a single fleurs language

    :param target_dir: The path to which one langauge will be downloaded
    :type target_dir: Pathlike
    :param language: The code for the specified language
    :type language: str
    :param force_download: Specifies whether to overwrite an existing
        archive.
    :type force_download: bool
    :return: The path to the downloaded data for the specified language
    :rtype: Path
    datasetsz[The huggingface datasets package is not installed. Please install ...(pip install datasets)r   )load_datasetc                 S   s   | S )Nr   )xr   r   r   	_identity   s   z2download_single_fleurs_language.<locals>._identityr   Tr   ry   
_completedz5Skipping dowload because {completed_detector} exists.)train
validationtestzgoogle/fleurs)	cache_dir	streamingsplitr   devzDownloading data from -desc/r   patharraysampling_rateidraw_transcriptiontranscription |z |num_samplesgender   FEMALEMALE.tsvw	)fileN)r   ImportErrorr   r   r   r   r   is_filelogginginfor   r   
save_audionamer   joinr   appendopenprinttouch)r{   r   r}   r   r   r   completed_detectorr   r   r   osplit	split_dirdata	metadata_fmdr   r   r   r      sb   




	r   r   r   
output_dirnum_jobsc              
   C   sp  t | tr	t| } |  sJ d|  t |trt|}|jdddd t}t |tr4|dkr4|g}nt |ts>t |trF|d dkrF|}i }|D ]/}| |  }| sbt	d| d| d	 qJ||  }|jdddd t
||||d
||< qJ|dur|D ]5}	dD ]0}
||	 |
 d ||	  d|	 d|
 d  ||	 |
 d ||	  d|	 d|
 d  qq|S )a	  
    Prepares the manifest for all of the FLEURS languages requested.

    :param corpus_dir: Path to the root where the FLEURS data are stored.
    :type corpus_dir: Pathlike,
    :param output_dir: The directory where the .jsonl.gz manifests will be written.
    :type output_dir: Pathlike,
    :param langauges: str or str sequence specifying the languages to prepare.
        The str 'all' prepares all 102 languages.
    :return: The manifest
    :rtype: Dict[str, Dict[str, Union[RecordingSet, Supervisions]]]]
    zNo such directory: i  T)moder   r   rz   r   z	Skipping z. No directory z found.)r   r   Nr   r   r   supervisionszfleurs-_supervisions_z	.jsonl.gz
recordings_recordings_)r   r   r   is_dirr   r   listtupler   r   prepare_single_fleurs_languageto_file)r   r   r|   r   
langs_list	manifestsr   corpus_dir_langoutput_dir_langldsetr   r   r   prepare_fleurs  sJ   



r   c                 C   s   t j| t| jdS )Nrecording_id)r   	from_filer   stem)r   r   r   r   _make_recordingN  s   r   c                    s  t  tr	t  g g g d}g g g d}dD ]td d i }t jd d  j  d _}|D ]T}| d}	|	\}
}}}}}}|
|vrQd||
< ||
  d7  < t|j}| 	t
|
 d	||
  d	| |d
tt|d dd|||
 d	||
  |d|id
 q7W d   n1 sw   Y  qdD ]'ttt fdd| D |dd| d dD ]	}| 	| qqi }dD ]t| }t| }t||\}}||d|< q|S )a  
    Prepares manifests using a single FLEURS language.

    :param corpus_dir: Path to the root where the FLEURS data are stored.
    :type corpus_dir: Pathlike,
    :param output_dir: The directory where the .jsonl.gz manifests will be written.
    :type output_dir: Pathlike,
    :param langauge: str specifying the language to prepare.

    :return: The manifest
    :rtype: Dict[str, Dict[str, Union[RecordingSet, Supervisions]]]]
    r   z
Preparing z ...r   r   r   r   r   _g        i>     raw_text)
r   r   startdurationchanneltextr   speakerr   customNc                 3   s&    | ]}  d |j  d V  qdS )r   z.wavNr   ).0sr   r   r   r   	<genexpr>  s
    
z1prepare_single_fleurs_language.<locals>.<genexpr>)r   zMaking recordings from r   r   )r   r   )r   r   r   r   r   r   r   stripr   r   r
   roundintr   r   r   r   from_segmentsr	   from_recordingsr   )r   r   r   r   r   r   
prompt_idsr   r   vals	prompt_idfnamer   r   r   nsamplesr   recor   supsrecosr   r   r   r   R  sn   


r   )ry   rz   F)F)Nrz   r   )Nr   r   )!__doc__r   pathlibr   typingr   r   r   r   r   lhotser   r	   r
   r   r   r   r   r   lhotse.parallelr   lhotse.utilsr   r   r   r   boolr   r   r   r   r   r   r   r   r   r   <module>   sx    (
	k
/
M"
?