o
    2wi                     @   s   d Z ddlmZ ddlmZ ddlmZmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ dededeeeef  fddZ	ddedededee def
ddZdS )a  
The Emilia dataset is constructed from a vast collection of speech data sourced
from diverse video platforms and podcasts on the Internet, covering various
content genres such as talk shows, interviews, debates, sports commentary, and
audiobooks. This variety ensures the dataset captures a wide array of real
human speaking styles. The initial version of the Emilia dataset includes a
total of 101,654 hours of multilingual speech data in six different languages:
English, French, German, Chinese, Japanese, and Korean.

See also
https://emilia-dataset.github.io/Emilia-Demo-Page/

Please note that Emilia does not own the copyright to the audio files; the
copyright remains with the original owners of the videos or audio. Users are
permitted to use this dataset only for non-commercial purposes under the
CC BY-NC-4.0 license.

Please refer to
https://huggingface.co/datasets/amphion/Emilia-Dataset
or
https://openxlab.org.cn/datasets/Amphion/Emilia
to download the dataset.

Note that you need to apply for downloading.

    )ThreadPoolExecutor)Path)OptionalTuple)tqdm)CutSetMonoCut)	Recording)
load_jsonl)SupervisionSegment)Pathlikedata_dirlinereturnc                 C   sd   | |d  }|  sdS tj||jd}t|j|jd|jd|d |d |d d	|d	 id
	}||fS )a  
    :param data_dir: Path to the data directory
    :param line: dict, it looks like below::

        {
          "id": "DE_B00000_S00000_W000029",
          "wav": "DE_B00000/DE_B00000_S00000/mp3/DE_B00000_S00000_W000029.mp3",
          "text": " Und es gibt auch einen Stadtplan von Tegun zu sehen.",
          "duration": 3.228,
          "speaker": "DE_B00000_S00000",
          "language": "de",
          "dnsmos": 3.3697
        }

    :return: a tuple of "recording" and "supervision"
    wavN)pathrecording_idg        r   textlanguagespeakerdnsmos)	idr   startdurationchannelr   r   r   custom)is_filer	   	from_filestemr   r   r   )r   r   	full_path	recordingsegment r"   R/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/emilia.py_parse_utterance)   s&   
r$   N
corpus_dirlangnum_jobs
output_dirc                 C   sj  |du rt d| }|dvrt d| t| } |  s&J d|  | d | }| s7J d| |d}g }g }t|O}	|D ]}
tt|
d|
 d	| d
dD ]}||		t
|| qXqGt|ddD ]}| }|du rwql|\}}|t|j|d|j|gdd qlW d   n1 sw   Y  t|}|durt|}||d| d  |S )a  
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
                       We assume the directory has the following structure:
                       corpus_dir/raw/openemilia_all.tar.gz,
                       corpus_dir/raw/DE,
                       corpus_dir/raw/DE/DE_B00000.jsonl,
                       corpus_dir/raw/DE/DE_B00000/DE_B00000_S00000/mp3/DE_B00000_S00000_W000000.mp3,
                       corpus_dir/raw/EN, etc.
    :param lang: str, one of en, zh, de, ko, ja, fr
    :param num_jobs: int, number of threads for processing jsonl files
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The CutSet containing the data for the given language.
    NzPlease provide --lang)DEENFRJAKOZHzLPlease provide a valid language. Choose from de, en, fr, ja, ko, zh. Given: zNo such directory: rawz*.jsonlzProcessing z with z jobs)desczCollecting futuresr   )r   r    r   r   supervisionsr   emilia_cuts_z	.jsonl.gz)
ValueErrorupperr   is_dirglobr   r   r
   appendsubmitr$   resultr   r   r   r   	from_cutsto_file)r%   r&   r'   r(   lang_uppercaser   jsonl_filescutsfuturesex
jsonl_fileitemfuturer9   r    r!   cut_setr"   r"   r#   prepare_emiliaU   sh   



!rE   )N)__doc__concurrent.futures.threadr   pathlibr   typingr   r   	tqdm.autor   lhotser   r   lhotse.audior	   lhotse.serializationr
   lhotse.supervisionr   lhotse.utilsr   dictr$   strintrE   r"   r"   r"   r#   <module>   s:    
0