o
    Si3                     @   sL  d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z. dZ/d0 Z1dZ2dZ3ddde/dfde+dee4ee4 f de5de4dee4 ddfddZ6ed4dd Z7d!ee+ d"e4dee4ee4ee$e)f f f fd#d$Z8d%ed"e4d&e4deee#e(f  fd'd(Z9	)d5d*e4d+e4d%e+d,e:dee$e)f f
d-d.Z;d/e3d)fd0e+d!e+dee4ee4 f d1ee4ee4 f d,e:dee4ee4ee4ee$e)f f f f fd2d3Z<dS )6a[  
Official description from the "about" page of the Mozilla CommonVoice project
(source link: https://commonvoice.mozilla.org/en/about)

Why Common Voice?
Mozilla Common Voice is an initiative to help teach machines how real people speak.
This project is an effort to bridge the digital speech divide. Voice recognition technologies bring a human dimension to our devices, but developers need an enormous amount of voice data to build them. Currently, most of that data is expensive and proprietary. We want to make voice data freely and publicly available, and make sure the data represents the diversity of real people. Together we can make voice recognition better for everyone.

How does it work?
We are crowdsourcing an open-source dataset of voices. Donate your voice, validate the accuracy of other people's clips, make the dataset better for everyone.
    N)defaultdict)ProcessPoolExecutor)contextmanager)get_context)Path)DictIterableListOptionalSequenceTupleUnion)tqdm)"get_ffmpeg_torchaudio_info_enabledload_manifest"set_ffmpeg_torchaudio_info_enabled$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikeis_module_availableresumable_downloadsafe_extractzJhttps://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.comzNen de fr cy tt kab ca zh-TW it fa eu es ru tr nl eo zh-CN rw pt zh-HK cs pl uk)traindevtest	validatedinvalidatedother)r   r   r   .allFzcv-corpus-13.0-2023-03-09
target_dir	languagesforce_downloadbase_urlreleasereturnc              	   C   s  t | } | jddd | d| }|dkrt}nt|tr"|g}nt|}tdt| d|  t	|ddD ]}td	|  | | | }|d
 }|
 r^td| d| d q9| d}	| |	 }
|sm|

 st|dd dk r|td|d| d| d }t||
|d td|  td|  tj|dd t|
}t|| d W d   n1 sw   Y  |  q9dS )a5  
    Download and untar the CommonVoice dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param languages: one of: 'all' (downloads all known languages); a single language code (e.g., 'en'),
        or a list of language codes.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the base URL for CommonVoice.
    :param release: str, the name of the CommonVoice release (e.g., "cv-corpus-13.0-2023-03-09").
        It is used as part of the download URL.
    Tparentsexist_ok/r#   zAbout to download z CommonVoice languages: z!Downloading CommonVoice languagesdesc
Language: z
.completedz	Skipping z	 because z exists.z.tar.gz-   g       @zWhen the version is less than 8.0, CommonVoice requires you to enter e-mail to download the data.
Please download it manually for now.
Or you can choose a version greater than 8.0.
)filenamer&   zDownloading finished: zUnpacking archive: )ignore_errors)pathN)r   mkdirCOMMONVOICE_LANGS
isinstancestrlistlogginginfolenr   is_filefloatsplitNotImplementedErrorr   shutilrmtreetarfileopenr   touch)r$   r%   r&   r'   r(   urllangpart_dircompleted_detectortar_nametar_path
single_urltar rO   N/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/commonvoice.pydownload_commonvoice1   sJ   


rQ   c                  c   s0    t  } td z
d V  W t|  d S t|  w )NF)r   r   )enabledrO   rO   rP   disable_ffmpeg_torchaudio_infov   s   rS   
output_dirlanguagec              
   C   sb   | du ri S t t}dD ]"}dD ]}| d| d| d| d }| s%qt||| |< qq|S )zg
    Returns:
        {'train': {'recordings': ..., 'supervisions': ...}, 'dev': ..., 'test': ...}
    N)r   r   r   )
recordingssupervisionscv__	.jsonl.gz)r   dictr>   r   )rT   rU   	manifestspartmanifestr5   rO   rO   rP   _read_cv_manifests_if_cached   s   r_   	lang_path
audio_infoc                 C   s   | d |d  }|  std|  d S t|d j}tj||d}t||d|jd||d |d 	 |d	 |d
 |d |d dd
}||fS )Nclipsr5   zNo such file: )r5   recording_idg        r   	client_idsentencegenderageaccentsvariant)rg   rh   ri   )
idrc   startdurationchannelrU   speakertextrf   custom)
r>   r;   r<   r   stemr   	from_filer   rl   strip)r`   rU   ra   
audio_pathrc   	recordingsegmentrO   rO   rP   _parse_utterance   s,   
rw      rH   r]   num_jobsc                 C   s<  t |}|| d }t z t|tddb}g }g }g }g }	t|d%}
tj|
dtjd}	t|	ddD ]}|	|
t|| | q5W d	   n1 sMw   Y  t|d
dD ]}| }|d	u rcqX|\}}|	| |	| qXW d	   n1 s|w   Y  W d	   n1 sw   Y  t|}t|}||fS )a  
    Prepares part of CommonVoice data.

    :param lang: string language code (e.g., "en").
    :param part: which split to prepare (e.g., "train", "validated", etc.).
    :param lang_path: path to a CommonVoice directory for a specific language
        (e.g., "/path/to/cv-corpus-13.0-2023-03-09/pl").
    :param num_jobs: How many concurrent workers to use for scanning of the audio files.
    :return: a tuple of (RecordingSet, SupervisionSet) objects,
        note that CommonVoice manifests may be fairly large in memory.
    z.tsvspawn)max_workers
mp_contextr	)	delimiterquotingzDistributing tasksr.   N
Processing)r   rS   r   mp_get_contextrE   csv
DictReader
QUOTE_NONEr   appendsubmitrw   resultr   from_recordingsr   from_segments)rH   r]   r`   ry   tsv_pathexfuturesrV   rW   audio_infosfra   futurer   ru   rv   recording_setsupervision_setrO   rO   rP   _prepare_part   sN   

"
r   auto
corpus_dirsplitsc              
   C   s  t | } |  sJ d|  |dusJ dt |}|jddd |dkr?ttdd | d	D }|s>td
|  nt|t	rG|g}i }t
|ddD ]r}td|  | | }t||d}t
|ddD ]S}	td|	  |	|v rt|	 d| d qit||	||d\}
}t|
|\}
}t|
| ||d| d|	 d  |
|d| d|	 d  ||
d||	< qi|||< qO|S )a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    This function expects the input directory structure of::

        >>> metadata_path = corpus_dir / language_code / "{train,dev,test}.tsv"
        >>> # e.g. pl_train_metadata_path = "/path/to/cv-corpus-13.0-2023-03-09/pl/train.tsv"
        >>> audio_path = corpus_dir / language_code / "clips"
        >>> # e.g. pl_audio_path = "/path/to/cv-corpus-13.0-2023-03-09/pl/clips"

    Returns a dict with 3-level structure (lang -> split -> manifest-type)::

        >>> {'en/fr/pl/...': {'train/dev/test': {'recordings/supervisions': manifest}}}

    :param corpus_dir: Pathlike, the path to the downloaded corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param languages: 'auto' (prepare all discovered data) or a list of language codes.
    :param splits: by default ``['train', 'dev', 'test']``, can also include
        ``'validated'``, ``'invalidated'``, and ``'other'``.
    :param num_jobs: How many concurrent workers to use for scanning of the audio files.
    :return: a dict with manifests for all specified languagues and their train/dev/test splits.
    zNo such directory: NzaCommonVoice recipe requires to specify the output manifest directory (output_dir cannot be None).Tr*   r   c                 s   s    | ]}|j V  qd S )N)name).0r5   rO   rO   rP   	<genexpr>  s    
z&prepare_commonvoice.<locals>.<genexpr>*z0Could not find any of CommonVoice languages in: z Processing CommonVoice languagesr.   r0   )rT   rU   Splitingz	Spliting z split of CommonVoice-z already prepared - skipping.)rH   r]   r`   ry   zcv-_supervisions_rZ   _recordings_)rW   rV   )r   is_dirr6   setr7   intersectionglob
ValueErrorr8   r9   r   r;   r<   r_   r   r   r   to_file)r   rT   r%   r   ry   r\   rH   r`   lang_manifestsr]   r   r   rO   rO   rP   prepare_commonvoice   sf   




r   )r)   N)rx   )=__doc__r   r;   mathnumbersrB   rD   warningscollectionsr   concurrent.futures.processr   
contextlibr   multiprocessingr   r   pathlibr   typingr   r   r	   r
   r   r   r   	tqdm.autor   lhotser   r   r   r   lhotse.audior   r   	lhotse.qar   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   DEFAULT_COMMONVOICE_URLr@   r7   COMMONVOICE_SPLITSCOMMONVOICE_DEFAULT_SPLITSr9   boolrQ   rS   r_   rw   intr   r   rO   rO   rO   rP   <module>   s    $
E	

%

@"