o
    Sio                     @   s"  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ dd	lmZmZmZ d
Zddefdedee dee defddZde
e dee
e e
e f fddZ	ddedee de	ee	eeeef f f fddZ dS )aR  
About the yes no dataset:

This dataset was created for the Kaldi project (see kaldi.sf.net), by a
contributor who prefers to remain anonymous. The main point of the dataset
is to provide an easy and fast way to test out the Kaldi scripts for free.

The archive "waves_yesno.tar.gz" contains 60 .wav files, sampled at 8 kHz.
All were recorded by the same male speaker, in Hebrew. In each file, the
individual says 8 words; each word is either the Hebrew for "yes" or "no",
so each file is a random sequence of 8 yes-es or noes. There is no separate
transcription provided; the sequence is encoded in the filename, with 1 for
yes and 0 for no, for instance:

# tar -xvzf waves_yesno.tar.gz
waves_yesno/1_0_1_1_1_0_1_0.wav
waves_yesno/0_1_1_0_0_1_1_0.wav
...

The dataset can be downloaded from the following address:

    https://www.openslr.org/1/

    N)defaultdict)Path)DictListOptionalTupleUnion)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extractz5http://www.openslr.org/resources/1/waves_yesno.tar.gz.F
target_dirforce_downloadurlreturnc                 C   s   t | } | jddd | d }| d }|d }| r&td| d |S t|||d tj|dd	 t	|}t
|| d
 W d   n1 sJw   Y  |  |S )a  Download and untar the dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are saved to target_dir/waves_yesno/*.wav
    :param force_download: Bool, if True, download the tar file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    :return: the path to downloaded and extracted directory with data.
    Tparentsexist_okwaves_yesnozwaves_yesno.tar.gzz
.completedzSkipping - z exists.)filenamer   )ignore_errors)pathN)r   mkdiris_filelogginginfor   shutilrmtreetarfileopenr   touch)r   r   r   extracted_dirtar_pathcompleted_detectortar r+   H/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/yesno.pydownload_yesno)   s   r-   datasetc              
      s   ddd g }g }| D ]Q}|j d}t|dksJ t|ddhddhks/J d|  fd	d
|D }d|}t| }|	| t
|j |j d|jdd|d}|	| q||fS )zBuild a list of Recording and SupervisionSegment from a list
    of sound filenames.

    :param dataset: List[Pathlike], a list of sound filenames
    :return: a tuple containing a list of Recording and a list
        of SupervisionSegment
    NOYES)01_   r1   r2   z
words is: c                    s   g | ]} | qS r+   r+   ).0wword_mapr+   r,   
<listcomp>`   s    z$_prepare_dataset.<locals>.<listcomp> g        r   Hebrew)idrecording_idstartdurationchannellanguagetext)stemsplitlensetunionjoinr   	from_fileabsoluteappendr   r?   )r.   
recordingssupervisions
audio_pathwordsrB   	recordingsegmentr+   r7   r,   _prepare_datasetM   s,   

(

	rR   
corpus_dir
output_dirc                 C   s@  t | } |  sJ d|  |durt |}|jddd t| d}t|dks-J |  |ddd }|ddd }t|d	ksGJ t|d	ksOJ tt}t	d
dg||gD ]A\}}t
|\}}	t|}
t|	}t|
|\}
}t|
| |dur||d| d  |
|d| d  |
|d||< q\|S )aT  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr   z*.wav<            traintestyesno_supervisions_z	.jsonl.gzyesno_recordings_)rL   rM   )r   is_dirr   listglobrE   sortr   dictziprR   r   from_recordingsr   from_segmentsr	   r
   to_file)rS   rT   
wave_files	train_settest_set	manifestsnamer.   rL   rM   recording_setsupervision_setr+   r+   r,   prepare_yesnot   s4   


rm   )N)!__doc__r    r"   r$   collectionsr   pathlibr   typingr   r   r   r   r   lhotser	   r
   lhotse.audior   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   _DEFAULT_URLboolstrr-   rR   rm   r+   r+   r+   r,   <module>   sJ    
$
(