o
    2wi2                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ ddddddZddddddZdgZddgZ ddgZ!g dZ"e"e Z#	d5de
ee$ee$ f  de%fddZ&				 d6d!ede
ee$ee$ f  d"e%d#e$def
d$d%Z'd&d'd(d)d*Z(d+ed,e$deeef fd-d.Z)d/edeeef fd0d1Z*		d7d+edee$ee$ f d2e
e de	e$e	e$eeef f f fd3d4Z+dS )8u2  
About the HI_MIA corpus
HI_MIA is a far-field text-dependent speaker verification data
published by Beijing Shell Shell Technology Co.,Ltd.
The contents are wake-up words "Hi, Mia" in Chinese(ni hao mi ya; 你好，米雅).
It' publicly available on https://www.openslr.org/85

The HI_MIA_CW is a supplemental database of the HI_MIA database.
The specific text of the audios is the HI-MIA confusion words in Chinese,
which are the negative samples for wake-up words "hi, Mia"(ni hao mi ya; 你好, 米雅).
It' publicly available on https://www.openslr.org/120
    N)defaultdict)Path)DictOptionalSequenceTupleUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extractztrain.tar.gzz
dev.tar.gzztest_v2.tar.gzzdata.tgzresource.tgz)traindevtestdataresourcer   r   r   16k_wav_filer   cw_testr   )r   r   r   autodataset_partsreturnc                    s\   t ddg  dtdtf fdd}t| tr||  dS t| ts#J | D ]}|| q%dS )Nr   himiadataset_namer   c                    s   |  v sJ |  d  dS )Nz8 is not a valid subset. You may want to select one from T )r   valid_dataset_partsr    Q/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/himia.pyvalidate_a_datasetN   s   
z3_validate_dataset_parts.<locals>.validate_a_datasetT)HI_MIA_AND_CW_PARTSstrbool
isinstancetuple)r   r$   r   r    r!   r#   _validate_dataset_partsI   s   

r*   .F http://www.openslr.org/resources
target_dirforce_downloadbase_urlc              	   C   s  t | } | d }t| |dkrtt }n|dkrt}nt|tr$|g}g }|D ]}|dkr3|t7 }q(|| q(dd |D }dd |D }t||D ]\}	}
|	tv rVdnd	}|r\d
nd}| d| }| |	 }||
 }|ru|d |
 }|d }d|	kr|n|j	}|
 rtd|	 d| d qLt| d|	 ||d td|	 d tj|dd |jddd t|}t||d W d   n1 sw   Y  |  qL|S )a  
    Downdload and untar HI_MIA and HI_MIA_CW datasets.
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param dataset_parts: "auto", "himia"
        or a list of splits (e.g. "train", "dev", "test", "cw_test") to download.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to extracted directory with data.
    HiMiar   r   r   c                 S      g | ]}t | qS r    )SOURCE_FILE.0partr    r    r#   
<listcomp>       z"download_himia.<locals>.<listcomp>c                 S   r1   r    )EXTRACTED_FOLDERr3   r    r    r#   r6      r7   TFx   U   /z
.completedr   z$Skipping download and extraction of z	 because z exists.)filenamer.   zExtracting r+   )ignore_errorsparentsexist_ok)pathN)r   r*   HI_MIA_PARTSCW_SOURCE_FILE_LISTr(   r&   appendzipCW_FILESparentis_filelogginginfor   shutilrmtreemkdirtarfileopenr   touch)r-   r   r.   r/   
corpus_dirfiles_to_downloadr   	tar_filesext_folderstar_nameext_nameis_cwurl_suffix_indexurltar_pathcompleted_detector_dircompleted_detectorextracted_dirtarr    r    r#   download_himia^   sZ   



r_   i;' i  i  i?  )r   r   r   r   rQ   r5   c                 C   s|  t d|  |dkrdnd}|dkrdn|}| | d| d }|| d }| s2J | g }g }|dkr<dnd}t|Z}	t|	t| d	D ]I}
|
 }
||  |
  }| }|j}|	d
d }d}| svt 
d|  qKt|}|| t||d|jdd|| d}|| qKW d   n1 sw   Y  t|}t|}t||\}}t|| ||fS )a
  
    Returns the RecodingSet and SupervisionSet given a dataset part.
    :param corpus_dir: Pathlike, the path of the data dir.
    :param part: dataset part, one of ["train", "dev", "test"]
    :return: the RecodingSet and SupervisionSet given a dataset part.
    zProcessing HI_MIA subset: r    
SPEECHDATAwavr;   z.scpzwav/total_r   u   你好米雅zNo such file:         Chineseidrecording_idstartdurationchannellanguagespeakertextN)rI   rJ   rH   rO   r	   _TOTAL_NUM_WAVSstripresolvestemsplitwarningr   	from_filerD   r   rl   r   from_recordingsr   from_segmentsr
   r   )rQ   r5   suffix_pathscp_file_namedir_of_wav_scpwav_scp_path
recordingssupervisionswav_path	wav_scp_f	wav_entry
audio_pathaudio_file_namero   rp   	recordingsegmentrecording_setsupervision_setr    r    r#   _prepare_train_dev_test   sP   







r   corpus_pathc                 C   st  t d g }g }| d }t d|  | d }i }t|ddd"}| D ]}| }d|d	d
 }	|	||d < q'W d
   n1 sGw   Y  t|td ksVJ t|td dD ]A}
||
 }|	 }||
 }|
 swJ | dt|}|| |j}|dd }t||d|jdd|| d}|| q^t|}t|}t||\}}t|| ||fS )z
    Returns the RecodingSet and SupervisionSet of test dataset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :return: the RecodingSet and SupervisionSet of test dataset.
    zProcessing HI_MIA_CW datasetzcw_test/16k_wav_filezSearching wav files in z"cw_test/resource/transcription.txtrzutf-8)encoding    Nr   r   rc   z does not exist.re   rf   rg   rh   )rI   rJ   rO   	readlinesru   joinlenrq   r	   rs   rH   r   rw   rD   rt   r   rl   rr   r   rx   r   ry   r
   r   )r   r~   r   cw_test_pathtranscript_pathtranscript_dictflineidx_transcript
transcriptwav_namer   rp   r   r   ro   r   r   r   r    r    r#   _prepare_cw_test   sP   






r   
output_dirc                 C   s  t | t| } |  sJ d|  |dur"t|}|jddd tt}|dkr-t}n|dkr4t}nt|t	rC|dkr@t
}n|g}t|dd	D ]5}d|krVt| \}}nt| |\}}|durw||d
| d  ||d| d  ||d||< qI|S )a  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: "auto", "himia"
        or a list of splits (e.g. "train", "dev", "test", "cw_test") to download.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr>   r   r   r   z%Process HI_MIA and HI_MIA_CW dataset.)deschimia_supervisions_z	.jsonl.gzhimia_recordings_)r~   r   )r*   r   is_dirrM   r   dictr%   rB   r(   r&   CW_PARTSr	   r   r   to_file)rQ   r   r   	manifestsr5   r   r   r    r    r#   prepare_himia  s6   

r   )r   )r+   r   Fr,   )r   N),__doc__rI   rK   rN   collectionsr   pathlibr   typingr   r   r   r   r   	tqdm.autor	   lhotser
   r   lhotse.audior   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   r2   r8   r   rC   rF   rB   r%   r&   r'   r*   r_   rq   r   r   r   r    r    r    r#   <module>   s    
	

O

98