o
    Si                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZmZ ddlmZmZ ddlmZmZ ddlmZmZmZ 	
ddede	e defddZ	ddede	e deee
eef f fddZdS )zq
See https://en.data-baker.com/datasets/freeDatasets/

It is a Chinese TTS dataset, containing 12 hours of data.
    N)Path)DictOptionalUnion)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extract.F
target_dirforce_downloadreturnc                 C   s   t | } | jddd d}| | d }| | }|d }| r.td| d| d |S td	| d||d
 tj|dd t	|}t
|| d W d    n1 sVw   Y  |  |S )NTparentsexist_okBZNSYPz.tar.bz2z
.completedz	Skipping z	 because z exists.z6https://huggingface.co/openspeech/BZNSYP/resolve/main/)filenamer   )ignore_errors)path)r   mkdiris_filelogginginfor   shutilrmtreetarfileopenr   touch)r   r   dataset_nametar_path
corpus_dircompleted_detectortar r(   K/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/baker_zh.pydownload_baker_zh   s(   
r*   r%   
output_dirc                 C   s  t | } |  sJ d|  |durt |}|jddd | d d }| s/t| dg }g }td td	}t	|f}zT	 t
| }t
| }|jdd
d\}	}
t|d|
}| d |	 d }| svtd|  qCt|}t|	|	d|jddd|
||dd	}|| || qD ty   Y nw W d   n1 sw   Y  t|}t|}t||\}}t|| |dur||d  ||d  ||dS )a/  
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
    zNo such directory: NTr   ProsodyLabelingz000001-010000.txtz does not existz)Started preparing. It may take 30 secondsz#[12345]   )maxsplit Wavez.wavzNo such file: g        r   Chinesefemale)pinyinnormalized_text)	idrecording_idstartdurationchannellanguagegendertextcustomz"baker_zh_supervisions_all.jsonl.gzz baker_zh_recordings_all.jsonl.gz)
recordingssupervisions)r   is_dirr   r   
ValueErrorr   r   recompiler!   nextstripsplitsubwarningr   	from_filer
   r8   appendStopIterationr	   from_recordingsr   from_segmentsr   r   to_file)r%   r+   labeling_filer>   r?   patternffirstr3   r6   original_textr4   
audio_path	recordingsegmentrecording_setsupervision_setr(   r(   r)   prepare_baker_zh-   sf   










rY   )r   F)N)__doc__r   rB   r   r    pathlibr   typingr   r   r   lhotser   r   lhotse.audior   r	   lhotse.supervisionr
   r   lhotse.utilsr   r   r   boolr*   strrY   r(   r(   r(   r)   <module>   s6    
