o
    2wiQ                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZ 			ddedee dee defddZ	ddedee de
ee
eeeef f f fddZ dS )a$  
About the aidatatang_200zh corpus

It is a Chinese Mandarin speech corpus by Beijing DataTang Technology Co., Ltd,
containing 200 hours of speech data from 600 speakers. The transcription
accuracy for each sentence is larger than 98%.

It is publicly available on https://www.openslr.org/62
    N)defaultdict)Path)DictOptionalUnion)tqdm)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extract.F http://www.openslr.org/resources
target_dirforce_downloadbase_urlreturnc              
   C   sB  | d}t | } | jddd d}| | }| }||dd  }|d }| r3td| d	 |S t| d
| ||d tj|dd t	|}	t
|	|d W d   n1 s\w   Y  |d }
dD ]3}|
| }td|  t|D ]}t	|| }	t
|	|d W d   n1 sw   Y  qzqg|  |S )aP  
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    z/62Tparentsexist_okzaidatatang_200zh.tgzNz
.completedzSkipping because z exists./)filenamer   )ignore_errors)pathcorpus)testdevtrainzProcessing )r   mkdiris_filelogginginfor   shutilrmtreetarfileopenr   oslistdirtouch)r   r   r   urltar_nametar_path
corpus_dirextracted_dircompleted_detectortarwav_dirsdsub_tar_name r9   \/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/aidatatang_200zh.pydownload_aidatatang_200zh   s<   
r;   r1   
output_dirc                 C   sT  t | } |  sJ d|  | d }| sJ d|  |dur-t |}|jddd |d }| s<J d| i }t|dd	d
,}| D ]}| }d|dd }|dd}|	 }|||d < qJW d   n1 stw   Y  t
t}	g d}
t|
ddD ]}td|  g }g }|d | }|dD ]N}|j}|jd }||vrtd|  t| d q|| }| std|  qt|}|| t||d|jdd|| d}|| qt|}t|}t||\}}t|| |dur ||d| d  ||d| d  ||d|	|< q|	S ) aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: aidatatang_200zhNTr   z+transcript/aidatatang_200_zh_transcript.txtzNo such file: rzutf-8)encoding    u   ＡAr   )r!   r    r"   z6Process aidatatang audio, it takes about 2559 seconds.)descz-Processing  prepare_aidatatang_200zh subset: r   z**/*.wavzNo transcript: z has no transcript. g        Chinese)idrecording_idstartdurationchannellanguagespeakertextaidatatang_supervisions_z	.jsonl.gzaidatatang_recordings_)
recordingssupervisions) r   is_dirr#   r$   r*   	readlinessplitjoinreplaceupperr   dictr   r%   r&   rglobstempartswarningr	   	from_fileappendr   rI   stripr
   from_recordingsr   from_segmentsr   r   to_file)r1   r<   r7   transcript_pathtranscript_dictflineidx_transcriptcontent	manifestsdataset_partspartrP   rQ   wav_path
audio_pathidxrL   rM   	recordingsegmentrecording_setsupervision_setr9   r9   r:   prepare_aidatatang_200zhF   s   	








rs   )r   Fr   )N)!__doc__r%   r+   r'   r)   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   lhotse.audior	   r
   	lhotse.qar   lhotse.supervisionr   r   lhotse.utilsr   r   r   boolstrr;   rs   r9   r9   r9   r:   <module>   sD    

+