o
    SiG                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ 			ddedededefddZ	 defddZ	ddede
e de	ee	eeeef f f fddZdS )z
Thchs is an open-source Chinese Speech Corpus Released by CSLT@Tsinghua University.
Publicly available on https://www.openslr.org/resources/18
THCHS-30 (26 hours)

    N)defaultdict)Path)DictOptionalUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extract.F http://www.openslr.org/resources
target_dirforce_downloadbase_urlreturnc              	   C   s   | d}t | } | jddd | d }d}|fD ]U}| | }||dd  }|d }	|	 r<td	| d
|	 d qt| d| ||d tj|dd t	|}
t
|
|d W d   n1 sew   Y  |	  q|S )aR  
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    z/18Tparentsexist_okthchszdata_thchs30.tgzNz
.completedzSkipping download z	 because z exists./)filenamer   )ignore_errors)path)r   mkdiris_filelogginginfor   shutilrmtreetarfileopenr   touch)r   r   r   url
corpus_dirdataset_tar_nametar_nametar_pathextracted_dircompleted_detectortar r1   K/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/thchs_30.pydownload_thchs_30   s.   


r3   linec                 C   s   |  dd} |  } | S )Nz l = )replaceupper)r4   r1   r1   r2   text_normalizeF   s   r8   r*   
output_dirc                 C   s(  t | } |  sJ d|  |durt |}|jddd | d d }i }|dD ]4}t |jj}t|dd	d
}t|D ]\}}|dkrOt|}|||< q?W d   n1 sZw   Y  q+tt	}	g d}
t
|
ddD ]}td|  g }g }| d |  }|dD ]P}|j}|dd }||vrtd|  t| d q|| }| std|  qt|}|| t||d|jdd|| d}|| qt|}t|}t||\}}t|| |dur
||d| d  ||d| d  ||d|	|< qn|	S )aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr   data_thchs30dataz**/*.wav.trnrzutf-8)encodingr   )traindevtestz2Process thchs_30 audio, it takes about 19 seconds.)desczProcessing thchs_30 subset: z**/*.wav_zNo transcript: z has no transcript.zNo such file: g        Chinese)idrecording_idstartdurationchannellanguagespeakertextthchs_30_supervisions_z	.jsonl.gzthchs_30_recordings_)
recordingssupervisions)r   is_dirr    rglobstemr'   	enumerater8   r   dictr   r"   r#   splitwarningr!   r
   	from_fileappendr   rG   stripr   from_recordingsr   from_segmentsr   r	   to_file)r*   r9   r   transcript_dict	text_pathidxfline_idxr4   	manifestsdataset_partspartrN   rO   wav_path
audio_pathrJ   rK   	recordingsegmentrecording_setsupervision_setr1   r1   r2   prepare_thchs_30L   s~   	







rk   )r   Fr   )N) __doc__r"   r$   r&   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   boolstrr3   r8   rk   r1   r1   r1   r2   <module>   sD    
%	