o
    Si                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ dZg dZddededefddZ		ddedeeee f dedeeeeeeef f f fddZdedefddZdS )aW  
Multi-Domain Cantonese Corpus (MDCC), consists of 73.6 hours of clean read speech paired with 
transcripts, collected from Cantonese audiobooks from Hong Kong. It comprises philosophy, 
politics, education, culture, lifestyle and family domains, covering a wide range of topics. 

Manuscript can be found at: https://arxiv.org/abs/2201.02419
    N)Path)DictSequenceUnion)tqdm)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikeis_module_availablezFhttps://drive.google.com/file/d/1epfYMMhXdBKA6nxPgUugb2Uj4DllSxkn/view)trainvalidtestF
target_dirforce_downloadreturnc                 C   s   t dstdddl}t| } | jddd | d }|d}|s/| r/t| d	 ntd
t	  |j
t	t|ddd | rH|rotd| d|   t|}|j| d W d   |S 1 sjw   Y  |S )a5  
    Downloads the MDCC data from the Google Drive and extracts it.
    :param target_dir: the directory where MDCC data will be saved.
    :param force_download: if True, it will download the MDCC data even if it is already present.
    :return: the path to downloaded and extracted directory with data.
    gdownz0Please run 'pip install gdown' to download MDCC.r   NT)parentsexist_okdatasetz.zipz# already exists. Skipping download.zRunning: gdown --fuzzy F)fuzzyquietzExtracting z to path)r   
ValueErrorr   r   mkdirwith_suffixexistslogginginfoMDCC_URLdownloadstrzipfileZipFile
extractall)r   r   r   
corpus_dir
corpus_zipzf r,   G/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/mdcc.pydownload_mdcc   s&   

r.   allr)   dataset_parts
output_dirc                 C   s  t | } | d }| sJ d| d|  di }|dks#|d dkr&t}nt|tr9|tv s6J d| |g}|D ]}g }g }| d| d	 }| sXJ d| d
|  dt|d}	|	 }
|
dd }
W d   n1 srw   Y  t|
d| ddD ]@}|	 
d\}}}}|t |j }| | }tt |}tj||d}|| t||d|jd| 	 |dd}|| qt|}t|}t||\}}t||d |durt |}|jddd ||d| d  ||d| d  ||d||< q;|S )ac  
    Create RecordingSet and SupervisionSet manifests for MDCC from a raw corpus distribution.

    :param corpus_dir: Pathlike, the path to the extracted corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    audiozMissing z in .r/   r   zUnknown dataset part: cnt_asr_z_metadata.csvz metadata in r   NzProcessing z	 metadata)desc,)recording_idg        yue)idr9   startdurationchanneltextgenderlanguage)
recordingssupervisionsT)r   r   mdcc_recordings_z	.jsonl.gzmdcc_supervisions_)r   is_dir
MDCC_PARTS
isinstancer%   is_fileopen	readlinesr   stripsplitnamemake_recording_idr   	from_fileappendr   r=   	read_textr	   from_recordingsr   from_segmentsr
   r   r   to_file)r)   r0   r1   	audio_dir	manifestspartrB   rC   metadataflinesline
audio_path	text_pathr@   _r9   	recordingsupervision_segmentr,   r,   r-   prepare_mdcc;   sb   





rb   r   c                 C   s   d| j  S )Nmdcc_)stemr   r,   r,   r-   rO      s   rO   )F)r/   N)__doc__r!   r&   pathlibr   typingr   r   r   	tqdm.autor   lhotser   lhotse.audior   r	   	lhotse.qar
   lhotse.supervisionr   r   lhotse.utilsr   r   r#   rG   boolr.   r%   rb   rO   r,   r,   r,   r-   <module>   s4    "
N