o
    Si                     @   s
  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ defddZ			ddedededefddZ	ddedee de
ee
eeeef f f fddZ dS )aG  
Magicdata is an open-source Chinese Mandarin speech corpus by Magic Data Technology Co., Ltd.,
containing 755 hours of scripted read speech data from 1080 native speakers of the Mandarin Chinese spoken
in mainland China. The sentence transcription accuracy is higher than 98%.
Publicly available on https://www.openslr.org/68
    N)defaultdict)Path)DictOptionalUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extractlinec                 C   sh  |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  d	d} |  d
d} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  dd} |  } | S )u  
    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/magicdata_data_prep.sh#L41
    sed 's/！//g' | sed 's/？//g' |\
    sed 's/，//g' | sed 's/－//g' |\
    sed 's/：//g' | sed 's/；//g' |\
    sed 's/　//g' | sed 's/。//g' |\
    sed 's/`//g' | sed 's/,//g' |\
    sed 's/://g' | sed 's/?//g' |\
    sed 's/\///g' | sed 's/·//g' |\
    sed 's/\"//g' | sed 's/“//g' |\
    sed 's/”//g' | sed 's/\\//g' |\
    sed 's/…//g' | sed "s///g" |\
    sed 's/、//g' | sed "s///g" | sed 's/《//g' | sed 's/》//g' |\
    sed 's/\[//g' | sed 's/\]//g' | sed 's/FIL//g' | sed 's/SPK//' |\
    tr '[a-z]' '[A-Z]' |\
    u   ！ u   ？u   ，u   －u   ：u   ；u   　u   。`,:?/   ·"u   “u   ”\u   …u   、z[ []u   《 u   《u   》FILSPKu   ﻿)replaceupper)r    r!   L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/magicdata.pytext_normalize   s>   r#   .F http://www.openslr.org/resources
target_dirforce_downloadbase_urlreturnc              	   C   s   | d}t | } | jddd | d }d}d}d}|||fD ]U}| | }	||dd	  }
|
d
 }| rBtd| d| d qt| d| |	|d tj|
dd t	|	}t
||d W d   n1 skw   Y  |  q|S )aR  
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    z/68Tparentsexist_ok	magicdataztrain_set.tar.gzzdev_set.tar.gzztest_set.tar.gzNiz
.completedzSkipping download z	 because z exists.r   )filenamer'   )ignore_errors)path)r   mkdiris_filelogginginfor   shutilrmtreetarfileopenr   touch)r&   r'   r(   url
corpus_dirtrain_tar_namedev_tar_nametest_tar_nametar_nametar_pathextracted_dircompleted_detectortarr!   r!   r"   download_magicdataK   s2   

rD   r;   
output_dirc                 C   sb  t | } |  sJ d|  |durt |}|jddd g d}i }|D ]^}| |  d }t|ddd	F}|D ];}|d
rItd|  q9| }|d dksY|d dkrZq9|d dd }	d|dd }
t	|
}
|
||	< q9W d   n1 sw   Y  q&t
t}t|ddD ]}td|  g }g }| |  }|dD ]N}|j}|jd }||vrtd|  t| d q|| }| std|  qt|}|| t||d|jdd|| d}|| qt|}t|}t||\}}t|| |dur'||d| d  ||d| d  ||d ||< q|S )!aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr*   )traindevtestz	TRANS.txtrzutf-8)encodingUtteranceIDzline is r   z16_4013_20170819121429.wavz18_1565_20170712000170.wavr$       z/Process magicdata audio, it takes 6818 seconds.)desczProcessing magicdata subset: z**/*.wavzNo transcript: z has no transcript.zNo such file: g        Chinese)idrecording_idstartdurationchannellanguagespeakertextmagicdata_supervisions_z	.jsonl.gzmagicdata_recordings_)
recordingssupervisions)r   is_dirr1   r8   
startswithr3   r4   splitjoinr#   r   dictr   rglobstempartswarningr2   r
   	from_fileappendr   rT   stripr   from_recordingsr   from_segmentsr   r	   to_file)r;   rE   dataset_partstranscript_dictpartr0   f1r   idx_transcriptidx_content	manifestsr[   r\   wav_path
audio_pathidxrW   rX   	recordingsegmentrecording_setsupervision_setr!   r!   r"   prepare_magicdatar   s   	











r{   )r$   Fr%   )N)!__doc__r3   osr5   r7   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   strr#   boolrD   r{   r!   r!   r!   r"   <module>   sD    3
(