o
    2wi(                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZ defddZ			ddedededefddZ 	ddedee de
ee
eeeef f f fddZ!dS )z
About the Aishell corpus
Aishell is an open-source Chinese Mandarin speech corpus published by Beijing Shell Shell Technology Co.,Ltd.
publicly available on https://www.openslr.org/33
    N)defaultdict)Path)DictOptionalUnion)tqdm)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extractlinec                 C   sH   |  dd} |  dd} |  dd} |  dd} |  d	d
} |  } | S )u   
    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell_data_prep.sh#L54
    sed 's/ａ/a/g' | sed 's/ｂ/b/g' |    sed 's/ｃ/c/g' | sed 's/ｋ/k/g' |    sed 's/ｔ/t/g' > $dir/transcripts.t

    u   ａau   ｂbu   ｃcu   ｋku   ｔt)replaceupper)r    r   S/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/aishell.pytext_normalize   s   r   .F http://www.openslr.org/resources
target_dirforce_downloadbase_urlreturnc              
   C   sB  | d}t | } | jddd | d }d}d}||fD ]}| | }||dd  }	|	d	 }
|
 r?td
| d|
 d qt| d| ||d tj|	dd t	|}t
||d W d   n1 shw   Y  ||kr|	d }t|D ]}t	|| }t
||d W d   n1 sw   Y  qz|
  q|S )aR  
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    z/33Tparentsexist_okaishellzdata_aishell.tgzzresource_aishell.tgzNz
.completedzSkipping download of z	 because z exists./)filenamer   )ignore_errors)pathwav)r   mkdiris_filelogginginfor   shutilrmtreetarfileopenr   oslistdirtouch)r   r   r    url
corpus_dirdataset_tar_nameresources_tar_nametar_nametar_pathextracted_dircompleted_detectortarwav_dirsub_tar_namer   r   r   download_aishell)   s>   

rB   r8   
output_dirc                 C   s"  t | } |  sJ d|  |durt |}|jddd | d }i }t|ddd&}| D ]}| }d	|d
d }t|}|||d < q0W d   n1 sTw   Y  tt	}g d}	t
|	ddD ]}
td|
  g }g }| d d |
  }|dD ]R}|j}|jd }||vrtd|  t| d q|| }| std|  qt|}|| t||d|jdd|| d	dd}|| qt|}t|}t||\}}t|| |dur||d|
 d  ||d|
 d  ||d||
< qg|S )aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr"   z3data_aishell/transcript/aishell_transcript_v0.8.txtrzutf-8)encoding    r   )traindevtestz2Process aishell audio, it takes about 102 seconds.)desczProcessing aishell subset: data_aishellr+   z**/*.wavzNo transcript: z has no transcript.zNo such file: g        Chinese )idrecording_idstartdurationchannellanguagespeakertextaishell_supervisions_z	.jsonl.gzaishell_recordings_)
recordingssupervisions) r   is_dirr,   r3   	readlinessplitjoinr   r   dictr   r.   r/   rglobstempartswarningr-   r	   	from_fileappendr   rS   stripr   r
   from_recordingsr   from_segmentsr   r   to_file)r8   rC   transcript_pathtranscript_dictfr   idx_transcriptcontent	manifestsdataset_partspartrZ   r[   wav_path
audio_pathidxrV   rW   	recordingsegmentrecording_setsupervision_setr   r   r   prepare_aishellT   sx   	







rz   )r   Fr   )N)"__doc__r.   r4   r0   r2   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   lhotse.audior	   r
   	lhotse.qar   lhotse.supervisionr   r   lhotse.utilsr   r   r   strr   boolrB   rz   r   r   r   r   <module>   sF    
,