o
    2wip                     @   s   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ dedefddZ		ddedee dedeeeee	eef f f fddZdS )zM
AISHELL2 (~1000 hours) if available(https://www.aishelltech.com/aishell_2).
    N)defaultdict)Path)DictOptionalUnion)tqdm)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikelinereturnc                 C   s   |  dd} |  dd} |  dd} |  dd} |  d	d
} |  dd
} |  dd} |  } 	 g }t| } t| D ]#\}}|dkrWd| |d    krOdkrWn n| dd
}|| q9d
|} |  } | S )zz
    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell2_data_prep.sh#L50

    u   ＡAu   ＴTu   ＭMu   𫖯u   頫u   ， ?- 'u   一   u   鿿)replaceupperlist	enumerateappendjoin)r   new_lineichar r#   T/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/aishell2.pytext_normalize   s$   /(
r%   r   
corpus_dir
output_dirnum_jobsc                 C   sV  t | } |  sJ d|  |durt |}|jddd tt}g d}t|ddD ]}td|  |d	krO| d
 d d d }| d
 d d d }n| d
 d |  d }| d
 d |  d }i }t|ddd$}	|	D ]}
|
	 }d
|dd }t|}|||d < qqW d   n1 sw   Y  g }tj|d|d}|dD ]F}|j}|jd }||vrtd|  t| d q|| }| std|  qt||d||dd|| d}|| qt|}t|}t||\}}t|| |dur!||d| d   ||d!| d   ||d"||< q,|S )#aY  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part,
             and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NT)parentsexist_ok)traindevtestzEProcess aishell2 audio, it takes about 55  minutes using 40 cpu jobs.)desczProcessing aishell2 subset: r+   z	AISHELL-2iOSdataz	trans.txtwavrzutf-8)encodingr   r   r   z*.wav)pathpatternr(   z**/*.wavzNo transcript: z has no transcript.zNo such file: g        Chinese)idrecording_idstartdurationchannellanguagespeakertextaishell2_supervisions_z	.jsonl.gzaishell2_recordings_)
recordingssupervisions)r   is_dirmkdirr   dictr   logginginfoopensplitr   r%   r
   from_dirrglobstempartswarningis_filer   r;   stripr   from_recordingsr   from_segmentsr   r   to_file)r&   r'   r(   	manifestsdataset_partsparttranscript_pathwav_pathtranscript_dictfr   idx_transcriptcontentrC   rB   
audio_pathidxr>   r?   segmentrecording_setsupervision_setr#   r#   r$   prepare_aishell2Z   s~   






rc   )Nr   )__doc__rG   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   lhotse.audior	   r
   	lhotse.qar   lhotse.supervisionr   r   lhotse.utilsr   strr%   intrc   r#   r#   r#   r$   <module>   s.    I