o
    Si                     @   s   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ defddZ	ddedee deeeee	eef f f fddZdS )a  
About the CDSD (Chinese Dysarthric Speech Database) dataset:

    This database comprises speech data from 24 participants with dysarthria. 
    
    Among these participants, one recorded an additional 10 hours of speech data, while each recorded one hour, resulting in 34 hours of speech material. 

    To accommodate participants with varying cognitive levels, the text pool primarily consists of content from the AISHELL-1 dataset and speeches by primary and secondary school students. When participants read these texts, they must use a mobile device or the ZOOM F8n multi-track field recorder to record their speeches. 
    
    In this paper, the authors elucidate the data collection and annotation processes and present an approach for establishing a baseline for dysarthric speech recognition. Furthermore, the authors conducted a speaker-dependent dysarthric speech recognition experiment using an additional 10 hours of speech data from one of the participants. 
    
arXiv link: https://arxiv.org/abs/2310.15930v1
    N)defaultdict)Path)DictOptionalUnion)tqdm)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikelinec                 C   sH   |  dd} |  dd} |  dd} |  dd} |  d	d
} |  } | S )u   
    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/aishell_data_prep.sh#L54
    sed 's/ａ/a/g' | sed 's/ｂ/b/g' |    sed 's/ｃ/c/g' | sed 's/ｋ/k/g' |    sed 's/ｔ/t/g' > $dir/transcripts.t

    u   ａau   ｂbu   ｃcu   ｋku   ｔt)replaceupper)r    r   G/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/cdsd.pytext_normalize   s   r   
corpus_dir
output_dirreturnc                 C   s2  t | } |  sJ d|  |durt |}|jddd tt}ddg}|D ]}td|  g }g }| d |  d	 }i }|d
D ]2}	t|	ddd!}
|
	 D ]}|
 jdd\}}t|}|||< qTW d   n1 ssw   Y  qF| d |  d }t|dddD ]R}|j}|jd }||vrtd|  t| d q|| }| std|  qt|}|| t||d|jdd||
 ddd}|| qt|}t|}t||\}}t|| |dur||d| d  ||d | d  ||d!||< q(|S )"aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NT)parentsexist_ok1h10hzProcessing CDSD subset: after_cattingTextz**/*.txtrzutf-8)encoding   )maxsplitAudioz**/*.wavzProcessing audio)desczNo transcript: z has no transcript.zNo such file: g        r   Chinese  )idrecording_idstartdurationchannellanguagespeakertextcdsd_supervisions_z	.jsonl.gzcdsd_recordings_)
recordingssupervisions)r   is_dirmkdirr   dictlogginginforglobopen	readlinesstripsplitr   r   stempartswarningis_filer	   	from_fileappendr   r0   r   r
   from_recordingsr   from_segmentsr   r   to_file)r   r   	manifestsdataset_partspartr7   r8   txt_pathtranscript_dict	text_pathfr   idx_transcriptcontentwav_path
audio_pathidxr3   r4   	recordingsegmentrecording_setsupervision_setr   r   r   prepare_cdsd.   sp   	







r\   )N)__doc__r<   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   lhotse.audior	   r
   	lhotse.qar   lhotse.supervisionr   r   lhotse.utilsr   strr   r\   r   r   r   r   <module>   s(    