o
    Si                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZ defddZ		ddede	e dedeeeee
eef f f fddZdS )zy
optional TAL_CSASR(587 hours) if available(https://ai.100tal.com/dataset).
It is a mandarin-english code-switch corpus.
    N)defaultdict)Path)DictOptionalUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikelinec                 C   s   |  dd} |  dd} |  dd} |  dd} |  d	d
} |  dd} |  dd} |  dd} |  dd} tdd| } |  dd} |  dd} |  } | S )us  
    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/tal_mix_data_prep.sh#L52
    sed 's/Ａ/A/g' | sed 's/Ｃ/C/g' | sed 's/Ｄ/D/g' | sed 's/Ｇ/G/g' |     sed 's/Ｈ/H/g' | sed 's/Ｕ/U/g' | sed 's/Ｙ/Y/g' | sed 's/ａ/a/g' |     sed 's/Ｉ/I/g' | sed 's/#//g' | sed 's/=//g' | sed 's/；//g' |     sed 's/，//g' | sed 's/？//g' | sed 's/。//g' | sed 's/\///g' |     sed 's/！//g' | sed 's/!//g' | sed 's/\.//g' | sed 's/\?//g' |     sed 's/：//g' | sed 's/,//g' | sed 's/"//g' | sed 's/://g' |     sed 's/@//g' | sed 's/-/ /g' | sed 's/、/ /g' | sed 's/~/ /g' |     sed "s/‘/'/g" | sed 's/Ｅ/E/g' | sed "s/’/'/g" | sed 's/《//g' | sed 's/》//g' |     sed "s/[ ][ ]*$//g" | sed "s/\[//g" | sed 's/、//g'
    210_40223_210_6228_1_1533298404_4812267_555 上面是一般现在对然后然后下面呢 HE IS ALWAYS FINISHIＮG
    u   ＡAu   ＣCu   ＤDu   ＧGu   ＨHu   ＵUu   ＹYu   ａau   ＩIuP   #|[=]|；|，|？|。|[/]|！|[!]|[.]|[?]|：|,|"|:|@|-|、|~|《|》|[|]|、|\. u   ＥEu   ＮN)replaceresubupper)r    r    L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/tal_csasr.pytext_normalize   s    r"      
corpus_dir
output_dirnum_jobsreturnc                 C   s$  t | } |  sJ d|  |durt |}|jddd g d}i }|D ]@}| d |  d }t|dd	d
&}| D ]}| }	d|	dd }
t|
}
|
||	d < q=W d   n1 saw   Y  q&tt	}t
|ddD ]}td|  g }| d |  d }tj|d|d}|dD ]C}|j}|}||vrtd|  t| d q|| }| std|  qt||d||dd|| d}|| qt|}t|}t||\}}t|| |dur||d| d  ||d| d  ||d||< qq|S )aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NT)parentsexist_ok)	train_setdev_settest_setTALCS_corpusz	label.txtrzutf-8)encoding r#   r   zDProcess tal_csasr audio, it takes about 4 minutes using 40 cpu jobs.)desczProcessing tal_csasr subset: wavz*.wav)pathpatternr&   z**/*.wavzNo transcript: z has no transcript.zNo such file: g        Chinese)idrecording_idstartdurationchannellanguagespeakertexttal_csasr_supervisions_z	.jsonl.gztal_csasr_recordings_)
recordingssupervisions)r   is_dirmkdiropen	readlinessplitjoinr"   r   dictr   logginginfor   from_dirrglobstemwarningis_filer   r9   stripappendfrom_recordingsr   from_segmentsr   r	   to_file)r$   r%   r&   dataset_partstranscript_dictparttranscript_pathfr   idx_transcriptcontent	manifestsrA   wav_pathr@   
audio_pathidxr<   r=   segmentrecording_setsupervision_setr    r    r!   prepare_tal_csasr3   s|   





rc   )Nr#   )__doc__rI   r   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   strr"   intrc   r    r    r    r!   <module>   s.    "