o
    Si                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZ defddZ	ddede	e deeeee
eef f f fddZdS )zL
optional TAL_ASR (100 hours) if available(https://ai.100tal.com/dataset).

    N)defaultdict)Path)DictOptionalUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikelinec                 C   s&   |  dd} tdd| } |  } | S )u  
    It is from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/tal_data_prep.sh#L57
      sed 's/Ａ/A/g' | sed 's/#//g' | sed 's/=//g' | sed 's/、//g' |     sed 's/，//g' | sed 's/？//g' | sed 's/。//g' | sed 's/[ ][ ]*$//g'    u   ＡAu   #|=|、|，|？|。|[|] )replaceresubupper)r    r   J/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/tal_asr.pytext_normalize   s   r   
corpus_dir
output_dirreturnc                 C   s"  t | } |  sJ d|  |durt |}|jddd | d d d }i }t|dd	d
&}| D ]}| }d|dd }t|}|||d < q4W d   n1 sXw   Y  tt	}g d}	t
|	ddD ]}
td|
  g }g }| d d |
  }|dD ]N}|j}|jd }||vrtd|  t| d q|| }| std|  qt|}|| t||d|jdd|| d}|| qt|}t|}t||\}}t|| |dur||d|
 d  ||d|
 d  ||d||
< qk|S )aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NT)parentsexist_okaisolution_data
transcriptztranscript.txtrzutf-8)encoding    r   )traindevtestz2Process tal_asr audio, it takes about 447 seconds.)desczProcessing tal_asr subset: wavz**/*.wavzNo transcript: z has no transcript.zNo such file: g        Chinese)idrecording_idstartdurationchannellanguagespeakertexttal_asr_supervisions_z	.jsonl.gztal_asr_recordings_)
recordingssupervisions)r   is_dirmkdiropen	readlinessplitjoinr   r   dictr   logginginforglobstempartswarningis_filer
   	from_fileappendr   r.   stripr   from_recordingsr   from_segmentsr   r	   to_file)r   r   transcript_pathtranscript_dictfr   idx_transcriptcontent	manifestsdataset_partspartr5   r6   wav_path
audio_pathidxr1   r2   	recordingsegmentrecording_setsupervision_setr   r   r   prepare_tal_asr   sx   	








rZ   )N)__doc__r>   r   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   strr   rZ   r   r   r   r   <module>   s(    