o
    Si                     @   s
  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ defddZ			ddedededefddZ	ddedee de
ee
eeeef f f fddZ dS )z
Stcmds is an open-source  Chinese Mandarin corpus by Surfingtech (www.surfing.ai), containing utterances from 855 speakers, 102600 utterances;
Publicly available on https://www.openslr.org/resources/38
ST-CMDS (110 hours)

    N)defaultdict)Path)DictOptionalUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extractlinec                 C   s   |  dd} |  } | S )u  
    Modified from https://github.com/wenet-e2e/wenet/blob/main/examples/multi_cn/s0/local/stcmds_data_prep.sh#L42
    paste -d' ' $data/utt.list $data/text.list |    sed 's/，//g' |    tr '[a-z]' '[A-Z]' |    awk '{if (NF > 1) print $0;}' > $data/train/text
    u   ， )replaceupper)r    r   I/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/stcmds.pytext_normalize   s   r   .F http://www.openslr.org/resources
target_dirforce_downloadbase_urlreturnc              	   C   s   | d}t | } | jddd | d }d}|fD ]R}| | }||dd  }|d }	|	 r9td	|	 d
 qt| d| ||d tj|dd t	|}
t
|
|d W d   n1 sbw   Y  |	  q|S )aR  
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    z/38Tparentsexist_okstcmdszST-CMDS-20170001_1-OS.tar.gzNiz
.completedzSkipping download of because z exists./)filenamer   )ignore_errors)path)r   mkdiris_filelogginginfor   shutilrmtreetarfileopenr   touch)r   r   r   url
corpus_dirdataset_tar_nametar_nametar_pathextracted_dircompleted_detectortarr   r   r   download_stcmds$   s*   


r7   r0   
output_dirc                 C   s  t | } |  sJ d|  |durt |}|jddd | d }i }|dD ]1}|j}td|  t|dd	d
}|D ]
}t|}|||< q@W d   n1 sUw   Y  q)t	t
}dg}	t|	ddD ]}
td|
  g }g }|dD ]T}|j}dt|dd }||vrtd|  t| d q{|| }| std|  q{t|}|| t||d|jdd|| d}|| q{t|}t|}t||\}}t|| |dur||d|
 d  ||d|
 d  ||d||
< qh|S )aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr   zST-CMDS-20170001_1-OSz**/*.txtzprocessing stcmds transcript  rzutf-8)encodingtrainz=process stcmds audio, it needs waste about 2169 seconds time.)desczProcessing stcmds z**/*.wavr         zNo transcript: z has no transcriptzNo such file: g        r   Chinese)idrecording_idstartdurationchannellanguagespeakertextstcmds_supervisions_z	.jsonl.gzstcmds_recordings_)
recordingssupervisions)r   is_dirr&   rglobstemr(   r)   r-   r   r   dictr   joinlistwarningr'   r
   	from_fileappendr   rC   stripr   from_recordingsr   from_segmentsr   r	   to_file)r0   r8   r%   transcript_dict	text_pathidxfr   	manifestsdataset_partspartrJ   rK   
audio_pathrF   rG   	recordingsegmentrecording_setsupervision_setr   r   r   prepare_stcmdsG   sv   	








re   )r   Fr   )N)!__doc__r(   osr*   r,   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   strr   boolr7   re   r   r   r   r   <module>   sD    
$