o
    Si                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ 			ddedededefddZ	ddedee de
ee
eeeef f f fddZdS )z
Primewords is an open-source Chinese Mandarin corpus released by Shanghai Primewords Co. Ltd.
Publicly available on https://www.openslr.org/47/
Primewords (99 hours)
    N)defaultdict)Path)DictOptionalUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extract.F http://www.openslr.org/resources
target_dirforce_downloadbase_urlreturnc              	   C   s   | d}t | } | jddd | d }d}|fD ]U}| | }||dd  }|d }	|	 r<td	| d
|	 d qt| d| ||d tj|dd t	|}
t
|
|d W d   n1 sew   Y  |	  q|S )aR  
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    z/47Tparentsexist_ok
primewordszprimewords_md_2018_set1.tar.gzNiz
.completedzSkipping download z	 because z exists./)filenamer   )ignore_errors)path)r   mkdiris_filelogginginfor   shutilrmtreetarfileopenr   touch)r   r   r   url
corpus_dirdataset_tar_nametar_nametar_pathextracted_dircompleted_detectortar r0   M/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/primewords.pydownload_primewords   s.   


r2   r)   
output_dirc                 C   s"  t | } |  sJ d|  |durt |}|jddd | d d }i }i }t|ddd	+}t|}|D ]}|d
 }|d dd }	|d }
|||	< |
||	< q7W d   n1 s]w   Y  tt}dg}t	|ddD ]}t
d|  g }g }| d d }|dD ]M}|j}|| }||vrt
d|  t
| d q|| }| st
d|  qt|}|| t||d|jdd|| d}|| qt|}t|}t||\}}t|| |dur||d| d  ||d| d  ||d||< qo|S )aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr   primewords_md_2018_set1zset1_transcript.jsonrzutf-8)encodingtextfiler   r   user_idtrainz.Process primewords audio, it takes 35 seconds.)desczProcessing primewords  subset: audio_filesz**/*.wavzNo transcript: z has no transcript.zNo such file: g        Chinese)idrecording_idstartdurationchannellanguagespeakerr7   primewords_supervisions_z	.jsonl.gzprimewords_recordings_)
recordingssupervisions)r   is_dirr   r&   jsonloadsplitr   dictr   r!   r"   rglobstemwarningr    r
   	from_fileappendr   rA   stripr   from_recordingsr   from_segmentsr   r	   to_file)r)   r3   transcript_pathtranscript_dictspeaker_dictfdatauttcontentuttidr9   	manifestsdataset_partspartrG   rH   wav_path
audio_pathidxrD   r7   	recordingsegmentrecording_setsupervision_setr0   r0   r1   prepare_primewords:   s|   	

	







ri   )r   Fr   )N) __doc__rJ   r!   r#   r%   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   boolstrr2   ri   r0   r0   r0   r1   <module>   sB    
%