o
    2wi<                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ 	ddedefddZ	ddedee de
ee
eeeef f f fddZdS )a  
About the XBMU-AMDO31 corpus
XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University.
publicly available on https://huggingface.co/datasets/syzym/xbmu_amdo31

XBMU-AMDO31 dataset is a speech recognition corpus of Amdo Tibetan dialect.
The open source corpus contains 31 hours of speech data and resources related
to build speech recognition systems,including transcribed texts and a Tibetan
pronunciation lexicon.
(The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused
for the Amdo dialect because of the uniformity of the Tibetan language)
The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR).
It was recorded by 66 native speakers of Amdo Tibetan, and the recorded audio was processed and manually inspected.
The dataset has three splits: train, evaluation (dev) and test.Each speaker had approximately 450 sentences,
with a small number of individuals having fewer than 200 sen.

Subset	Hours	Male	Female	Remarks
Train   25.41   27      27      18539 sentences recorded by 54 speakers
Dev     2.81    2       4       2050 sentences recorded by 6 speakers
Test    2.85    3       3       2041 sentences recorded by 6 speakers

Licensing Information
This dataset is distributed under CC BY-SA 4.0.
    N)defaultdict)Path)DictOptionalUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeis_module_availablesafe_extract.
target_dirreturnc              
   C   sD  d}t | } | jddd | d }|d d }d}d}d	}| sTtd
r,ddlm} ntdtd z|	|| W n t
yN } zt|  d}~ww td |||fD ]F}	||	 }
||	dd  }|d }| rytd| d qYtj|dd t|
}t||d W d   n1 sw   Y  |  qY|S )z
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :return: the path to downloaded and extracted directory with data.
    z1https://huggingface.co/datasets/syzym/xbmu_amdo31Tparentsexist_okxbmu_amdo31datawavztrain.tar.gzz
dev.tar.gzztest.tar.gzgitr   )Repoz}In order to download the xbmu-amdo31 corpus from huggingface, please install the relevant dependencies: pip install gitpythonz(Start downloading the xbmu-amdo31 corpusNDoneiz
.completedzSkipping tar of because z exists.)ignore_errors)path)r   mkdiris_filer   r   r   ImportErrorlogginginfo
clone_from	Exceptionprintshutilrmtreetarfileopenr   touch)r   url
corpus_dirwav_dirtrain_tar_namedev_tar_nametest_tar_namer   errortar_nametar_pathextracted_dircompleted_detectortar r8   W/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/xbmu_amdo31.pydownload_xbmu_amdo31*   sH   


r:   r-   
output_dirc                 C   s<  t | } |  sJ d|  |durt |}|jddd | d }i }t|ddd"}| D ]}| }d	|d
d }|||d < q0W d   n1 sPw   Y  tt}g d}	t	|	ddD ]}
t
d|
  g }g }| d d |
  }d}|dD ]a}|jdd
 }|jd }||vrt
d|  t
| d q|| }| st
d|  qt|}|| |d
7 }tt|d | |d | d|jdd|| d}|| qt|}t|}t||\}}t|| |dur||d|
 d  ||d|
 d  ||d||
< qc|S ) aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr   z$data/transcript/transcript_clean.txtrzutf-8)encoding    r   )traindevtestzProcess xbmu_amdo31 audio.)desczProcessing xbmu_amdo31 subset: r   r   z**/*.wav-zNo transcript: z has no transcript.zNo such file: _g        tibetan)idrecording_idstartdurationchannellanguagespeakertextxbmu_amdo31_supervisions_z	.jsonl.gzxbmu_amdo31_recordings_)
recordingssupervisions)r   is_dirr   r*   	readlinessplitjoinr   dictr   r"   r#   rglobstempartswarningr    r
   	from_fileappendr   strrK   stripr   from_recordingsr   from_segmentsr   r	   to_file)r-   r;   transcript_pathtranscript_dictflineidx_transcriptcontent	manifestsdataset_partspartrR   rS   wav_pathcount
audio_pathidxrN   rO   	recordingsegmentrecording_setsupervision_setr8   r8   r9   prepare_xbmu_amdo31Z   s~   	









ru   )r   )N)__doc__r"   osr'   r)   collectionsr   pathlibr   typingr   r   r   	tqdm.autor   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   r:   r_   ru   r8   r8   r8   r9   <module>   s6    
1