o
    2wÖi<  ã                   @   sð   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ 	ddedefdd„Z	ddedee de
ee
eeeef f f fdd„ZdS )a  
About the XBMU-AMDO31 corpus
XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University.
publicly available on https://huggingface.co/datasets/syzym/xbmu_amdo31

XBMU-AMDO31 dataset is a speech recognition corpus of Amdo Tibetan dialect.
The open source corpus contains 31 hours of speech data and resources related
to build speech recognition systems,including transcribed texts and a Tibetan
pronunciation lexicon.
(The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused
for the Amdo dialect because of the uniformity of the Tibetan language)
The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR).
It was recorded by 66 native speakers of Amdo Tibetan, and the recorded audio was processed and manually inspected.
The dataset has three splits: train, evaluation (dev) and test.Each speaker had approximately 450 sentences,
with a small number of individuals having fewer than 200 sen.

Subset	Hours	Male	Female	Remarks
Train   25.41   27      27      18539 sentences recorded by 54 speakers
Dev     2.81    2       4       2050 sentences recorded by 6 speakers
Test    2.85    3       3       2041 sentences recorded by 6 speakers

Licensing Information
This dataset is distributed under CC BY-SA 4.0.
é    N)Údefaultdict)ÚPath)ÚDictÚOptionalÚUnion)Útqdm)Úfix_manifestsÚ$validate_recordings_and_supervisions)Ú	RecordingÚRecordingSet)ÚSupervisionSegmentÚSupervisionSet)ÚPathlikeÚis_module_availableÚsafe_extractÚ.Ú
target_dirÚreturnc              
   C   sD  d}t | ƒ} | jddd | d }|d d }d}d}d	}| ¡ sTtd
ƒr,ddlm} ntdƒ‚t d¡ z| 	||¡ W n t
yN } zt|ƒ ‚ d}~ww t d¡ |||fD ]F}	||	 }
||	dd…  }|d }| ¡ ryt d|› d¡ qYtj|dd t |
¡}t||d W d  ƒ n1 s–w   Y  | ¡  qY|S )z½
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :return: the path to downloaded and extracted directory with data.
    z1https://huggingface.co/datasets/syzym/xbmu_amdo31T©ÚparentsÚexist_okÚxbmu_amdo31ÚdataÚwavztrain.tar.gzz
dev.tar.gzztest.tar.gzÚgitr   )ÚRepoz}In order to download the xbmu-amdo31 corpus from huggingface, please install the relevant dependencies: pip install gitpythonz(Start downloading the xbmu-amdo31 corpusNÚDoneiùÿÿÿz
.completedzSkipping tar of because z exists.)Úignore_errors)Úpath)r   ÚmkdirÚis_filer   r   r   ÚImportErrorÚloggingÚinfoÚ
clone_fromÚ	ExceptionÚprintÚshutilÚrmtreeÚtarfileÚopenr   Útouch)r   ÚurlÚ
corpus_dirÚwav_dirÚtrain_tar_nameÚdev_tar_nameÚtest_tar_namer   ÚerrorÚtar_nameÚtar_pathÚextracted_dirÚcompleted_detectorÚtar© r8   úW/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/xbmu_amdo31.pyÚdownload_xbmu_amdo31*   sH   ÿ
€þ
ÿ
r:   r-   Ú
output_dirc                 C   s<  t | ƒ} |  ¡ sJ d| › ƒ‚|durt |ƒ}|jddd | d }i }t|ddd"}| ¡ D ]}| ¡ }d	 |d
d… ¡}|||d < q0W d  ƒ n1 sPw   Y  ttƒ}g d¢}	t	|	ddD ]¸}
t
 d|
› ¡ g }g }| d d |
›  }d}| d¡D ]a}|j d¡d
 }|jd }||vr¥t
 d|› ¡ t
 |› d¡ q|| }| ¡ s¶t
 d|› ¡ qt |¡}| |¡ |d
7 }tt|ƒd | |d | d|jdd|| ¡ d}| |¡ qt |¡}t |¡}t||ƒ\}}t||ƒ |dur| |d|
› d ¡ | |d|
› d ¡ ||dœ||
< qc|S ) aL  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: NTr   z$data/transcript/transcript_clean.txtÚrzutf-8)Úencodingú é   r   )ÚtrainÚdevÚtestzProcess xbmu_amdo31 audio.)ÚdesczProcessing xbmu_amdo31 subset: r   r   z**/*.wavú-éþÿÿÿzNo transcript: z has no transcript.zNo such file: Ú_g        Útibetan)ÚidÚrecording_idÚstartÚdurationÚchannelÚlanguageÚspeakerÚtextÚxbmu_amdo31_supervisions_z	.jsonl.gzÚxbmu_amdo31_recordings_)Ú
recordingsÚsupervisions)r   Úis_dirr   r*   Ú	readlinesÚsplitÚjoinr   Údictr   r"   r#   ÚrglobÚstemÚpartsÚwarningr    r
   Ú	from_fileÚappendr   ÚstrrK   Ústripr   Úfrom_recordingsr   Úfrom_segmentsr   r	   Úto_file)r-   r;   Útranscript_pathÚtranscript_dictÚfÚlineÚidx_transcriptÚcontentÚ	manifestsÚdataset_partsÚpartrR   rS   Úwav_pathÚcountÚ
audio_pathÚidxrN   rO   Ú	recordingÚsegmentÚrecording_setÚsupervision_setr8   r8   r9   Úprepare_xbmu_amdo31Z   s~   	ýÿ
þ



ø




ÿÿru   )r   )N)Ú__doc__r"   Úosr'   r)   Úcollectionsr   Úpathlibr   Útypingr   r   r   Ú	tqdm.autor   Úlhotser   r	   Úlhotse.audior
   r   Úlhotse.supervisionr   r   Úlhotse.utilsr   r   r   r:   r_   ru   r8   r8   r8   r9   Ú<module>   s6    ÿÿ
þ1ÿÿÿþ