o
    2wi)%                     @   s*  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZmZ 			ddede
e de
e defddZ 				d dede
e de
e dedede	ee	eeeef f f fddZ!dS )!al  
The AliMeeting Mandarin corpus, originally designed for ICASSP 2022 Multi-channel
Multi-party Meeting Transcription Challenge (M2MeT), is recorded from real meetings,
including far-field speech collected by an 8-channel microphone array as well as
near-field speech collected by each participants' headset microphone. The dataset
contains 118.75 hours of speech data in total, divided into 104.75 hours for training
(Train), 4 hours for evaluation (Eval) and 10 hours as test set (Test), according to
M2MeT challenge arrangement. Specifically, the Train, Eval and Test sets contain 212,
8 and 20 meeting sessions respectively, and each session consists of a 15 to 30-minute
discussion by 2-4 participants. AliMeeting covers a variety of aspects in real-world
meetings, including diverse meeting rooms, various number of meeting participants and
different speaker overlap ratios. High-quality transcriptions are provided as well.
The dataset can be used for tasks in meeting rich transcriptions, including speaker
diarization and multi-speaker automatic speech recognition.

More details and download link: https://openslr.org/119/
    N)defaultdict)Path)DictOptionalUnion)tqdm)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)normalize_text_alimeeting)SupervisionSegmentSupervisionSet)Pathlikeis_module_availableresumable_downloadsafe_extract.F;https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/
target_dirforce_downloadbase_urlreturnc              	   C   s   | d}t | } | jddd g d}|D ]-}| | }t| d| ||d t|}t|| d W d   n1 s>w   Y  q| S )	aR  
    Downdload and untar the dataset
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    z/AliMeeting/openlrTparentsexist_ok)zTrain_Ali_far.tar.gzzTrain_Ali_near.tar.gzzEval_Ali.tar.gzzTest_Ali.tar.gz/)filenamer   )pathN)r   mkdirr   tarfileopenr   )r   r   r   urldataset_tar_namestar_nametar_pathtar r'   W/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/ali_meeting.pydownload_ali_meeting$   s   
r)   farnone
corpus_dir
output_dirmicnormalize_text	save_monoc           #      C   s  t dstdddl}|r|dkrtd d}|r!|s!td|}|d	v r)d
nd}t| } |  s:J d|  tt}|durMt|}|j	ddd dD ]}|ra|d | }	|	j	ddd g }
g }| }|dkso|dkr| | d  r| | d n| }|| d|  d }|| d|  d }t
t|dd| dD ]}|j}|d
kr|d\}}}}|dd }z
|jt|}W n ty   t| d Y qw t|| dd }|r
|	|j }| sd| d| }tj|ddd  tj||d!}ntj||d!}|
| |jD ]x}|dkrE|jd}t|d"kr4|\}}}}nt|d#kr?|\}}|dd }t|jD ]E\}}|jd$kr|j}|j}|j}t | d%| d%| |j!|t"|| d"|d&v rwdntt#d'd(||t$|% |d)d*	} ||  qJqqt&t'(|
t)*|\}!}"t+|!|" |dur|",|d+| d,|-  d-  |!,|d+| d.|-  d-  |!|"d/||- < qO|S )0az  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, "near" or "far", specifies whether to prepare the near-field or far-field data. May
        also specify "ihm", "sdm", "mdm" (similar to AMI recipe), where "ihm" and "mdm" are the same as "near"
        and "far" respectively, and "sdm" is the same as "far" with a single channel.
    :param normalize_text: str, the text normalization type. Available options: "none", "m2met".
    :param save_mono: bool, if True, save the mono recordings for sdm mic. This can speed up
        feature extraction since all channels will not be loaded.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    textgridz@To prepare AliMeeting data, please 'pip install textgrid' first.r   NsdmzCsave_mono is True, but mic is not 'sdm'. Ignoring save_mono option.Fzjsave_mono is True, but output_dir is not specified. Please specify output_dir to save the mono recordings.)ihmnearr4   r*   zNo such directory: Tr   )TrainEvalTestalimeeting_sdmr6   r7   _Ali_Ali_	audio_dirtextgrid_dirz
*.TextGridz
Preparing )desc_   z0 has annotation issues. Skipping this recording.z*.wavzsox z -c 1 )shellcheck)recording_id       -)r4   r3   r2      Chinese)	normalize)	idrB   startdurationchannellanguagespeakergendertextzalimeeting-_supervisions_z	.jsonl.gz_recordings_)
recordingssupervisions).r   
ValueErrorr1   loggingwarningr   is_dirr   dictr   r   listrglobstemsplitTextGridfromFilestrnameis_file
subprocessrunr	   	from_fileappendtierslen	enumerate	intervalsmarkminTimemaxTimer   rJ   roundranger   stripr   r
   from_recordingsr   from_segmentsr   to_filelower)#r,   r-   r.   r/   r0   r1   mic_orig	manifestspartoutput_dir_monorT   rU   corpus_dir_split	wav_paths
text_paths	text_path
session_idr>   rP   spk_idtgwav_pathwav_path_monocmd	recordingtierpartsiintervalrK   endrQ   segmentrecording_setsupervision_setr'   r'   r(   prepare_ali_meetingF   s   









 

r   )r   Fr   )Nr*   r+   F)"__doc__rW   rd   r    collectionsr   pathlibr   typingr   r   r   r   lhotser   lhotse.audior	   r
   	lhotse.qar   lhotse.recipes.utilsr   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   boolra   r)   r   r'   r'   r'   r(   <module>   sZ    
$