o
    Si"                     @   sL  d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZmZmZ d dlmZmZmZ d dlmZmZmZ d d	lmZ 	
						d%dede
e dedededede	e defddZ	d&dede	e deeeeeeef f f fddZdedee fd d!Z	"d'de
ee ee f fd#d$Z dS )(    N)defaultdict)Path)DictListOptionalTupleUnion)tqdm)CutSetMonoCutfix_manifests)AudioSource	RecordingRecordingSet)AlignmentItemSupervisionSegmentSupervisionSet)Pathlike.traindevtestsdm240825.1_train240825.1_dev1240629.1_eval_small_with_GTF
target_dirpartsmictrain_versiondev_versiontest_versionforce_downloadreturnc              
   C   s   zddl m} W n ty } ztd|d }~ww td}	|	s%tdt| } | jddd |D ][}
|
dkr=d	}|}n|
d
krFd}|}n|
dkrOd}|}ntd|
 dd| d| dg}|dkrr|	d| d| d n|dkr|	d| d| d |dd| t
||d q2| S )Nr   )snapshot_downloadzbhuggingface_hub is required for NOTSOFAR downloads. Install it via:
  pip install huggingface_hub
HF_TOKENz{HuggingFace token not found. Please set the HF_TOKEN environment variable. If you have set it, please restart the session. T)parentsexist_okr   	train_setr   dev_setr   eval_setzUnknown part: z*. Expected one of: 'train', 'dev', 'test'.zbenchmark-datasets//z/MTG/*/*.jsonr   z/MTG/*/sc_*mdmz/MTG/*/mc_*zmicrosoft/NOTSOFARdataset)repo_id	repo_type	local_dirr#   allow_patterns)huggingface_hubr%   ImportErrorRuntimeErrorosgetenvr   mkdir
ValueErrorappendbool)r   r   r   r    r!   r"   r#   r%   import_errorhugging_face_tokenpartsubset_nameversiondownload_patterns rB   L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/notsofar1.pydownload_notsofar1   s`   	

rD   
corpus_dir
output_dirc                 C   s  t | d } |d u rtdt |}|  sJ d|  tj|dd tt}t| D ]}| | }tt||< t|D ]}|| d }t|ddd\}}tt|| |< |rt	t
| d d	  \}	}
d
| d| }|	|| d  |
|| d  |	|
d|| | d< |rt	t
| d d	  \}}d| d| }||| d  ||| d  ||d|| | d< q<q,|S )Nzbenchmark-datasetszoutput_dir must be providedzNo such directory: T)r(   MTGF)
word_levelcreate_word_alignment   notsofar1_sdm__z_recordings.jsonl.gzz_supervisions.jsonl.gz)
recordingssupervisionssingle_channelnotsofar1_mdm_multi_channel)r   r9   is_dirr6   makedirsr   dict_listdir_safeprocess_datar   r
   	from_cuts	decomposeto_file)rE   rF   	manifestsr>   part_dirr@   version_dirsc_cutsmc_cutssc_recssc_supstagmc_recsmc_supsrB   rB   rC   prepare_notsofar1P   sN   
rd   pathc                 C   s   t tdd t| S )Nc                 S   s   d| vS )Nz	.DS_StorerB   )namerB   rB   rC   <lambda>   s    z_listdir_safe.<locals>.<lambda>)listfilterr6   listdir)re   rB   rB   rC   rU      s   rU   Tc                    sV  t t| }g }g }t|D ]}| | d }t ttfddt}t|d}	t|	}
W d    n1 s=w   Y  |D ]}|  | d| }d|v }|r{tt }t	
 d }||_tt||_ fdd	t|D |_n d }t	
|}||_g }|
D ]v}|d
 }|j}t|d }t|d }|d }d }|rdg i}|d D ]$\}}}d|v sd|v rqt|}t|}|d t|||| d q|t| dtt|d d dtt|d d |j||| ||||d q|r|t|jd|j|j||d qD|t|jd|jd||d qDq||fS )Nzgt_transcription.jsonc                    s   | dkot j |  S )N
close_talk)r6   re   isdir)x)meeting_rootrB   rC   rg      s    zprocess_data.<locals>.<lambda>rrL   mczch0.wavc              
      s,   g | ]}t d |gt d| d dqS )filechz.wav)typechannelssource)r   str).0i)device_pathrB   rC   
<listcomp>   s    z process_data.<locals>.<listcomp>
speaker_id
start_timeend_timetextwordword_timing<>)symbolstartdurationd      )idrecording_idr   r   channelr~   speaker	alignmentr   )r   r   r   r   rN   	recording)sortedrU   r	   rh   ri   openjsonloadlenr   	from_filer   rangechannel_idssourcesfloatr:   r   r   rv   intzfillr   r   )dataset_pathrH   rI   meetingsr]   r^   meetingtranscription_pathdevicesftranscription_jsondevice	device_idis_multi_channelnum_channelsr   recording_pathrN   segmentr{   r   r|   r}   r~   r   	alig_textalig_start_timealig_end_timerB   )ry   rn   rC   rV      s   
	

	
6WrV   )r   r   r   r   r   r   F)N)FT)!r   r6   collectionsr   pathlibr   typingr   r   r   r   r   r	   lhotser
   r   r   lhotse.audior   r   r   lhotse.supervisionr   r   r   lhotse.utilsr   rv   r;   rD   rd   rU   rV   rB   rB   rB   rC   <module>   s^    
C
4