o
    Si                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( dZ)			d$de*de&deee*ee* f  dee* de	f
ddZ+		d%de&dee& dee*ee* f de,dee*ee*eee$f f f f
ddZ-d ed!e	deeeee# f  fd"d#Z.dS )&ab  
Description taken from the abstract of paper:
"GigaSpeech: An Evolving, Multi-domain ASR Corpus with 10,000 Hours of Transcribed Audio"
https://arxiv.org/abs/2106.06909

This paper introduces GigaSpeech, an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality labeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised and unsupervised training. Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts and YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science, sports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable for speech recognition training, and to filter out segments with low-quality transcription. For system training, GigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h. For our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage, and for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand, are re-processed by professional human transcribers to ensure high transcription quality. Baseline systems are provided for popular speech recognition toolkits, namely Athena, ESPnet, Kaldi and Pika.
    N)defaultdict)ProcessPoolExecutor)repeat)Path)AnyDictListOptionalSequenceTupleUnion)tqdm)CutSetcompute_num_samplesfix_manifests$validate_recordings_and_supervisions)AudioSource	RecordingRecordingSet)parallel_map)manifests_existread_manifests_if_cached)SupervisionSegmentSupervisionSet)PathlikeSecondsis_module_available)XLLMSXSDEVTEST.autotsinghuapassword
target_dirdataset_partshostreturnc                 C   sz   t drddlm} ntd||}|dkrd}nt|tr"|g}|D ]}td|  |j| d| d	 |d
 q$|S )Nspeechcolabr   
GigaSpeech]To process the GigaSpeech corpus, please install optional dependency: pip install speechcolabr%   r   r"   r#   zDownloading GigaSpeech part: {})r*   )	r   speechcolab.datasets.gigaspeechr.   ImportError
isinstancestrlogginginfodownload)r'   r(   r)   r*   r.   
gigaspeechpart r<   M/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/gigaspeech.pydownload_gigaspeech    s   
r>      
corpus_dir
output_dirnum_jobsc                 C   s$  t drddlm} ntd|dkrdn|}t|tr|g}t| } || }t|}|jddd t||d	d
dd}|D ]}t	
d|  t||d	d
drYt	
d| d q=t|d| d }	t|d| d r}
t|d| d W}ttt|d| d t|j|dddD ]8\}}tt|gt|d\}}t||d tj||d}|	|d  |D ]}|
| q||d  qW d    n1 sw   Y  W d    n1 sw   Y  W d    n1 sw   Y  t|	jt|
jt|jd||< q=t|S )Nr,   r   r-   r/   r%   r0   T)parentsexist_okr:   zjsonl.gz)r)   rA   prefixsuffixlazyzProcessing GigaSpeech subset: )r;   rA   rE   rF   zGigaSpeech subset: z already prepared - skipping.gigaspeech_recordings_z	.jsonl.gzgigaspeech_supervisions_gigaspeech_cuts_r1   r2   )rB   z"Processing GigaSpeech JSON entries)desc)
recordingssupervisions)rL   rM   cuts)r   r3   r.   r4   r5   r6   r   mkdirr   r7   r8   r   r   open_writerr   r   r   r   parse_utteranceaudiosr   gigaspeech_dataset_dirr   from_recordingsfrom_segmentsr   from_manifestswritefrom_jsonl_lazypathdict)r@   rA   r)   rB   r.   subsetsr:   	manifestsr;   
rec_writer
sup_writer
cut_writer	recordingsegmentsrL   rN   sr<   r<   r=   prepare_gigaspeech:   s   



  
$

rc   audio	root_pathc                 C   s   t | d }t| d tdttt | d t|| d  dgtt| d |d|t| d d	}g }| d
 D ])}|t	|d | d t|d t
t|d |d  dddd|d |d d q7||fS )Nsample_rateaidfilechannelsrY   )typeri   sourceduration)rl   sampling_rate)idsourcesnum_samplesrm   rl   ra   sid
begin_timeend_time   )ndigitsr   Englishspeakertext_tn)rn   recording_idstartrl   channellanguagerw   text)intr   r   listranger6   r   r   appendr   round)rd   re   rm   r`   ra   segr<   r<   r=   rQ      s<   

rQ   )r$   r%   r&   )r%   r?   )/__doc__r7   collectionsr   concurrent.futuresr   	itertoolsr   pathlibr   typingr   r   r   r	   r
   r   r   	tqdm.autor   lhotser   r   r   r   lhotse.audior   r   r   lhotse.parallelr   lhotse.recipes.utilsr   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   GIGASPEECH_PARTSr6   r>   r~   rc   rQ   r<   r<   r<   r=   <module>   sb    $

R