o
    SiS,                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZmZmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZmZ d d	lmZmZmZ d d
lm Z m!Z!m"Z"m#Z# dZ$dZ%dZ&ddddde&fde deee'ee' f  de(de(de'de'defddZ)					d,de dee  dee'ee' f d ee  d!e'd"e*de
e'e
e'eeef f f fd#d$Z+d%ed&e'de
e'ee f deeeef  fd'd(Z,d)e de
e'ee f fd*d+Z-dS )-    N)ThreadPoolExecutor)Path)DictListOptionalSequenceTupleUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)manifests_existread_manifests_if_cached)AlignmentItemSupervisionSegmentSupervisionSet)Pathlikeis_module_availableresumable_downloadsafe_extract)z	dev-cleanz	dev-otherz
test-cleanz
test-otherztrain-clean-100ztrain-clean-360ztrain-other-500)zdev-clean-2ztrain-clean-5z@https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE.mini_librispeechFz http://www.openslr.org/resources
target_dirdataset_partsforce_download
alignmentsbase_urlalignments_urlreturnc              	   C   s  t | } | d }| jddd |dkrt}n|dkrt}nt|tr%|g}t|ddD ]{}td|  |tv r?| d	}n|tv rI| d
}n	t	d|  q+|| }	|	d }
|

 rktd| d|
 d q+| d}| | }t| d| ||d tj|	dd t|}t|| d W d   n1 sw   Y  |
  q+|r| d }
|

 r|s|S tdsJ dddl}t| d }|j||d t|}|j| d |
  W d   |S 1 sw   Y  |S )a  
    Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param alignments: should we download the alignments. The original source is:
        https://github.com/CorentinJ/librispeech-alignments
    :param base_url: str, the url of the OpenSLR resources.
    :param alignments_url: str, the url of LibriSpeech word alignments
    :return: the path to downloaded and extracted directory with data.
    LibriSpeechTparentsexist_oklibrispeechr   zDownloading LibriSpeech partsdesczProcessing split: z/12z/31zInvalid dataset part name: z
.completedz	Skipping z	 because z exists.z.tar.gz/)filenamer   )ignore_errors)pathNz.ali_completedgdownzFTo download LibriSpeech alignments, please install "pip install gdown"r   zLibriSpeech-Alignments.zip)output)r   mkdirLIBRISPEECHMINI_LIBRISPEECH
isinstancestrr
   logginginfowarningis_filer   shutilrmtreetarfileopenr   touchr   r,   downloadzipfileZipFile
extractall)r   r   r   r   r   r   
corpus_dirparturlpart_dircompleted_detectortar_nametar_pathtarr,   ali_zip_pathf rJ   N/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/librispeech.pydownload_librispeech"   sf   




rL   autonone   r@   alignments_dir
output_dirnormalize_textnum_jobsc                    s  t | } |durt |n| }|  sJ d|  |dkr-ttdd | dD }n(|dkrMtttdd | dD }|sLtd	|  nt	|t
rU|g}i }|durlt |}|jd
d
d t||d}t|}t|ddD ]}td|  t||ddrtd| d qwg }	g }
| | }g }t|ddddD ]B}i }||j|  |jdd d  }| rt|}t|}|D ]}||t||| qW d   n1 sw   Y  qt|dddD ]}| }|du rq|\}}|	| |
| qt|	}t !|
}|dkr)dd  t ! fdd |D }t"||\}}t#|| |durP|$|d!| d"  |$|d#| d"  ||d$||< qwW d   |S 1 sdw   Y  |S )%a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param alignments_dir: Pathlike, the path of the alignments dir. By default, it is
        the same as ``corpus_dir``.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: str, "none" or "lower",
        for "lower" the transcripts are converted to lower-case.
    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    NzNo such directory: r   c                 s       | ]}|j V  qd S Nname.0r+   rJ   rJ   rK   	<genexpr>   s    
z&prepare_librispeech.<locals>.<genexpr>*rM   c                 s   rT   rU   rV   rX   rJ   rJ   rK   rZ      s    zACould not find any of librispeech or mini_librispeech splits in: Tr"   )r   rQ   zDataset partsr&   zProcessing LibriSpeech subset: r%   )rA   rQ   prefixzLibriSpeech subset: z already prepared - skipping.z*.trans.txtzDistributing tasksF)r'   leaver   r   z.alignment.txt
Processinglowerc                 S   s   |   S rU   )r_   )textrJ   rJ   rK   <lambda>   s    z%prepare_librispeech.<locals>.<lambda>c                    s   g | ]}|  qS rJ   )transform_text)rY   sto_lowerrJ   rK   
<listcomp>   s    z'prepare_librispeech.<locals>.<listcomp>librispeech_supervisions_z	.jsonl.gzlibrispeech_recordings_)
recordingssupervisions)%r   is_dirsetr0   intersectionglobr/   union
ValueErrorr1   r2   r.   r   r   r
   r3   r4   r   rglobparentrelative_tostemsplitexistsparse_alignmentsr:   appendsubmitparse_utteranceresultr   from_recordingsr   from_segmentsr   r   to_file)r@   rP   r   rQ   rR   rS   	manifestsexrA   ri   rj   	part_pathfutures
trans_pathr   ali_pathrI   linefuturer{   	recordingsegmentrecording_setsupervision_setrJ   rd   rK   prepare_librispeechp   s   












FFr   dataset_split_pathr   c                 C   s   |  jdd\}}| t|ddj | d }| s(td|  d S tj	||d}t
||d|jd	d
tdd|j|  ||v rKd|| ind d	}||fS )NrO   )maxsplit-r(   z.flaczNo such file: )recording_id        r   Englishz-.* word)	idr   startdurationchannellanguagespeakerr`   	alignment)stripru   r   replacerr   r6   r3   r5   r   	from_filer   r   resubr   )r   r   r   r   r`   
audio_pathr   r   rJ   rJ   rK   rz      s2   rz   r   c              	   C   s   i }t |   D ]4}| \}}}|ddd}dgttt|ddd }dd t|||dd  D ||< q
|S )N"r   ,r   c              	   S   s,   g | ]\}}}t ||t|| d ddqS )   )ndigits)symbolr   r   )r   round)rY   r   r   endrJ   rJ   rK   rf     s    z$parse_alignments.<locals>.<listcomp>rO   )	r   	read_text
splitlinesru   r   listmapfloatzip)r   r   r   utt_idwords
timestampsrJ   rJ   rK   rw     s   "rw   )NrM   NrN   rO   ).r3   r   r7   r9   r=   concurrent.futures.threadr   pathlibr   typingr   r   r   r   r   r	   	tqdm.autor
   lhotser   r   lhotse.audior   r   lhotse.recipes.utilsr   r   lhotse.supervisionr   r   r   lhotse.utilsr   r   r   r   r/   r0   LIBRISPEECH_ALIGNMENTS_URLr2   boolrL   intr   rz   rw   rJ   rJ   rJ   rK   <module>   s     	
P

"!