o
    2wi4                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZmZmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZmZ d d	lmZmZmZ d d
lm Z m!Z!m"Z"m#Z# 			d(de deee$ee$ f  de%defddZ&ddddde'fde deee$ee$ f  de%de%de$de$defddZ(					d)de dee  dee$ee$ f dee  d e$d!e)de
e$e
e$eeef f f fd"d#Z*d$ed%e$de
e$ee f deeeef  fd&d'Z+dS )*    N)ThreadPoolExecutor)Path)DictListOptionalSequenceTupleUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)manifests_existread_manifests_if_cached)AlignmentItemSupervisionSegmentSupervisionSet)Pathlikeis_module_availableresumable_downloadsafe_extract.mini_librispeechF
target_dirdataset_partsforce_downloadreturnc              	   C   s  t dsJ dddlm} |d}t| } | d }| jddd |d	kr(t}n|d
kr/t}nt|tr7|g}t	|ddD ]{}t
d|  |tv rQt d}n|tv r[t d}n	t
d|  q=|| }|d }	|	 r}t
d| d|	 d q=| d}
| |
 }t| d|
 ||d tj|dd t|}t|| d W d   n1 sw   Y  |	  q=tr| d }	|	 r|s|S t dsJ dddl}t| d }|jt|d t|}|j| d |	  W d   |S 1 sw   Y  |S ) a  
    Download the GLOBE dataset from HuggingFace Hub.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :return: the path to downloaded and extracted directory with data.
    datasetsz(Please run `pip install datasets` first.r   )load_datasetzMushanW/GLOBELibriSpeechTparentsexist_oklibrispeechr   Downloading LibriSpeech partsdescProcessing split: /12/31Invalid dataset part name: 
.completed	Skipping 	 because  exists..tar.gz/filenamer   ignore_errorspathN.ali_completedgdownFTo download LibriSpeech alignments, please install "pip install gdown"LibriSpeech-Alignments.zipoutput)r   r   r   r   mkdirLIBRISPEECHMINI_LIBRISPEECH
isinstancestrr
   logginginfobase_urlwarningis_filer   shutilrmtreetarfileopenr   touch
alignmentsr9   downloadalignments_urlzipfileZipFile
extractall)r   r   r   r   ds
corpus_dirparturlpart_dircompleted_detectortar_nametar_pathtarr9   ali_zip_pathf r^   Q/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/globe.pydownload_globe   sl   




r`   z http://www.openslr.org/resourcesrM   rE   rO   c              	   C   s  t | } | d }| jddd |dkrt}n|dkrt}nt|tr%|g}t|ddD ]{}td|  |tv r?| d	}n|tv rI| d
}n	t	d|  q+|| }	|	d }
|

 rktd| d|
 d q+| d}| | }t| d| ||d tj|	dd t|}t|| d W d   n1 sw   Y  |
  q+|r| d }
|

 r|s|S tdsJ dddl}t| d }|j||d t|}|j| d |
  W d   |S 1 sw   Y  |S )a  
    Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param alignments: should we download the alignments. The original source is:
        https://github.com/CorentinJ/librispeech-alignments
    :param base_url: str, the url of the OpenSLR resources.
    :param alignments_url: str, the url of LibriSpeech word alignments
    :return: the path to downloaded and extracted directory with data.
    r    Tr!   r$   r   r%   r&   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r4   r6   Nr8   r9   r:   r   r;   r<   )r   r>   r?   r@   rA   rB   r
   rC   rD   rF   rG   r   rH   rI   rJ   rK   r   rL   r   r9   rN   rP   rQ   rR   )r   r   r   rM   rE   rO   rT   rU   rV   rW   rX   rY   rZ   r[   r9   r\   r]   r^   r^   r_   download_librispeecha   sf   




ra   autonone   rT   alignments_dir
output_dirnormalize_textnum_jobsc                    s  t | } |durt |n| }|  sJ d|  |dkr-ttdd | dD }n(|dkrMtttdd | dD }|sLtd	|  nt	|t
rU|g}i }|durlt |}|jd
d
d t||d}t|}t|ddD ]}td|  t||ddrtd| d qwg }	g }
| | }g }t|ddddD ]B}i }||j|  |jdd d  }| rt|}t|}|D ]}||t||| qW d   n1 sw   Y  qt|dddD ]}| }|du rq|\}}|	| |
| qt|	}t !|
}|dkr)dd  t ! fdd |D }t"||\}}t#|| |durP|$|d!| d"  |$|d#| d"  ||d$||< qwW d   |S 1 sdw   Y  |S )%a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param alignments_dir: Pathlike, the path of the alignments dir. By default, it is
        the same as ``corpus_dir``.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: str, "none" or "lower",
        for "lower" the transcripts are converted to lower-case.
    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    NzNo such directory: r   c                 s       | ]}|j V  qd S Nname.0r7   r^   r^   r_   	<genexpr>   s    
z&prepare_librispeech.<locals>.<genexpr>*rb   c                 s   ri   rj   rk   rm   r^   r^   r_   ro      s    zACould not find any of librispeech or mini_librispeech splits in: Tr!   )r   rf   zDataset partsr&   zProcessing LibriSpeech subset: r$   )rU   rf   prefixzLibriSpeech subset: z already prepared - skipping.z*.trans.txtzDistributing tasksF)r'   leaver   r   z.alignment.txt
Processinglowerc                 S   s   |   S rj   )rt   )textr^   r^   r_   <lambda>  s    z%prepare_librispeech.<locals>.<lambda>c                    s   g | ]}|  qS r^   )transform_text)rn   sto_lowerr^   r_   
<listcomp>  s    z'prepare_librispeech.<locals>.<listcomp>librispeech_supervisions_z	.jsonl.gzlibrispeech_recordings_)
recordingssupervisions)%r   is_dirsetr@   intersectionglobr?   union
ValueErrorrA   rB   r>   r   r   r
   rC   rD   r   rglobparentrelative_tostemsplitexistsparse_alignmentsrK   appendsubmitparse_utteranceresultr   from_recordingsr   from_segmentsr   r   to_file)rT   re   r   rf   rg   rh   	manifestsexrU   r~   r   	part_pathfutures
trans_pathrM   ali_pathr]   linefuturer   	recordingsegmentrecording_setsupervision_setr^   ry   r_   prepare_librispeech   s   












FFr   dataset_split_pathr   c                 C   s   |  jdd\}}| t|ddj | d }| s(td|  d S tj	||d}t
||d|jd	d
tdd|j|  ||v rKd|| ind d	}||fS )Nrd   )maxsplit-r1   z.flaczNo such file: )recording_idg        r   Englishz-.* word)	idr   startdurationchannellanguagespeakerru   	alignment)stripr   r   replacer   rG   rC   rF   r   	from_filer   r   resubr   )r   r   rM   r   ru   
audio_pathr   r   r^   r^   r_   r   .  s2   r   )r   r   F)Nrb   Nrc   rd   ),rC   r   rH   rJ   rP   concurrent.futures.threadr   pathlibr   typingr   r   r   r   r   r	   	tqdm.autor
   lhotser   r   lhotse.audior   r   lhotse.recipes.utilsr   r   lhotse.supervisionr   r   r   lhotse.utilsr   r   r   r   rB   boolr`   LIBRISPEECH_ALIGNMENTS_URLra   intr   r   r^   r^   r^   r_   <module>   s     
O
P
