o
    …wÖiD  ã                   @   sp   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlZdd„ Z	dde
dee
 dee
 fd	d
„ZdS )é    N)ÚOptionalc                 C   s^  |r|n|}t | dƒš}t |dƒ|}|D ]q}|d | d¡d …  ¡ }| dd¡ dd¡}| ¡ }|| d¡d d… }	tj |||	|	 d	¡d |	 d	¡… |	d
 ¡}
tj |||	|	 d	¡d |	 d	¡… |	d
 ¡}dd l	}|j
 |
¡}|||dœ}t ||¡ | d¡ qW d   ƒ n1 sw   Y  W d   ƒ d S W d   ƒ d S 1 s¨w   Y  d S )NÚrÚwú(é   z<s>Ú z</s>éþÿÿÿú-ú.wavr   )Úaudio_filepathÚdurationÚtextÚ
)ÚopenÚfindÚlowerÚreplaceÚstripÚosÚpathÚjoinÚrfindÚsoxÚ	file_infor   ÚjsonÚdumpÚwrite)Útranscripts_pathÚmanifest_pathÚdata_dirÚ	mount_dirÚwav_pathÚfinÚfoutÚlineÚ
transcriptÚfile_idÚ
audio_pathÚmounted_audio_pathr   r   Úmetadata© r*   úV/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/utils/notebook_utils.pyÚbuild_manifest   s0   $ÿ$ÿéÿÿ"ÿr,   ú./r   Útrain_mount_dirÚtest_mount_dirc                 C   sH  t dƒ tj| dd tj | d ¡s#d}t || ¡}t d|› ƒ nt dƒ | d }tj | d ¡sbt |¡}|j	| d	 t d
ƒ t
j
| d dd}|D ]}|dd… d }d||g}	t |	¡ qMt dƒ t dƒ | d }
| d }tj |¡s„t|
|| |dƒ t dƒ | d }| d }tj |¡sžt||| |dƒ t dƒ t dƒ dS )aÛ  
    Function to download the AN4 dataset. This hides pre-processing boilerplate for notebook ASR examples.

    Args:
        data_dir: Path to store the data.
        train_mount_dir: If you plan to mount the dataset, use this to prepend the mount directory to the
            audio filepath in the train manifest.
        test_mount_dir: If you plan to mount the dataset, use this to prepend the mount directory to the
            audio filepath in the test manifest.
    z******T)Úexist_okz/an4_sphere.tar.gzzBhttps://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gzzDataset downloaded at: zTarfile already exists.z/an4/)r   zConverting .sph to .wav...z/an4/**/*.sph)Ú	recursiveNéüÿÿÿr
   r   zFinished conversion.
******z /an4/etc/an4_train.transcriptionz/an4/train_manifest.jsonzan4/wav/an4_clstkzTraining manifest created.z/an4/etc/an4_test.transcriptionz/an4/test_manifest.jsonzan4/wav/an4test_clstkzTest manifest created.z
***Done***)Úprintr   Úmakedirsr   ÚexistsÚwgetÚdownloadÚtarfiler   Ú
extractallÚglobÚ
subprocessÚrunÚisfiler,   )r   r.   r/   Úan4_urlÚan4_pathÚtarÚsph_listÚsph_pathr!   ÚcmdÚtrain_transcriptsÚtrain_manifestÚtest_transcriptsÚtest_manifestr*   r*   r+   Údownload_an49   s<   

rH   )r-   NN)r:   r   r   Úos.pathr;   r8   Útypingr   r6   r,   ÚstrrH   r*   r*   r*   r+   Ú<module>   s   $