o
    Si                     @   sN  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZmZmZmZmZmZ ddlmZmZmZ dZded	fd
ede	e de	e defddZ			d dede	e de
e dedeeeeeeef f f f
ddZ	d!dededeeeeef f fddZdedefddZ	d"dede	e deee  fddZdS )#a  
This script creates the MUSAN data directory.
Consists of babble, music and noise files.
Used to create augmented data
The required dataset is freely available at http://www.openslr.org/17/

The corpus can be cited as follows:
@misc{musan2015,
 author = {David Snyder and Guoguo Chen and Daniel Povey},
 title = {{MUSAN}: {A} {M}usic, {S}peech, and {N}oise {C}orpus},
 year = {2015},
 eprint = {1510.08484},
 note = {arXiv:1510.08484v1}
}
    N)Path)DictIterableListOptionalSequenceUnion)	RecordingRecordingSetSupervisionSegmentSupervisionSetvalidate$validate_recordings_and_supervisions)Pathlikeresumable_downloadsafe_extractz1https://www.openslr.org/resources/17/musan.tar.gz.F
target_dirurlforce_downloadreturnc                 C   s   t | } | jddd d}| | }| d }| d }| r+td| d| d |S t|||d	 t|}t|| d
 |	  W d   |S 1 sMw   Y  |S )a  
    Download and untar the MUSAN corpus.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url that downloads file called "musan.tar.gz".
    :param force_download: bool, if True, download the archive even if it already exists.
    Tparentsexist_okzmusan.tar.gzmusanz.musan_completedz	Skipping z	 because z exists.)filenamer   )pathN)
r   mkdiris_filelogginginfor   tarfileopenr   touch)r   r   r   tar_nametar_path
corpus_dircompleted_detectortar r)   H/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/musan.pydownload_musan"   s"   

r+   musicspeechnoiseTr&   
output_dirparts
use_vocalsc              
   C   s  t | } |  sJ d|  |stdt|tr|g}i }d|v r4t| |d|d< tdi |d  d|v rJdt| d i|d< t|d d  d|v r`dt| d i|d< t|d d  |d urt |}|j	ddd	 |D ]}|| 
 D ]\}}||d
| d| d  qyqq|S )NzNo such directory: z2No MUSAN parts specified for manifest preparation.r-   )r2   r.   
recordingsr/   Tr   musan__z	.jsonl.gzr)   )r   is_dir
ValueError
isinstancestrprepare_musicr   scan_recordingsr   r   itemsto_file)r&   r0   r1   r2   	manifestspartkeymanifestr)   r)   r*   prepare_musan?   s0   
rB   c                    sJ   | d }t | t fdd|dD }|s |dd } |dS )Nr-   c                 3   sT    | ]%}t |d dD ]\}}}}t||d |||d|dkddV  q
qdS )   )
max_fieldsr   ,Y)genresvocals)idrecording_idstartdurationspeakercustomN)read_annotationsr   rL   split).0fileuttrG   rH   musicianr3   r)   r*   	<genexpr>f   s    	


z prepare_music.<locals>.<genexpr>ANNOTATIONSc                 S   s   | j d du S )NrH   F)rN   )sr)   r)   r*   <lambda>s   s    zprepare_music.<locals>.<lambda>)r3   supervisions)r;   r   from_segmentsrglobfilter)r&   r2   	music_dirrZ   r)   rU   r*   r:   a   s   	
r:   c                 C   s   t dd | dD S )Nc                 s   s    | ]}t |V  qd S N)r	   	from_file)rQ   rR   r)   r)   r*   rV   x   s    

z"scan_recordings.<locals>.<genexpr>z*.wav)r
   from_recordingsr\   )r&   r)   r)   r*   r;   w   s   
r;   r   rD   c                 c   sd    t | #}|D ]}|  }|r|d u r|n|d | V  qW d    d S 1 s+w   Y  d S r_   )r"   striprP   )r   rD   fliner)   r)   r*   rO   }   s   
"rO   )Nr,   T)Tr_   ) __doc__r   r!   pathlibr   typingr   r   r   r   r   r   lhotser	   r
   r   r   r   r   lhotse.utilsr   r   r   	MUSAN_URLr9   boolr+   rB   r:   r;   intrO   r)   r)   r)   r*   <module>   sf      

#

