o
    Si:                     @   s8  d Z ddlZddlZddlZddlZddlZddlmZm	Z	 ddl
mZ ddlmZmZmZmZ ddlZddlZddlZddlmZmZmZ ddlmZmZ ddlmZmZmZ dd	l m!Z!m"Z" dd
l#m$Z$ dZ%dd Z&dd Z'd?ddZ(	d@de$defddZ)			dAde$de$de$de$dee$ de*d e*dee+ee+ef f fd!d"Z,de$dee+ef fd#d$Z-d%ed&ee. defd'd(Z/d)ej0d*e*dej0fd+d,Z1d-e2d.ed/ede*dee f
d0d1Z3d2ed3ed4e+dee def
d5d6Z4d7edee2 fd8d9Z5d:e+d;ede*ded<ee+ef dee d e*deee ee f fd=d>Z6dS )Bag  
LibriMix dataset preparation for Lhotse.

This recipe replicates the LibriMix dataset preparation by manipulating existing recordings
instead of generating and saving new audio files. LibriMix is an open source dataset for
source separation in noisy environments, derived from LibriSpeech signals (clean subset)
and WHAM noise.

The original dataset supports:
- Multiple sources (2 or 3 speakers) in mixtures
- Different sample rates (typically 16kHz and 8kHz)
- Different mixture modes:
  - min: mixture ends when the shortest source ends
  - max: mixture ends when the longest source ends
- Different mixture types:
  - mix_clean: utterances only
  - mix_both: utterances + noise
  - mix_single: 1 utterance + noise

**Current Limitations:**
This Lhotse recipe currently supports only:
- 16kHz sample rate
- 'max' mode (mixture ends when the longest source ends)

**Important Note on Quantization:**
The original LibriMix recipe introduces a quantization error when saving audio files via soundfile,
which by default uses PCM_16 format. If you need to replicate the exact quantization behavior from
the original recipe, you can apply the following transformation:

```python
import tempfile
import soundfile as sf

with tempfile.NamedTemporaryFile(suffix=".wav") as f:
    sf.write(f.name, cut.load_audio().T, 16000)
    audio_quantized, sr = sf.read(f.name)
```

Unlike the original LibriMix generation which creates ~430GB for Libri2Mix and ~332GB for Libri3Mix,
this recipe works with existing LibriSpeech and WHAM recordings and creates virtual mixtures,
making it much more storage efficient.

For more details about LibriMix, see:
- GitHub repository: https://github.com/JorisCos/LibriMix/
- Paper: "LibriMix: An Open-Source Dataset for Generalizable Speech Separation"
  https://arxiv.org/pdf/2005.11262.pdf

Citation:
@misc{cosentino2020librimix,
    title={LibriMix: An Open-Source Dataset for Generalizable Speech Separation},
    author={Joris Cosentino and Manuel Pariente and Samuele Cornell and Antoine Deleforge and Emmanuel Vincent},
    year={2020},
    eprint={2005.11262},
    archivePrefix={arXiv},
    primaryClass={eess.AS}
}
    N)ThreadPoolExecutoras_completed)Path)DictListOptionalTuple)AudioSource	RecordingRecordingSet)info
save_audio)CutSetMonoCutmix)manifests_existread_manifests_if_cached)Pathlikei>  c                 C   sN   t jj| ddid}t j|}t|W  d    S 1 s w   Y  d S Nz
User-Agentzpython-urllib)headers)urllibrequestRequesturlopenjsonloadurlreqresp r    K/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/librimix.py_fetch_jsonP   s   $r"   c                 C   sL   t jj| ddid}t j|}| W  d    S 1 sw   Y  d S r   )r   r   r   r   readr   r    r    r!   _fetch_bytesV   s   $r$   main.c           
   	   C   s   d|  d| d| d| }t |}tj|dd |D ]@}tj||d }|d d	krKt|d
}	|	t|d  W d    n1 sEw   Y  q|d dkr[t| ||d || qd S )Nzhttps://api.github.com/repos//z
/contents/z?ref=Texist_oknametypefilewbdownload_urldirpath)	r"   osmakedirsr0   joinopenwriter$   download_github_dir)
userrepor0   branchsave_dirapi_urlfilesr,   	file_pathfr    r    r!   r6   \   s   r6   
target_dirreturnc                 C   s   t | } | jddd | d }|d }| r"td| d |S td| d tj|dd	 td
ddd| |  |S )zDownload LibriMix metadata.Tparentsr)   metadata
.completedzSkipping download because z exists.zIDownloading https://github.com/JorisCos/LibriMix/tree/master/metadata to ...r(   JorisCosLibriMixmaster)	r   mkdiris_fileloggingr   r1   r2   r6   touch)r?   metadata_dircompleted_detectorr    r    r!   download_librimixk   s   
rO         librispeech_root_pathwham_recset_root_pathlibrimix_metadata_pathworkdir
output_dirn_srcnum_jobsc              	   C   s  t d |durt|}|jddd |dur#t|}|jddd i }g }t|d| d }	dd t|	D }
|
D ]}|d	d
 d}|d	d
 d}|||g q=|duret||ddd}t	|}t|d| d }	dd t|	D }
|
D ]o}|d	d
 }|d	d
 d}t
||dddrt
||dddrt d| d| d qt||	|| |||\}}t|}|dur||d| d  d|i||< t|}|dur||d| d  d|i||< q|S )aJ  
    Prepare LibriMix manifests for multi-speaker mixtures.

    Args:
        librispeech_root_path: Path to LibriSpeech manifests
        wham_recset_root_path: Path to WHAM noise manifests
        librimix_metadata_path: Path to LibriMix metadata
        output_dir: Directory to save manifests
        workdir: Working directory for temporary files
        n_src: Number of sources to for mixing
        num_jobs: Number of parallel threads used for processing (default: 1)

    Returns:
        Dict with keys for each split containing 'cuts' for both clean and noisy versions
    a>  The original LibriMix recipe introduces a quantization error when saving audio files via soundfile, which by default uses PCM_16 format. If you need to replicate the exact quantization behavior from the original recipe, you can save and load the audio using a temporary file as shown in the docstring of this function.NTrA   LibriMixc                 S       g | ]}d |vr|dkr|qS r   rD   r    .0r,   r    r    r!   
<listcomp>   
    z$prepare_librimix.<locals>.<listcomp>.csv _clean_noisylibrimix)cutset)dataset_partsrV   prefixtypesc                 S   r[   r\   r    r]   r    r    r!   r_      r`   )partrV   rh   ri   zLibriMix subset: z and z already prepared - skipping.librimix_cutset_	.jsonl.gzrf   )rK   warningr   rI   r1   listdirreplaceextendr   _load_wham_recordingsr   r   _process_metadata_filer   	from_cutsto_file)rR   rS   rT   rU   rV   rW   rX   	manifestsrg   n_src_meta_rootmd_filename_listmd_filename	part_namepart_name_noisywham_recsets
clean_cuts
noisy_cutsclean_cutsetnoisy_cutsetr    r    r!   prepare_librimix   s   


r   c                    s8   g d}g d} fdd|D }t |d ||d< |S )z>Load WHAM recordings with speed augmentation for training set.))trainzwham_recordings_tr.jsonl.gz)devzwham_recordings_cv.jsonl.gz)testzwham_recordings_tt.jsonl.gz)g?      ?g333333?c                    s$   i | ]\}}|t t | qS r    )lhotseload_manifestr   )r^   keysplitrS   r    r!   
<dictcomp>  s    z)_load_wham_recordings.<locals>.<dictcomp>r   )_augment_wham)rS   wham_splitsspeed_factorsr{   r    r   r!   rq      s   
rq   recsetr   c                    sb   ddl  dtdtf fdd}g }|D ]}|dkr| |}n| }||}|| qt|S )z,Apply speed augmentation to WHAM recordings.r   N	recordingr@   c                    s     dd| j| _| S )Nz_sp(\d+)\.(\d+)$zsp\1\2)subid)r   rer    r!   fix_rec_ids  s   z"_augment_wham.<locals>.fix_rec_idsr   )r   r
   perturb_speedmaprp   r   from_recordings)r   r   r   
new_recsetspeed_factoraugmented_recsetr    r   r!   r     s   

r   noise
max_lengthc              
   C   s   | }t td }|dt|d d  }|t|d dd }t||k r^t |dt|t|  t |t|t| d |t | dt| | | t|d f}t||k s&|d| S )z'Concatenate noise using Hanning window.rQ   NrP   )nphanningRATElenconcatenatemultiply)r   r   noise_exwindowi_wd_wr    r    r!   _extend_noise   s   	r   rowlibrispeech_cutsetwham_recsetc                 C   s  g }g }t d|d D ]}|t| d| d j || d| d  q|j|d}dd t||D }| }	|rFt|	| }	|s=| d |	_t| d	 j}
||
 }| d
 }|	|}|j
|	j
k rnt||	| d |}tdd|	j
d|d}t|	|dd}|	|fS )z'Process a single row from metadata CSV.rQ   source__path_gain)cut_idsc                 S   s   g | ]	\}}| |qS r    )perturb_volume)r^   srcgainr    r    r!   r_   C  s    z _process_row.<locals>.<listcomp>
mixture_ID
noise_path
noise_gainr   r   )r   startdurationchannelr   left)preserve_id)rangeappendr   stemsubsetzippopr   r   r   r   _extend_noise_recordingr   )r   r   r   rW   rU   srcsgainsr   normalized_cuts	clean_mixnoise_id	noise_recr   noise_rec_perturbed	noise_cut	noisy_mixr    r    r!   _process_row3  s:   	

r   r   r   
mixture_idc           	      C   s   |du rt d}|| j d| d }| s7|  }|jdkr$|d }t|t|j|j }t	||| jd t
|}tddgt|d	}t| j|g|j|j|jd
S )z3Extend noise recording to match clean mix duration.Nr&   _z.wavrQ   r   )destr   sampling_rater,   )r+   channelssource)r   sourcesr   num_samplesr   )r   r   exists
load_audiondimr   intr   r   r   r   r	   strr
   
samplerateframes)	r   r   r   rU   save_tonoise_arrayextended_noisenoise_rec_info
new_sourcer    r    r!   r   b  s,   
r   csv_pathc                 C   s   g }t | ddd.}t|}|D ]}t| D ]\}}|dr(t|||< q|| qW d   |S 1 s:w   Y  |S )zc
    Read LibriMix metadata using Python's standard csv library and cast gain fields to float.
    rrb   )newliner   N)r4   csv
DictReaderlistitemsendswithfloatr   )r   rowsr>   readerr   kvr    r    r!   _read_metadata_csv  s   


r   rx   rv   r{   c                    s  ||  }t |}tt|| d dddd dd d| d	d
d dd }	||	 g }
g }t	d|  d t
|d6  fdd|D }tjt|t|dD ]}| \}}|
| || qdW d   |
|fS 1 sw   Y  |
|fS )z?Process a single metadata file and return clean and noisy cuts.librir   r   ra   rl   c                 S   s   d | dd d S )N-r   )r3   r   )cr    r    r!   <lambda>  s    z(_process_metadata_file.<locals>.<lambda>rb   r   rQ   Nr   r   zProcessing rE   )max_workersc              
      s    g | ]}  t|qS r    )submitr   )r^   r   exr   rW   r   rU   r    r!   r_     s    z*_process_metadata_file.<locals>.<listcomp>)total)r   r   r   r   ro   
modify_idsr3   r   rK   r   r   tqdmr   r   resultr   )rx   rv   rW   rR   r{   rU   rX   r   r   
split_namer|   r}   futuresr>   r   r   r    r   r!   rr     s<   
"

rr   )r%   r&   )r&   )NrP   rQ   )7__doc__r   r   rK   r1   urllib.requestr   concurrent.futuresr   r   pathlibr   typingr   r   r   r   numpyr   r   r   lhotse.audior	   r
   r   lhotse.audio.backendr   r   lhotse.cut.setr   r   r   lhotse.recipes.utilsr   r   lhotse.utilsr   r   r"   r$   r6   rO   r   r   r   rq   r   r   ndarrayr   dictr   r   r   rr   r    r    r    r!   <module>   s    9


{
/
 
