o
    }oi3                     @   sf  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d	Zzd d
lmZm Z  d dlm!Z" W n e#yy   d dl$m%Z% dZY nw z
d dl&m'Z' d	Z(W n e)e#fy   dZ(Y nw d8ddZ*G dd de+Z,G dd de,Z-G dd de,Z.G dd de,Z/G dd de,Z0G dd de,Z1G dd de,Z2G dd  d e,Z3G d!d" d"e,Z4G d#d$ d$e,Z5G d%d& d&e,Z6G d'd( d(e,Z7G d)d* d*e,Z8e-e.e0e/e1e2e3e4e5e6e7e8d+Z9d,e:d-e,fd.d/Z;G d0d1 d1e+Z<d9d3ee< fd4d5Z=G d6d7 d7eZ>dS ):    N)NamedTemporaryFile)AnyListOptionalUnion)signal)AudioSegment)collectionsparsers)IterableDataset)loggingT)
DictConfig	OmegaConf)
webdataset)LightningNotInstalledExceptionF)numba_utilsc           
      C   s   |r&|d u r
t dt|\}}}|jd u rdn|j}|jd u r"dn|j}n t| jdd }	|	j}|	jd u r9dn|	j}|	jd u rCdn|	j}tj	||||dS )Nz*Expected augmentation dataset but got Noner      	target_sroffsetduration)
	TypeErrornextr   r   randomsampledata
audio_filer   	from_file)
manifestr   tarred_audioaudio_datasetr   file_idmanifest_entryr   r   audio_record r$   d/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/preprocessing/perturb.pyread_one_audiosegmentK   s   r&   c                   @   s   e Zd Zdd Zdd ZdS )Perturbationc                 C   s   |S Nr$   selflengthr$   r$   r%   max_augmentation_length^      z$Perturbation.max_augmentation_lengthc                 C   s   t r(   )NotImplementedError)r*   r   r$   r$   r%   perturba   r-   zPerturbation.perturbN)__name__
__module____qualname__r,   r/   r$   r$   r$   r%   r'   ]   s    r'   c                   @   s*   e Zd ZdZdddZdd	 Zd
d ZdS )SpeedPerturbationa  
    Performs Speed Augmentation by re-sampling the data to a different sampling rate,
    which does not preserve pitch.

    Note: This is a very slow operation for online augmentation. If space allows,
    it is preferable to pre-compute and save the files to augment the dataset.

    Args:
        sr: Original sampling rate.
        resample_type: Type of resampling operation that will be performed.
            For better speed using `resampy`'s fast resampling method, use `resample_type='kaiser_fast'`.
            For high-quality resampling, set `resample_type='kaiser_best'`.
            To use `scipy.signal.resample`, set `resample_type='fft'` or `resample_type='scipy'`
        min_speed_rate: Minimum sampling rate modifier.
        max_speed_rate: Maximum sampling rate modifier.
        num_rates: Number of discrete rates to allow. Can be a positive or negative
            integer.
            If a positive integer greater than 0 is provided, the range of
            speed rates will be discretized into `num_rates` values.
            If a negative integer or 0 is provided, the full range of speed rates
            will be sampled uniformly.
            Note: If a positive integer is provided and the resultant discretized
            range of rates contains the value '1.0', then those samples with rate=1.0,
            will not be augmented at all and simply skipped. This is to unnecessary
            augmentation and increase computation time. Effective augmentation chance
            in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance
            where `prob` is the global probability of a sample being augmented.
        rng: Random seed. Default is None
    ?皙?   Nc                 C   s   t ||}|dk rtd|dvrtd|| _|| _|| _|| _|dkr2tj| j| j| jdd| _|| _	|r>t
| d S d  d S )N        +Minimum sampling rate modifier must be > 0.)kaiser_bestkaiser_fastfftscipyzSSupported `resample_type` values are ('kaiser_best', 'kaiser_fast', 'fft', 'scipy')r   Tendpoint)min
ValueError_sr	_min_rate	_max_rate
_num_ratesnplinspace_rates	_res_typer   seed)r*   srresample_typemin_speed_ratemax_speed_rate	num_ratesrngmin_rater$   r$   r%   __init__   s   
zSpeedPerturbation.__init__c                 C   
   || j  S r(   rC   r)   r$   r$   r%   r,         
z)SpeedPerturbation.max_augmentation_lengthc                 C   s   | j dk rt| j| j}nt| j}|dkrd S t| j| }zt	j
j|j| j|| jd|_W d S  tyT } ztd| j d| d|  W Y d }~d S d }~ww )Nr         ?)orig_srr   res_typezFailed to resample audio from z to z . Skipping augmentation. Error: )rD   r   uniformrB   rC   choicerG   intrA   librosacoreresample_samplesrH   	Exceptionr   warning)r*   r   
speed_ratenew_srer$   r$   r%   r/      s   
zSpeedPerturbation.perturb)r4   r5   r6   Nr0   r1   r2   __doc__rQ   r,   r/   r$   r$   r$   r%   r3   e   s
    
r3   c                   @   s*   e Zd ZdZdddZd	d
 Zdd ZdS )TimeStretchPerturbationa  
    Time-stretch an audio series by a fixed rate while preserving pitch, based on [1]_, [2]_.

    Note:
    This is a simplified implementation, intended primarily for reference and pedagogical purposes.
    It makes no attempt to handle transients, and is likely to produce audible artifacts.

    References
    ----------
    .. [1] Ellis, D. P. W. "A phase vocoder in Matlab." Columbia University, 2002.
       `<http://www.ee.columbia.edu/~dpwe/resources/matlab/pvoc/>`_
    .. [2] librosa.effects.time_stretch
       `<https://librosa.org/doc/main/generated/librosa.effects.time_stretch.html>`_

    Args:
        min_speed_rate: Minimum sampling rate modifier.
        max_speed_rate: Maximum sampling rate modifier.
        num_rates: Number of discrete rates to allow. Can be a positive or negative
            integer.
            If a positive integer greater than 0 is provided, the range of
            speed rates will be discretized into `num_rates` values.
            If a negative integer or 0 is provided, the full range of speed rates
            will be sampled uniformly.
            Note: If a positive integer is provided and the resultant discretized
            range of rates contains the value '1.0', then those samples with rate=1.0,
            will not be augmented at all and simply skipped. This is to avoid unnecessary
            augmentation and increase computation time. Effective augmentation chance
            in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance
            where `prob` is the global probability of a sample being augmented.
        n_fft: Number of fft filters to be computed.
        rng: Random seed. Default is None
    r4   r5   r6      Nc                 C   sr  t ||}|dk rtd|| _|| _|| _|dkr'tj| j| j| jdd| _|r.t	|nFd  t
|| _t
|d | _tdtj| j | jd | _tj| jd tjd| _tdtj| j | jd | _tj| jd tjd| _d S  t
|| _t
|d | _tdtj| j | jd | _tj| jd tjd| _tdtj| j | jd | _tj| jd tjd| _d S )	Nr7   r8   r   Tr=      r   dtype)r?   r@   rB   rC   rD   rE   rF   rG   r   rI   rZ   _n_fft_hop_lengthpi_phi_advance_fastemptyfloat32_scale_buffer_fast_phi_advance_slow_scale_buffer_slow)r*   rL   rM   rN   n_fftrO   rP   r$   r$   r%   rQ      s,   


z TimeStretchPerturbation.__init__c                 C   rR   r(   rS   r)   r$   r$   r%   r,      rT   z/TimeStretchPerturbation.max_augmentation_lengthc                 C   s   | j dk rt| j| j}nt| j}|dkrd S |dkr'd}| j}| j}nd}| j	}| j
}t| j| }t| j| }tjj|j||d}trRt||||}	ntj|||}	ttt|j| }
tjj|	|jj||
d}||_d S )Nr   rU   r   rh   )rt   
hop_length)rj   ru   r+   )rD   r   rX   rB   rC   rY   rG   rn   rq   rr   rs   rZ   rk   rl   r[   r\   stftr^   
HAVE_NUMBAr   phase_vocoderroundlenistftrj   )r*   r   ra   fft_multiplierphi_advancescale_bufferrt   ru   rv   stft_stretchlen_stretch	y_stretchr$   r$   r%   r/      s.   

zTimeStretchPerturbation.perturb)r4   r5   r6   rg   Nrd   r$   r$   r$   r%   rf      s
    
!rf   c                   @   sH   e Zd ZdZ						ddededededed	efd
dZdd ZdS )SilencePerturbationa  
    Applies random silence at the start and/or end of the audio.

    Args:
        min_start_silence_secs (float): Min start silence level in secs
        max_start_silence_secs (float): Max start silence level in secs
        min_end_silence_secs (float): Min end silence level in secs
        max_end_silence_secs (float): Max end silence level in secs
        rng (int): Random seed. Default is None
        value: (float): value representing silence to be added to audio array.
    r   Nmin_start_silence_secsmax_start_silence_secsmin_end_silence_secsmax_end_silence_secsrO   valuec                 C   s@   || _ || _|| _|| _|rt|nd  || _d S  || _d S r(   )_min_start_silence_secs_max_start_silence_secs_min_end_silence_secs_max_end_silence_secsr   rI   _value)r*   r   r   r   r   rO   r   r$   r$   r%   rQ   (  s   	

zSilencePerturbation.__init__c                 C   sl   t | j| j}t | j| j}tt||j	 f| j
}tt||j	 f| j
}t||j|g|_d S r(   )r   rX   r   r   r   r   rE   fullrZ   sample_rater   concatenater^   )r*   r   start_silence_lenend_silence_lenstartendr$   r$   r%   r/   9  s
   zSilencePerturbation.perturb)r   r   r   r   Nr   )r0   r1   r2   re   floatrZ   rQ   r/   r$   r$   r$   r%   r     s,    
r   c                   @   "   e Zd ZdZd	ddZdd ZdS )
GainPerturbationz
    Applies random gain to the audio.

    Args:
        min_gain_dbfs (float): Min gain level in dB
        max_gain_dbfs (float): Max gain level in dB
        rng (int): Random seed. Default is None
    
   Nc                 C   &   || _ || _|rt| d S d  d S r(   )_min_gain_dbfs_max_gain_dbfsr   rI   )r*   min_gain_dbfsmax_gain_dbfsrO   r$   r$   r%   rQ   L     zGainPerturbation.__init__c                 C   s(   t | j| j}|jd|d   |_d S )N      $@      4@)r   rX   r   r   r^   )r*   r   gainr$   r$   r%   r/   Q  s   zGainPerturbation.perturb)r   r   Nr0   r1   r2   re   rQ   r/   r$   r$   r$   r%   r   B      
	r   c                   @   s.   e Zd ZdZ						d	ddZdd ZdS )
ImpulsePerturbationa  
    Convolves audio with a Room Impulse Response.

    Args:
        manifest_path (list): Manifest file for RIRs
        audio_tar_filepaths (list): Tar files, if RIR audio files are tarred
        shuffle_n (int): Shuffle parameter for shuffling buffered files from the tar files
        normalize_impulse (bool): Normalize impulse response to zero mean and amplitude 1
        shift_impulse (bool): Shift impulse response to adjust for delay at the beginning
        rng (int): Random seed. Default is None
    N   Fc                 C   s|   t j|tg dd| _d | _d| _|| _|| _d | _	|r-d| _t
|||| _t| j| _	|| _|r:t| j d S d  d S NTparserindex_by_file_idF)r	   ASRAudioTextr
   make_parser	_manifest_audiodataset_tarred_audio_normalize_impulse_shift_impulse_data_iteratorAugmentationDatasetiter_rngr   rI   )r*   manifest_pathaudio_tar_filepaths	shuffle_nnormalize_impulseshift_impulserO   r$   r$   r%   rQ   c  s   	zImpulsePerturbation.__init__c                 C   s   t | j|j| j| jd}| jr!|jt|j }|t	t
| }n|j}t|j}t	t
|jdkr9td d S t|j|d|_| jrUtt
|}|j|d  |_|jd | |_t	t
|jdkrmtd d S |jt	t
|j |_d S )Nr   r    r   z6Zero audio input found, skipping impulse perturbation.r   z2Zero audio input found after impulse perturbation.)r&   r   r   r   r   r   samplesrE   meanmaxabsrz   r^   r   r`   r   fftconvolver   argmax)r*   r   impulseimpulse_normlen_datamax_indr$   r$   r%   r/   {  s.   


zImpulsePerturbation.perturb)NNr   FFNr   r$   r$   r$   r%   r   V  s    
r   c                   @   r   )
ShiftPerturbationa  
    Perturbs audio by shifting the audio in time by a random amount between min_shift_ms and max_shift_ms.
    The final length of the audio is kept unaltered by padding the audio with zeros.


    Args:
        min_shift_ms (float): Minimum time in milliseconds by which audio will be shifted
        max_shift_ms (float): Maximum time in milliseconds by which audio will be shifted
        rng (int): Random seed. Default is None
                @Nc                 C   r   r(   )_min_shift_ms_max_shift_msr   rI   )r*   min_shift_msmax_shift_msrO   r$   r$   r%   rQ     r   zShiftPerturbation.__init__c                 C   s   t | j| j}t|d |jkrd S t||j d }|dk r7|jd | |j| d < d|jd | < d S |dkrR|j|d  |jd | < d|j| d < d S d S )Ni  r   )	r   rX   r   r   r   r   rZ   r   r^   )r*   r   shift_msshift_samplesr$   r$   r%   r/     s   zShiftPerturbation.perturb)r   r   Nr   r$   r$   r$   r%   r     s    
r   c                   @   s\   e Zd ZdZ								ddd	Zed
d Zdd ZdddZdddZ	dddZ
dS )NoisePerturbationa  
    Perturbation that adds noise to input audio.

    Args:
        manifest_path (str): Manifest file with paths to noise files
        min_snr_db (float): Minimum SNR of audio after noise is added
        max_snr_db (float): Maximum SNR of audio after noise is added
        max_gain_db (float): Maximum gain that can be applied on the noise sample
        audio_tar_filepaths (list) : Tar files, if noise audio files are tarred
        shuffle_n (int): Shuffle parameter for shuffling buffered files from the tar files
        orig_sr (int): Original sampling rate of the noise files
        rng (int): Random seed. Default is None
    Nr   2        r@d   >  c	           	      C   s   t j|tg dd| _d | _d| _|| _d | _|r*d| _t	|||| _t
| j| _|r1t|nd  || _|| _|| _|| _d S  || _|| _|| _|| _d S r   )r	   r   r
   r   r   r   r   _orig_srr   r   r   r   rI   r   _min_snr_db_max_snr_db_max_gain_db)	r*   r   
min_snr_db
max_snr_dbmax_gain_dbrO   r   r   rV   r$   r$   r%   rQ     s&   

zNoisePerturbation.__init__c                 C      | j S r(   r   r*   r$   r$   r%   rV        zNoisePerturbation.orig_src                 C   s   t | j|| j| jdS )Nr   )r&   r   r   r   )r*   r   r$   r$   r%   get_one_noise_sample  s   z&NoisePerturbation.get_one_noise_sampler   c                 C   s,   t | j|j| j| jd}| j|||d dS )
        Args:
            data (AudioSegment): audio data
            ref_mic (int): reference mic index for scaling multi-channel audios
        r   )ref_micN)r&   r   r   r   r   perturb_with_input_noiser*   r   r   noiser$   r$   r%   r/     s   zNoisePerturbation.perturbc           
   	   C   s  |j |j krtd|j  d|j  dd|  kr|j k s+n td|j  d| dt| j| j}| rItd|j	 d	|j
 d
|j d |du rP|j}| rltd|j	 d	|j
 d
|j d td }n|j}| r~| r~td dS |j dkr|| ||  | }n|| | }t|| j}td|j|j }|j||j kr|j|||j d || |jjd |jjd k rtd|jjd |jjd  }	|j|	|	|jjd    |j7  < dS | j|j7  _dS )a  
        Args:
            data (AudioSegment): audio data
            noise (AudioSegment): noise data
            data_rms (Union[float, List[float]): rms_db for data input
            ref_mic (int): reference mic index for scaling multi-channel audios
        $Found mismatched channels for data () and noise ().r   , reference mic ID must be an integer in [0, ), got 	 instead.zEmpty audio segment found for z with offset z and duration .NzEmpty noise segment found for infz>Both data and noise segments are empty. Skipping perturbation.r   r7   
start_timeend_time)num_channelsr@   r   rX   r   r   is_emptyr   r`   r   r   r   rms_dbr   r?   r   
subsegmentgain_dbr^   shaperandint)
r*   r   r   data_rmsr   snr_db	noise_rmsnoise_gain_dbr   	noise_idxr$   r$   r%   r   	  sH   


 (z*NoisePerturbation.perturb_with_input_noiserh   r   c                 C   s  |j |j krtd|j  d|j  dd|  kr|j k s+n td|j  d| dt| j| j}|s8|j}|j dkrI|| |j|  | }n||j | }t|| j}t	d|}	t
|	D ]m}
td	|}td	|j}tt||j }ttt|j|| |j }t|j|| }|d
|d  9 }|jd |jjd kr|d|jjd  }t	d|jjd |jd  }|j|||jd    |7  < q`dS )a  
        Args:
            data (AudioSegment): audio data
            noise (AudioSegment): noise data
            data_rms (Union[float, List[float]): rms_db for data input
            max_noise_dur: (float): max noise duration
            max_additions (int): number of times for adding noise
            ref_mic (int): reference mic index for scaling multi-channel audios
        r   r   r   r   r   r   r   r   r7   r   r   N)r   r@   r   rX   r   r   r   r?   r   r   ranger   rZ   ry   r   rE   copyr^   r   )r*   r   r   r   max_noise_durmax_additionsr   r   r   n_additionsi	noise_durr   start_sample
end_samplenoise_samplesr   r$   r$   r%   perturb_with_foreground_noiseF  s8   

"z/NoisePerturbation.perturb_with_foreground_noise)Nr   r   r   NNr   r   r   Nr   )Nrh   r   r   )r0   r1   r2   re   rQ   propertyrV   r   r/   r   r  r$   r$   r$   r%   r     s"    



=r   c                   @   sx   e Zd ZdZ														
dddZedd Zdd ZdddZdddZ	dd Z
d ddZd!ddZdS )""NoisePerturbationWithNormalizationa  
    Perturbation that adds noise to input audio, with normalisation to specific decibel level.
    Also tiles shorter noise samples up to their corresponding clean audio length.

    Args:
        manifest_path (str or list): Manifest file with paths to noise files, can be list if using multiple noise sources
        min_snr_db (float): Minimum SNR of audio after noise is added
        max_snr_db (float): Maximum SNR of audio after noise is added
        snr_samples (list): A discrete list of SNRs DBs to sample from when mixing, will be used instead of [min_snr_db,max_snr_db]
        norm_to_db (float): Will normalise clean, noise, and mixed samples to this DB
        audio_tar_filepaths (str or list) : Tar files, if noise audio files are tarred, can be list for multiple sources
        shuffle_n (int): Shuffle parameter for shuffling buffered files from the tar files
        orig_sr (int): Original sampling rate of the noise files
        rng (int): Random seed. Default is None
        shard_strategy (str): if you're using tarred audio and wish to scatter instead of replicate, set this to 'scatter'
        epsilon (float): minimum value for RMS DB normalisation to avoid divide by zero
    Nr   r   r   r   r   r   	replicate{Gz?c              	   C   s4  ddl m} tj|tg dd| _d | _d| _|	| _	d | _
|r%t|nd  || _|r|d| _t|tr7|g}t|tr?|g}g }t||D ]\}}t||||
||d}|| qF|||r`|ntdd|
 d| _t| jdkrvtd	t| j| _
|| _|| _|| _t|trt|dkr|nd | _|| _d S )
Nr   )RandomizedChainDatasetTr   F)rank
world_sizeshard_strategyi0u  )rnd_seedzeNoisePerturbationWithNormalization detected a zero length RandomizedChainDataset, should never happen)'nemo.collections.asr.data.audio_to_textr  r	   r   r
   r   r   r   r   r   r   r   rI   r   
isinstancestrzipr   appendr   rz   RuntimeErrorr   r   r   _norm_to_dblist_snr_samples_epsilon)r*   r   r   r   snr_samples
norm_to_dbrO   r   r   rV   global_rankr  r  epsilonr  datasetstarred_audio_filepathmanifest_filepathdatasetr$   r$   r%   rQ     sN   

 
z+NoisePerturbationWithNormalization.__init__c                 C   r   r(   r   r   r$   r$   r%   rV     r   z*NoisePerturbationWithNormalization.orig_src                 C   s   | j rC| jd u rtdz
t| j\}}}W n ty-   t| j| _t| j\}}}Y nw |jd u r5dn|j}|jd u r?dn|j}n!t	
| jjdd }|j}|jd u rWdn|j}|jd u radn|j}tj||||dS )Nz$Expected valid iterator but got Noner   r   r   )r   r   r   r   StopIterationr   r   r   r   r   r   r   r   r   r   r   )r*   r   r   r!   r"   r   r   r#   r$   r$   r%   r&     s    
z8NoisePerturbationWithNormalization.read_one_audiosegmentc                 C   sD   |  |j}|jdk r|  |j}|jdk s| j|||| jd dS )r   r   )r   r  N)r&   r   r   r   r  r   r$   r$   r%   r/     s
   

z*NoisePerturbationWithNormalization.perturb      9c                 C   s@   |  ||}|  ||}d| d  }|| }|| }|||fS )a1  
        Mixes the clean audio with the noise
        Args:
            clean (numpy array): the clean audio data
            noise (numpy array): the noise audio data
            snr (float): the SNR value for the mixing
            norm_to_db (float): the DB value to normalise to before mixing
        r   r   )norm_audio_to_db)r*   cleanr   snrr  noisescalarnoisenewlevelnoisyspeechr$   r$   r%   	snr_mixer  s   	
z,NoisePerturbationWithNormalization.snr_mixerc                 C   sD   |d j ddd }tt|d| j|}d|d  | }|| S )z
        Normalises audio signal to particular db, with some epsilon in-case of divide by zero
        Args:
            x (numpy array): input audio signal
            norm_to_db (float): the db to normalise to
        rh   r   axis      ?r   r   )r   rE   whereiscloser  )r*   xr  rmsscalarr$   r$   r%   r%    s   z3NoisePerturbationWithNormalization.norm_audio_to_db      ?c                 C   s   t |t |k r>|jdkrtt|| |jd f}n
tt|| f}tj||dd}tj||dd}t |t |k s|S )a  
        Tiles the noise array to match the clean audio array, with small silence between the joins
        Args:
            clean (numpy array): clean audio data
            noise (numpy array): noise audio data
            fs (int): sample rate used by both clean and noise audio data
            silence_length (float): the amount of silence (in secs) to insert before tiling
        r   r   r,  )rz   ndimrE   zerosrZ   r   r  )r*   r&  r   fssilence_lengthr7  noiseconcatr$   r$   r%   concatenate_noise_sample  s   	
z;NoisePerturbationWithNormalization.concatenate_noise_samplec                 C   s$  |j |j krtd|j  d|j  dd|  kr|j k s+n td|j  d| d| jr8t| jdd }nt| j| j}|d	u rUt|j	t
tjfrR|j	| n|j	}|d	u r[|}|j}|j}t|dkrid	S t|t|k ry| |||j}|dt| }| j||||d
\}	}	}
|
|_d	S )a  
        Args:
            data (AudioSegment): audio data
            noise (AudioSegment): noise data
            data_rms (Union[float, List[float]): rms_db for data input
            ref_mic (int): reference mic index for scaling multi-channel audio, if set to None then
                           each channel will be scaled independently
            norm_to_db (float): will normalise all audio to this DB
        r   r   r   r   r   r   r   r   N)r&  r   r'  r  )r   r@   r  r   r   rX   r   r   r  r   r  rE   ndarrayr^   rz   r;  r   r+  )r*   r   r   r   r   r  r   	data_norm
noise_norm_	noisy_snrr$   r$   r%   r   "  s0   
"
z;NoisePerturbationWithNormalization.perturb_with_input_noise)Nr   r   NNNNr   r   r   r   r
  r  r  )r$  )r4  )Nr   r$  )r0   r1   r2   re   rQ   r  rV   r&   r/   r+  r%  r;  r   r$   r$   r$   r%   r	  v  s0    
<



r	  c                   @   r   )
WhiteNoisePerturbationa5  
    Perturbation that adds white noise to an audio file in the training dataset.

    Args:
        min_level (int): Minimum level in dB at which white noise should be added
        max_level (int): Maximum level in dB at which white noise should be added
        rng (int): Random seed. Default is None
    Nc                 C   s0   t || _t || _|rtj| d S d  d S r(   )rZ   	min_level	max_levelrE   r   rI   )r*   rD  rE  rO   r$   r$   r%   rQ   Y  s   

zWhiteNoisePerturbation.__init__c                 C   sH   t jj| j| jdd}t j|jjd d|d   }| j|7  _d S )Nint32ri   r   r   r   )rE   r   r   rD  rE  randnr^   r   )r*   r   noise_level_dbnoise_signalr$   r$   r%   r/   ^  s    zWhiteNoisePerturbation.perturb)rB  rC  Nr   r$   r$   r$   r%   rA  O  r   rA  c                   @   sJ   e Zd ZdZ														
							dddZdd ZdS )RirAndNoisePerturbationa  
    RIR augmentation with additive foreground and background noise.
    In this implementation audio data is augmented by first convolving the audio with a Room Impulse Response
    and then adding foreground noise and background noise at various SNRs. RIR, foreground and background noises
    should either be supplied with a manifest file or as tarred audio files (faster).

    Different sets of noise audio files based on the original sampling rate of the noise. This is useful while
    training a mixed sample rate model. For example, when training a mixed model with 8 kHz and 16 kHz audio with a
    target sampling rate of 16 kHz, one would want to augment 8 kHz data with 8 kHz noise rather than 16 kHz noise.

    Args:
        rir_manifest_path: Manifest file for RIRs
        rir_tar_filepaths: Tar files, if RIR audio files are tarred
        rir_prob: Probability of applying a RIR
        noise_manifest_paths: Foreground noise manifest path
        min_snr_db: Min SNR for foreground noise
        max_snr_db: Max SNR for background noise,
        noise_tar_filepaths: Tar files, if noise files are tarred
        apply_noise_rir: Whether to convolve foreground noise with a a random RIR
        orig_sample_rate: Original sampling rate of foreground noise audio
        max_additions: Max number of times foreground noise is added to an utterance,
        max_duration: Max duration of foreground noise
        bg_noise_manifest_paths: Background noise manifest path
        bg_min_snr_db: Min SNR for background noise
        bg_max_snr_db: Max SNR for background noise
        bg_noise_tar_filepaths: Tar files, if noise files are tarred
        bg_orig_sample_rate: Original sampling rate of background noise audio
        rng: Random seed. Default is None

    Nr.  rU   r   r   r   Fr6          @r   c                 C   s  || _ || _|| _|rt|nd  t|||dd| _d | _d | _|rNi | _t	t
|D ]!}|d u r5d}n|| }t|| || || |	| |d| j|< q,|| _|| _|ri | _t	t
|D ]!}|d u rhd}n|| }t|| || || || |d| j|< q_|
| _d S )NT)r   r   r   r   r   )r   r   r   r   rV   )	_rir_prob_noise_prob_bg_noise_probr   rI   r   _rir_perturber_fg_noise_perturbers_bg_noise_perturbersr   rz   r   _max_additions_max_duration_apply_noise_rir)r*   rir_manifest_pathrir_probnoise_manifest_paths
noise_probr   r   rir_tar_filepathsrir_shuffle_nnoise_tar_filepathsapply_noise_rirorig_sample_rater   max_durationbg_noise_manifest_pathsbg_noise_probbg_min_snr_dbbg_max_snr_dbbg_noise_tar_filepathsbg_orig_sample_raterO   r   rV   r$   r$   r%   rQ     sT   
z RirAndNoisePerturbation.__init__c                 C   s  t dd}|| jk r| j| |j}| jd urQt dd| jk rQ|j}|| jvr1t	| j
 }| j| }||j}| jrE| j| |j|||| j| jd | jd urt dd| jk r|j}|| jvrnt	| j
 }| j| }||j}|j|||d d S d S d S )Nr7   rU   )r   r   r   )r   )r   rX   rL  rO  r/   r   rP  rM  rV   r   keysr   r   rT  r  rS  rR  rQ  rN  r   )r*   r   probr   rV   fg_perturberr   bg_perturberr$   r$   r%   r/     s.   




zRirAndNoisePerturbation.perturb)Nr.  NrU   r   r   Nr   NFNr6   rK  NrU   r   r   NNNr   r$   r$   r$   r%   rJ  d  s0    !
ErJ  c                   @   s"   e Zd ZdZdddZdd ZdS )TranscodePerturbationak  
    Audio codec augmentation. This implementation uses sox to transcode audio with low rate audio codecs,
    so users need to make sure that the installed sox version supports the codecs used here (G711 and amr-nb).

    Args:
        codecs (List[str]):A list of codecs to be trancoded to. Default is None.
        rng (int): Random seed. Default is None.
    Nc                 C   sf   |rt |nd  |d ur|ng d| _d| _|d ur/|D ]}|dvr.td| d| dqd S d S )N)g711amr-nbogg皙?zTranscodePerturbation with z isnot supported. Only z are supported)r   rI   _codecs
att_factorr@   )r*   codecsrO   codecr$   r$   r%   rQ     s   zTranscodePerturbation.__init__c                 C   s  t t |j}|dkr| j| }||j }n|j}tdd}t|j|	 d t
dt| jd }| j| dkrgtdd}ttdd	}|t
dt|d  }	tjd
|j d|	 d|j dd}
nN| j| dkrtdd}ttdd}|t
dt|d  }	tjd
|j d|	 d|j dd}
n| j| dkrtdd}tjd
|j d|j ddd}
tj|jdd}|jd|jjd  |_d S )Nrm  z.wav)suffixr   r   r   rk  z_amr.wav   zsox z -V0 -C z2 -t amr-nb - | sox -t amr-nb - -V0 -b 16 -r 16000 T)shellrl  z_ogg.wavr5     z, -t ogg - | sox -t ogg - -V0 -b 16 -r 16000 rj  z	_g711.wavz -V0  -r 8000 -c 1 -e a-law z lowpass 3400 highpass 300)r   )rE   r   r   r^   ro  r   sfwritename	transposer   r   rz   rn  r  r   
subprocesscheck_outputr   r   r   )r*   r   rE  norm_factornorm_samplesorig_f	codec_indtranscoded_fratesrater?  new_datar$   r$   r%   r/     sB   




zTranscodePerturbation.perturbNNr   r$   r$   r$   r%   ri    s    
	ri  c                   @   s*   e Zd ZdZ	dddZdefd	d
ZdS )RandomSegmentPerturbationa  
    Returns a random segment from input of duration "duration_sec".
    If duration_sec > input audio length, pad_to_duration determines the outcome.

    RandomSegmentPerturbation is intended for self-supervised learning.
    Not for supervised, as extracting corresponding text is not facilitated.


    Args:
        duration_sec (float): duration of the segment to be extracted
        pad_to_duration (bool): zero pad if length of input audio < duration_sec
        rng: Random seed. Default is None
        min_rms_db: Minimum RMS db value for the perturbed audio. Default is None
        max_trials: Maximum number of trials to find a segment with RMS db > min_rms_db. Default is 10
        verbose: If True, logs a warning if RMS db < min_rms_db after max_trials. Default is False
          @@FNr   c                 C   sH   |dkrt d|| _|| _|| _|| _|| _|r t| d S d  d S )Nr   zduration_sec should be > 0)r@   _duration_sec_pad_to_duration_min_rms_db_max_trials_verboser   rI   )r*   duration_secpad_to_durationrO   
min_rms_db
max_trialsverboser$   r$   r%   rQ   5  s   z"RandomSegmentPerturbation.__init__r   c                 C   s`  | j |jkr| jsd S d}| j |j |j }|j|d n
td|j| j  }|| j  }t	|}|j
||d | jd ur|jdkrE|jnt|j}d}|| jk r|| jk rtd|j| j  }|| j  }t	|}|j
||d |jdkry|jnt|j}|d7 }|| jk r|| jk sV| jr|| jkr|| jk rtd| j d| j d |j
||d d S )	Nr7   )pad_sizer   r   r   z'Could not find a segment with RMS db > z after z trials.)r  r   r  r   num_samplespadr   rX   r   deepcopyr   r  r   r   r?   r  r  r   r`   )r*   r   r   r  r   r  r   trialr$   r$   r%   r/   B  s4   




z!RandomSegmentPerturbation.perturb)r  FNNr   F)r0   r1   r2   re   rQ   r   r/   r$   r$   r$   r%   r  #  s
    
r  )speedtime_stretchr   silencer   shiftr   r>  white_noiserir_noise_augtranscode_augrandom_segmentrx  perturbationc                 C   s2   | t  v rtd|  dt |   d|t | < d S )NzPerturbation with the name z  exists. Type of perturbation : r   )perturbation_typesre  KeyError)rx  r  r$   r$   r%   register_perturbationq  s
   r  c                   @   s2   e Zd Zd
ddZdd Zdd Zedd	 ZdS )AudioAugmentorNc                 C   s.   |rt |nd  |d ur|| _d S g | _d S r(   )r   rI   	_pipeline)r*   perturbationsrO   r$   r$   r%   rQ   {  s   zAudioAugmentor.__init__c                 C   s*   | j D ]\}}t |k r|| qd S r(   )r  r   r/   )r*   segmentrf  pr$   r$   r%   r/     s
   
zAudioAugmentor.perturbc                 C   s"   |}| j D ]	\}}||}q|S r(   )r  r,   )r*   r+   newlenrf  r  r$   r$   r%   r,     s   z&AudioAugmentor.max_augmentation_lengthc              	   C   sb   g }|D ]'}|d t vrtd|d  qt |d  }||d |di |d f q| |dS )Naug_typez$%s perturbation not known. Skipping.rf  cfgr  r$   )r  r   r`   r  )clsconfigptbsr  r  r$   r$   r%   from_config  s   "
zAudioAugmentor.from_configr  )r0   r1   r2   rQ   r/   r,   classmethodr  r$   r$   r$   r%   r  z  s    
r  r   returnc              	   C   sH  | du rdS t | tr| S th}trtth}t| |vr tdtr.t | tr.tj| dd} t	
| } g }|  D ]c\}}|dd}|du rOtd| d|d}|dk s\|d	kr`td
z,t| }	dt|	jv rq||d< dt|	jv r}||d< |	di |}
|||
g W q9 ty   tdt  w t|d} | S )a  Process list of online data augmentations.
    Accepts either an AudioAugmentor object with pre-defined augmentations,
    or a dictionary that points to augmentations that have been defined.
    If a dictionary is passed, must follow the below structure:
    Dict[str, Dict[str, Any]]: Which refers to a dictionary of string
    names for augmentations, defined in `asr/parts/perturb.py`.
    The inner dictionary may contain key-value arguments of the specific
    augmentation, along with an essential key `prob`. `prob` declares the
    probability of the augmentation being applied, and must be a float
    value in the range [0, 1].
    # Example in YAML config file
    Augmentations are generally applied only during training, so we can add
    these augmentations to our yaml config file, and modify the behaviour
    for training and evaluation.
    ```yaml
    AudioToSpeechLabelDataLayer:
        ...  # Parameters shared between train and evaluation time
        train:
            augmentor:
                shift:
                    prob: 0.5
                    min_shift_ms: -5.0
                    max_shift_ms: 5.0
                white_noise:
                    prob: 1.0
                    min_level: -90
                    max_level: -46
                ...
        eval:
            ...
    ```
    Then in the training script,
    ```python
    import copy
    from ruamel.yaml import YAML
    yaml = YAML(typ="safe")
    with open(model_config) as f:
        params = yaml.load(f)
    # Train Config for Data Loader
    train_dl_params = copy.deepcopy(params["AudioToTextDataLayer"])
    train_dl_params.update(params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    data_layer_train = nemo_asr.AudioToTextDataLayer(
        ...,
        **train_dl_params,
    )
    # Evaluation Config for Data Loader
    eval_dl_params = copy.deepcopy(params["AudioToTextDataLayer"])
    eval_dl_params.update(params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        ...,
        **eval_dl_params,
    )
    ```
    # Registering your own Augmentations
    To register custom augmentations to obtain the above convenience of
    the declaring the augmentations in YAML, you can put additional keys in
    `perturbation_types` dictionary as follows.
    ```python
    from nemo.collections.asr.parts import perturb
    # Define your own perturbation here
    class CustomPerturbation(perturb.Perturbation):
        ...
    perturb.register_perturbation(name_of_perturbation, CustomPerturbation)
    ```
    Args:
        augmenter: AudioAugmentor object or
            dictionary of str -> kwargs (dict) which is parsed and used
            to initialize an AudioAugmentor.
            Note: It is crucial that each individual augmentation has
            a keyword `prob`, that defines a float probability in the
            the range [0, 1] of this augmentation being applied.
            If this keyword is not present, then the augmentation is
            disabled and a warning is logged.
    Returns: AudioAugmentor object
    NzCCannot parse augmenter. Must be a dict or an AudioAugmentor object T)resolverf  zAugmentation "zW" will not be applied as keyword argument "prob" was not defined for this augmentation.r7   rU   z-`prob` must be a float value between 0 and 1.r  r  z,Invalid perturbation name. Allowed values : r  r$   )r  r  dictHAVE_OMEGACONG_WEBDATASETr   typer@   r   to_containerr   r  itemsgetr  popr  inspect	signature
parametersr  re  )	augmenterr  r  augmenter_typesaugmentationsaugment_nameaugment_kwargsrf  r?  augmentation_classaugmentationr$   r$   r%   process_augmentations  sF   P




r  c                   @   s`   e Zd ZdZ				ddedeeee f ded	ed
edefddZdd Z	dd Z
dd ZdS )r   a  
    A class that loads tarred audio files and cycles over the files in the dataset.
    Accepts a single comma-separated JSON manifest file (in the same style as for the AudioToCharDataset/AudioToBPEDataset),
    as well as the path(s) to the tarball(s) containing the wav files. Each line of the manifest should
    contain the information for one audio file, including at least the transcript and name of the audio
    file within the tarball.
    Valid formats for the audio_tar_filepaths argument include:
    (1) a single string that can be brace-expanded, e.g. 'path/to/audio.tar' or 'path/to/audio_{1..100}.tar.gz', or
    (2) a list of file paths that will not be brace-expanded, e.g. ['audio_1.tar', 'audio_2.tar', ...].
    Note: For brace expansion in (1), there may be cases where `{x..y}` syntax cannot be used due to shell interference.
    This occurs most commonly inside SLURM scripts. Therefore we provide a few equivalent replacements.
    Supported opening braces - { <=> (, [, < and the special tag _OP_.
    Supported closing braces - } <=> ), ], > and the special tag _CL_.
    For SLURM based tasks, we suggest the use of the special tags for ease of use.
    See the WebDataset documentation for more information about accepted data and input formats.
    r   r   r   r
  r   tar_filepathsr   r  r  r  c              
   C   s   ddl m} tj|tg dd| _|||||d}ts t| t	
t	j|dt	|t	 t	jddd	t	d
d| j| _d S )Nr   )expand_sharded_filepathsTr   )r  r  r  )urlszwav;ogg;flac__key__)audiokeyr  r  )r  r  r	   r   r
   r   r   r  r   wdsDataPipelineSimpleShardListshuffletarfile_to_samplesrenameto_tuple_loop_offsetsr    )r*   r   r  r   r  r  r  r  r$   r$   r%   rQ   )  s   



zAugmentationDataset.__init__c                 C   s
   t | jS r(   )rz   r   r   r$   r$   r%   __len__F  rT   zAugmentationDataset.__len__c                    s   G  fddd}|| j S )zYThis function is used to iterate through utterances with different offsets for each file.c                       s(   e Zd Z fddZdd Zdd ZdS )zAAugmentationDataset._loop_offsets.<locals>.TarredAudioLoopOffsetsc                    s"    | _ || _d | _d | _d| _d S r  )iterator
collection
current_fncurrent_bytes	offset_id)r*   r  r  r$   r%   rQ   M  s
   
zJAugmentationDataset._loop_offsets.<locals>.TarredAudioLoopOffsets.__init__c                 S   s   | S r(   r$   r   r$   r$   r%   __iter__T  r-   zJAugmentationDataset._loop_offsets.<locals>.TarredAudioLoopOffsets.__iter__c                 S   s|   | j d u rt| j\| _| _ d| _n$| jj| j  }t|| jd kr/t| j\| _| _ d| _n|  jd7  _| j| j | jfS )Nr   r   )r  r   r  r  r  r  mappingrz   )r*   offset_listr$   r$   r%   __next__W  s   
zJAugmentationDataset._loop_offsets.<locals>.TarredAudioLoopOffsets.__next__N)r0   r1   r2   rQ   r  r  r$   r  r$   r%   TarredAudioLoopOffsetsL  s    r  )r   )r*   r  r  r$   r  r%   r  I  s   
z!AugmentationDataset._loop_offsetsc           
      c   s    t | j}	 z-t|\}}}tjtj|\}}| jj| | }| j| }t	
|}	|	||fV  W n tyB   t | j}Y nw qr(   )r   r    r   ospathsplitextbasenamer   r  ioBytesIOr#  )
r*   
audio_iteraudio_bytesaudio_filenamer  r!   r?  manifest_idxr"   r   r$   r$   r%   r  g  s   


zAugmentationDataset.__iter__N)r   r   r   r
  )r0   r1   r2   re   r  r   r   rZ   rQ   r  r  r  r$   r$   r$   r%   r     s,    
r   )FN)r   r   )?r   r  r  r  r   rz  tempfiler   typingr   r   r   r   r[   numpyrE   	soundfilerv  r<   r   0nemo.collections.asr.parts.preprocessing.segmentr   +nemo.collections.common.parts.preprocessingr	   r
   nemo.core.classesr   
nemo.utilsr   r  	omegaconfr   r   r   r  ModuleNotFoundErrornemo.utils.exceptionsr    nemo.collections.asr.parts.utilsr   rw   ImportErrorr&   objectr'   r3   rf   r   r   r   r   r   r	  rA  rJ  ri  r  r  r  r  r  r  r   r$   r$   r$   r%   <module>   s   "
Im'P  1 Z <?	 