o
    7wi                     @   s  d Z ddlZddlZddlm  mZ ddlZddlm	Z	 ddl
mZ ddlmZmZmZmZmZ G dd dejjZG dd	 d	ejjZG d
d dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZd&d d!ZG d"d# d#ejjZ G d$d% d%ejjZ!dS )'a  Time-Domain Sequential Data Augmentation Classes

This module contains classes designed for augmenting sequential data in the time domain.
It is particularly useful for enhancing the robustness of neural models during training.
The available data distortions include adding noise, applying reverberation, adjusting playback speed, and more.
All classes are implemented as `torch.nn.Module`, enabling end-to-end differentiability and gradient backpropagation.

Authors:
- Peter Plantinga (2020)
- Mirco Ravanelli (2023)
    N)make_dataloader)ExtendedCSVDataset)compute_amplitude
convolve1ddB_to_amplitudenotch_filterreverberatec                       sj   e Zd ZdZdddddddddeji ddf fdd	Zd	d
 Zdd Zdd Z	e
dd Zdd Z  ZS )AddNoiseaf
  This class additively combines a noise signal to the input signal.

    Arguments
    ---------
    csv_file : str
        The name of a csv file containing the location of the
        noise audio files. If none is provided, white noise will be used.
    csv_keys : list, None, optional
        Default: None . One data entry for the noise data should be specified.
        If None, the csv file is expected to have only one data entry.
    sorting : str
        The order to iterate the csv file, from one of the
        following options: random, original, ascending, and descending.
    num_workers : int
        Number of workers in the DataLoader (See PyTorch DataLoader docs).
    snr_low : int
        The low end of the mixing ratios, in decibels.
    snr_high : int
        The high end of the mixing ratios, in decibels.
    pad_noise : bool
        If True, copy noise signals that are shorter than
        their corresponding clean signals so as to cover the whole clean
        signal. Otherwise, leave the noise un-padded.
    start_index : int
        The index in the noise waveforms to start from. By default, chooses
        a random index in [0, len(noise) - len(waveforms)].
    normalize : bool
        If True, output noisy signals that exceed [-1,1] will be
        normalized to [-1,1].
    noise_funct: funct object
        function to use to draw a noisy sample. It is enabled if the csv files
        containing the noisy sequences are not provided. By default,
        torch.randn_like is used (to sample white noise). In general, it must
        be a function that takes in input the original waveform and returns
        a tensor with the corresponding noise to add (e.g., see pink_noise_like).
    replacements : dict
        A set of string replacements to carry out in the
        csv file. Each time a key is found in the text, it will be replaced
        with the corresponding value.
    noise_sample_rate : int
        The sample rate of the noise audio signals, so noise can be resampled
        to the clean sample rate if necessary.
    clean_sample_rate : int
        The sample rate of the clean audio signals, so noise can be resampled
        to the clean sample rate if necessary.

    Example
    -------
    >>> import pytest
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> clean = signal.unsqueeze(0) # [batch, time, channels]
    >>> noisifier = AddNoise('tests/samples/annotation/noise.csv',
    ...                     replacements={'noise_folder': 'tests/samples/noise'})
    >>> noisy = noisifier(clean, torch.ones(1))
    Nrandomr   F>  c                    s\   t    || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|| _|
| _|| _|| _d S N)super__init__csv_filecsv_keyssortingnum_workerssnr_lowsnr_high	pad_noisestart_index	normalizereplacementsnoise_functnoise_sample_rateclean_sample_rate)selfr   r   r   r   r   r   r   r   r   r   r   r   r   	__class__ \/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/speechbrain/augment/time_domain.pyr   Y   s   

zAddNoise.__init__c                 C   sH  |  }||jd  d}t||dd}tjt|d|jd}|| j| j	  | j	 }dt
|d  }t|jdkr?|d}|| }|d| 9 }| jdu rj| |}|jd dkrgtj|g|jd  dd}|}	n|jd }
| ||
\}}	t||	dd}|||d	  9 }||7 }| jrtjt|dd
d\}}||jdd }|S )aY  
        Arguments
        ---------
        waveforms : torch.Tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.
        lengths : torch.Tensor
            Shape should be a single dimension, `[batch]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
           rms)amp_typedevice   Nr   dim+=Tr(   keepdim      ?min)cloneshape	unsqueezer   torchrandlenr%   r   r   r   r   r   cat_load_noiser   maxabsclamp)r   	waveformslengthsnoisy_waveformclean_amplitudeSNRnoise_amplitude_factornew_noise_amplitudenoise_waveformnoise_lengthtensor_lengthnoise_amplitudeabs_max_r   r   r    forwardy   s@   




zAddNoise.forwardc                 C   s  |  d}t|}t| dsM| j| jkrt| j| j| _|j| _| j	durMt
| j	| j| jdkr4| jnd| jd}t||| j| jdkd| _t| j| _| |\}}||j}||j}t| drj| |}||jd    }| jrt||k rt|}|ddd|f }tj||fdd	}||7 }t||k s}n|d|k rd
||d f}	tjj||	}| j}
| jdu rd
}
||  j dd}tj!|d|jd}
|dd|
|
| f }||
 j |d"d}||fS )zLoad a batch of noisesr!   data_loaderNr
   original)csvpathoutput_keysr   r   )
batch_sizer   shuffle	resampler)axisr   r-   r!   )highsizer%   )r7   )#longsqueezer4   hasattrr   r   ResamplerN   r%   r   r   r   r   r   r   r   rH   iter
noise_data_load_noise_batch_of_sizetor0   r   r2   anyr.   r5   rR   nn
functionalpadr   r9   randintr1   )r   r;   
max_lengthrL   datasetnoise_batch	noise_lenmin_lenprependpaddingr   max_chopr   r   r    r6      sb   





zAddNoise._load_noisec                 C   sp   |   \}}t||k r"|   \}}t||||\}}t||k st||kr4|d| }|d| }||fS )z4Concatenate noise batches, then chop to correct sizeN)_load_noise_batchr4   r	   _concat_batch)r   rL   rb   
noise_lensadded_noise
added_lensr   r   r    rY     s   z"AddNoise._load_noise_batch_of_sizec                 C   s   | j d }|j d }dt|| f}||kr%tjj||}|| | }ntjj| |} || | }t| |f} t||f}| |fS )z>Concatenate two noise batches of potentially different lengthsr!   r   )r0   r8   r2   r\   r]   r^   r5   )rb   rj   rk   rl   noise_tensor_lenadded_tensor_lenr^   r   r   r    ri     s   

zAddNoise._concat_batchc                 C   sX   zt | jd\}}W ||fS  ty+   t| j| _t | jd\}}Y ||fS w )z:Load a batch of noises, restarting iteration if necessary.r   )nextrX   at_positionStopIterationrW   rH   )r   noiseslensr   r   r    rh   *  s   zAddNoise._load_noise_batch)__name__
__module____qualname____doc__r2   
randn_liker   rG   r6   rY   staticmethodri   rh   __classcell__r   r   r   r    r	      s,    ; AI
r	   c                       s>   e Zd ZdZdddi ddf fdd	Zdd	 Zd
d Z  ZS )	AddReverbag  This class convolves an audio signal with an impulse response.

    Arguments
    ---------
    csv_file : str
        The name of a csv file containing the location of the
        impulse response files.
    sorting : str
        The order to iterate the csv file, from one of
        the following options: random, original, ascending, and descending.
    num_workers : int
        Number of workers in the DataLoader (See PyTorch DataLoader docs).
    rir_scale_factor: float
        It compresses or dilates the given impulse response.
        If 0 < scale_factor < 1, the impulse response is compressed
        (less reverb), while if scale_factor > 1 it is dilated
        (more reverb).
    replacements : dict
        A set of string replacements to carry out in the
        csv file. Each time a key is found in the text, it will be replaced
        with the corresponding value.
    reverb_sample_rate : int
        The sample rate of the corruption signals (rirs), so that they
        can be resampled to clean sample rate if necessary.
    clean_sample_rate : int
        The sample rate of the clean signals, so that the corruption
        signals can be resampled to the clean sample rate before convolution.

    Example
    -------
    >>> import pytest
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> clean = signal.unsqueeze(0) # [batch, time, channels]
    >>> reverb = AddReverb('tests/samples/annotation/RIRs.csv',
    ...                     replacements={'rir_folder': 'tests/samples/RIRs'})
    >>> reverbed = reverb(clean)
    r
   r   r,   r   c                    s8   t    || _|| _|| _|| _|| _|| _|| _d S r   )	r   r   r   r   r   r   reverb_sample_rater   rir_scale_factor)r   r   r   r   r}   r   r|   r   r   r   r    r   ^  s   


zAddReverb.__init__c                 C   s   | j | jkrt| j | j| _d}t|jdkr|d}d}| |}t| dr-| |}| j	dkrFt
j|dd| j	ddd}|dd}t||d	d
}|rT|dS |S )
        Arguments
        ---------
        waveforms : torch.Tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        F   TrN   r!   linear)scale_factormodealign_cornersavg)rescale_amp)r|   r   rV   rN   r4   r0   r1   	_load_rirrU   r}   Finterpolate	transposer   rT   )r   r:   channel_addedrir_waveformrev_waveformr   r   r    rG   q  s.   






zAddReverb.forwardc                 C   s   t | ds(t| j| jdkr| jnd| jd}t|| jdk| jd| _t| j| _	zt
| j	d\}}W n tyM   t| j| _	t
| j	d\}}Y nw t|jdkrZ|d}||j}||jS )	NrH   r
   rI   )rJ   r   r   )rM   r   r   r   r   )rU   r   r   r   r   r   r   rH   rW   rir_dataro   rp   rq   r4   r0   r1   typedtyperZ   r%   )r   r:   ra   r   lengthr   r   r    r     s,   

zAddReverb._load_rir)rt   ru   rv   rw   r   rG   r   rz   r   r   r   r    r{   6  s    *0r{   c                       s2   e Zd ZdZg ddf fdd	Zdd Z  ZS )SpeedPerturba  Slightly speed up or slow down an audio signal.

    Resample the audio signal at a rate that is similar to the original rate,
    to achieve a slightly slower or slightly faster signal. This technique is
    outlined in the paper: "Audio Augmentation for Speech Recognition"

    Arguments
    ---------
    orig_freq : int
        The frequency of the original signal.
    speeds : list
        The speeds that the signal should be changed to, as a percentage of the
        original signal (i.e. `speeds` is divided by 100 to get a ratio).
    device : str
        The device to use for the resampling.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> perturbator = SpeedPerturb(orig_freq=16000, speeds=[90])
    >>> clean = signal.unsqueeze(0)
    >>> perturbed = perturbator(clean)
    >>> clean.shape
    torch.Size([1, 52173])
    >>> perturbed.shape
    torch.Size([1, 46956])
    )Z   d   n   cpuc                    sd   t    || _|| _|| _d| _g | _| jD ]}| j| j| d d}| jtdi | qd S )Nr   r   	orig_freqnew_freqr   )	r   r   r   speedsr%   
samp_index
resamplersappendrV   )r   r   r   r%   speedconfigr   r   r    r     s   

zSpeedPerturb.__init__c                 C   s:   t dt| jd| _| j| j || j}||jS )a  
        Arguments
        ---------
        waveform : torch.Tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        torch.Tensor of shape `[batch, time]` or `[batch, time, channels]`.
        r   rP   )r2   r_   r4   r   r   r   rZ   r%   )r   waveformperturbed_waveformr   r   r    rG     s
   

zSpeedPerturb.forwardrt   ru   rv   rw   r   rG   rz   r   r   r   r    r     s    r   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )rV   a  This class resamples audio using the
    :class:`torchaudio resampler <torchaudio.transforms.Resample>` based on
    sinc interpolation.

    Arguments
    ---------
    orig_freq : int
        the sampling frequency of the input signal.
    new_freq : int
        the new sampling frequency after this operation is performed.
    *args
        additional arguments forwarded to the
        :class:`torchaudio.transforms.Resample` constructor
    **kwargs
        additional keyword arguments forwarded to the
        :class:`torchaudio.transforms.Resample` constructor

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> signal = signal.unsqueeze(0) # [batch, time, channels]
    >>> resampler = Resample(orig_freq=16000, new_freq=8000)
    >>> resampled = resampler(signal)
    >>> signal.shape
    torch.Size([1, 52173])
    >>> resampled.shape
    torch.Size([1, 26087])
    r   c                    s4   t    || _|| _tjj|||d|| _d S )Nr   )r   r   r   r   
torchaudio
transformsrV   rN   )r   r   r   argskwargsr   r   r    r   $  s   
zResample.__init__c                 C   s   | j | jkr|S d}t|jdkr|d}d}nt|jdkr'|dd}ntd| j|j	 | |}|r@|
d}|S |dd}|S )r~   Fr   r!   Tr&   zInput must be 2 or 3 dimensions)r   r   r4   r0   r1   r   
ValueErrorrN   rZ   r%   rT   )r   r:   
unsqueezedresampled_waveformr   r   r    rG   .  s    


zResample.forward)r   r   r   r   r   r   r    rV     s    
rV   c                       s6   e Zd ZdZ						d fdd	Zd	d
 Z  ZS )DropFreqa%  This class drops a random frequency from the signal.

    The purpose of this class is to teach models to learn to rely on all parts
    of the signal, not just a few frequency bands.

    Arguments
    ---------
    drop_freq_low : float
        The low end of frequencies that can be dropped,
        as a fraction of the sampling rate / 2.
    drop_freq_high : float
        The high end of frequencies that can be
        dropped, as a fraction of the sampling rate / 2.
    drop_freq_count_low : int
        The low end of number of frequencies that could be dropped.
    drop_freq_count_high : int
        The high end of number of frequencies that could be dropped.
    drop_freq_width : float
        The width of the frequency band to drop, as
        a fraction of the sampling_rate / 2.
    epsilon : float
        A small positive value to prevent issues such as filtering 0 Hz,
        division by zero, or other numerical instabilities. This value sets
        the absolute minimum for normalized frequencies used in the filter.
        The default value is 1e-12.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> dropper = DropFreq()
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> dropped_signal = dropper(signal.unsqueeze(0))
    r)   r!   r&   皙?-q=c                    s2   t    || _|| _|| _|| _|| _|| _d S r   )r   r   drop_freq_lowdrop_freq_highdrop_freq_count_lowdrop_freq_count_highdrop_freq_widthepsilon)r   r   r   r   r   r   r   r   r   r    r   |  s   
	
zDropFreq.__init__c                 C   s2  |  }t|jdkr|d}tj| j| jd dd}| j| j	 }t
|| | j	 j| jd}d}|d }tjd|d|jd}d|d	|d	f< |D ]}	t|	|| j|j}
t||
|}qJt|jd
krw||jd	 |jd  |jd d}t|||}t|jd
kr||jd	 |jd |jd }|dS )r~   r   r   r!   rP   lowrQ   rR   r-   e   r$   r   r&   )r/   r4   r0   r1   r2   r_   r   r   r   r   r3   r9   r   zerosr%   r   r   rZ   r   reshaperT   )r   r:   dropped_waveform
drop_count
drop_rangedrop_frequencyfilter_lengthr^   drop_filter	frequencynotch_kernelr   r   r    rG     sF   

zDropFreq.forward)r)   r!   r!   r&   r   r   r   r   r   r   r    r   Y  s    $r   c                       s8   e Zd ZdZ							d fd	d
	Zdd Z  ZS )	DropChunka  This class drops portions of the input signal.

    Using `DropChunk` as an augmentation strategy helps a models learn to rely
    on all parts of the signal, since it can't expect a given part to be
    present.

    Arguments
    ---------
    drop_length_low : int
        The low end of lengths for which to set the
        signal to zero, in samples.
    drop_length_high : int
        The high end of lengths for which to set the
        signal to zero, in samples.
    drop_count_low : int
        The low end of number of times that the signal
        can be dropped to zero.
    drop_count_high : int
        The high end of number of times that the signal
        can be dropped to zero.
    drop_start : int
        The first index for which dropping will be allowed.
    drop_end : int
        The last index for which dropping will be allowed.
    noise_factor : float
        The factor relative to average amplitude of an utterance
        to use for scaling the white noise inserted. 1 keeps
        the average amplitude the same, while 0 inserts all 0's.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.)
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> signal = signal.unsqueeze(0) # [batch, time, channels]
    >>> length = torch.ones(1)
    >>> dropped_signal = dropper(signal, length)
    >>> float(dropped_signal[:, 150])
    0.0
    r     r!   r&   r   N        c           	         s   t    || _|| _|| _|| _|| _|| _|| _||kr"t	d||kr*t	d|d urL|dkrN||kr:t	d|| }t
||| _t
||| _d S d S d S )N*Low limit must not be more than high limitr   )r   r   drop_length_lowdrop_length_highdrop_count_lowdrop_count_high
drop_startdrop_endnoise_factorr   r.   )	r   r   r   r   r   r   r   r   r   r   r   r    r     s&   

zDropChunk.__init__c                 C   s  || d  }| d}| }t||d}tj| j| jd |fd}t	|D ]}|| dkr4q+tj| j
| jd || fd}| j}	|	dk rP|	|| 7 }	| j}
|
du r[|| }
|
dk re|
|| 7 }
td|
|  }
tj|	|
d || fd}|| }| jst	|| D ]}d|||| || f< qq+d||  | j }t	|| D ] }tj|| |jd}d| | | }||||| || f< qq+|S )ad  
        Arguments
        ---------
        waveforms : torch.Tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.
        lengths : torch.Tensor
            Shape should be a single dimension, `[batch]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or
            `[batch, time, channels]`
        r!   r   r   Nr   r   r$   )rR   rS   r/   r   r1   r2   r_   r   r   ranger   r   r   r   r7   r   r3   r%   )r   r:   r;   rL   r   r=   
drop_timesir   	start_min	start_maxstartendj	noise_max	noise_vecr   r   r    rG     sR   
zDropChunk.forward)r   r   r!   r&   r   Nr   r   r   r   r   r    r     s    +"r   c                       s@   e Zd ZdZ							d fdd		Zd
d Zdd Z  ZS )FastDropChunka.  This class drops portions of the input signal. The difference with
    DropChunk is that in this case we pre-compute the dropping masks in the
    first time the forward function is called. For all the other calls, we only
    shuffle and apply them. This makes the code faster and more suitable for
    data augmentation of large batches.

    It can be used only for fixed-length sequences.

    Arguments
    ---------
    drop_length_low : int
        The low end of lengths for which to set the
        signal to zero, in samples.
    drop_length_high : int
        The high end of lengths for which to set the
        signal to zero, in samples.
    drop_count_low : int
        The low end of number of times that the signal
        can be dropped to zero.
    drop_count_high : int
        The high end of number of times that the signal
        can be dropped to zero.
    drop_start : int
        The first index for which dropping will be allowed.
    drop_end : int
        The last index for which dropping will be allowed.
    n_masks : int
        The number of precomputed masks.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> dropper = FastDropChunk(drop_start=100, drop_end=200)
    >>> signal = torch.rand(10, 250, 22)
    >>> dropped_signal = dropper(signal)
    r   r   r!   
   r   Nc           	         s   t    || _|| _|| _|| _|| _|| _|| _d| _	||kr%t
d||kr-t
d|d urO|dkrQ||kr=t
d|| }t||| _t||| _d S d S d S )NTr   r   )r   r   r   r   r   r   r   r   n_masksfirstr   r.   )	r   r   r   r   r   r   r   r   r   r   r   r    r     s(   

zFastDropChunk.__init__c                 C   s4  | j |jd k rtdtj| j | jg|jd}tj| j| j	d | j f|jd}t
| j D ]j}|| dkr6q-tj| j| jd || f|jd}| j}|dk rS|| j7 }| j}|du r]| j}|dk rf|| j7 }td||  }tj||d || f|jd}|| }	t
|| D ]}
d||||
 |	|
 f< qq-|S )a  
                Arguments
                ---------
                waveforms : torch.Tensor
                    Shape should be `[batch, time]` or `[batch, time, channels]`.
        `.
                Returns
                -------
                dropped_masks : torch.Tensor
                    Tensor of size `[n_masks, time]` with the dropped chunks. Dropped
                    regions are assigned to 0.
        r   z,n_mask cannot be smaller than the batch sizer$   r!   )r   rQ   rR   r%   Nr   )r   r0   r   r2   onessig_lenr%   r_   r   r   r   r   r   r   r   r7   )r   r:   dropped_masksr   r   r   r   r   r   r   r   r   r   r    initialize_masks  sP   

zFastDropChunk.initialize_masksc                 C   s   |  }| jr|jd | _| || _d| _t| jjd }| j|ddf | _tjd| jdd}tj	| j|
 dd| _t|jdkrV|| jd|jd  d	 }|S || jd|jd   }|S )

        Arguments
        ---------
        waveforms : torch.Tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`
        r!   Fr   NrP   r   shiftsdimsr&   r   )r/   r   r0   r   r   r   r2   randpermr_   rollitemr4   r1   )r   r:   dropped_waveforms	rand_permrand_shiftsr   r   r    rG     s*   zFastDropChunk.forward)r   r   r!   r   r   Nr   )rt   ru   rv   rw   r   r   rG   rz   r   r   r   r    r   h  s    '"Fr   c                       r   )DoClipa  This function mimics audio clipping by clamping the input tensor.
    First, it normalizes the waveforms from -1 to -1. Then, clipping is applied.
    Finally, the original amplitude is restored.

    Arguments
    ---------
    clip_low : float
        The low end of amplitudes for which to clip the signal.
    clip_high : float
        The high end of amplitudes for which to clip the signal.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> clipper = DoClip(clip_low=0.01, clip_high=0.01)
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> clipped_signal = clipper(signal.unsqueeze(0))
          ?c                       t    || _|| _d S r   )r   r   clip_low	clip_high)r   r   r   r   r   r    r   4     

zDoClip.__init__c                 C   sj   t jt |ddd\}}|| }| j| j }t jd|jdd | | j }|| |}|| | }|S )r   r!   Tr*   r$   r   )r2   r7   r8   r   r   r3   r%   r9   )r   r:   rE   rF   clipping_range
clip_valueclipped_waveformr   r   r    rG   9  s   zDoClip.forward)r   r   r   r   r   r   r    r          r   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )	RandAmpak  This function multiples the signal by a random amplitude. First, the
    signal is normalized to have amplitude between -1 and 1. Then it is
    multiplied with a random number.

    Arguments
    ---------
    amp_low : float
        The minimum amplitude multiplication factor.
    amp_high : float
        The maximum amplitude multiplication factor.

    Example
    -------
    >>> from speechbrain.dataio.dataio import read_audio
    >>> rand_amp = RandAmp(amp_low=0.25, amp_high=1.75)
    >>> signal = read_audio('tests/samples/single-mic/example1.wav')
    >>> output_signal = rand_amp(signal.unsqueeze(0))
    r         ?c                    r   r   )r   r   amp_lowamp_high)r   r   r   r   r   r    r   m  r   zRandAmp.__init__c                 C   s|   t jt |ddd\}}|| }| j| j }t j|jd |jd| | j }|d}t	|jdkr8|d}|| }|S )r   r!   Tr*   r   r$   r&   r   )
r2   r7   r8   r   r   r3   r0   r%   r1   r4   )r   r:   rE   rF   
rand_rangeampr   r   r    rG   r  s   

zRandAmp.forward)r   r   r   r   r   r   r    r   Y  r   r   c                       r   )ChannelDropa8  This function drops random channels in the multi-channel input waveform.

    Arguments
    ---------
    drop_rate : float
        The channel dropout factor

    Example
    -------
    >>> signal = torch.rand(4, 256, 8)
    >>> ch_drop = ChannelDrop(drop_rate=0.5)
    >>> output_signal = ch_drop(signal)
    皙?c                       t    || _d S r   )r   r   	drop_rate)r   r   r   r   r    r        

zChannelDrop.__init__c                 C   s:   t j|jd |jd}|| j}||dd }|S )r   r   r$   r   r!   )r2   r3   r0   r%   ger   r1   )r   r:   xchannel_maskr   r   r    rG     s   zChannelDrop.forward)r   r   r   r   r   r    r     s    r   c                       r   )ChannelSwapaX  This function randomly swaps N channels.

    Arguments
    ---------
    min_swap : int
        The minimum number of channels to swap.
    max_swap : int
        The maximum number of channels to swap.

    Example
    -------
    >>> signal = torch.rand(4, 256, 8)
    >>> ch_swap = ChannelSwap()
    >>> output_signal = ch_swap(signal)
    r   c                    sR   t    || _|| _| jdk rtd| jdk rtd| j| jk r'tdd S )Nr   zmin_swap must be  >= 0.zmax_swap must be  >= 0.zmax_swap must be  >= min_swap)r   r   min_swapmax_swapr   )r   r  r  r   r   r    r     s   


zChannelSwap.__init__c                 C   s   t |jd }t |jd }t j| j| jd dd}||jd k r^t|D ]4}|dddd|| f }|dddd|| f |dddd|| f< ||dddd|| f< q'|S |dddd|f }|S )r   r   r!   rP   r   N)r2   r   r0   r_   r  r  r   )r   r:   
rand_perm1
rand_perm2N_swapsr   store_channelr   r   r    rG     s   0zChannelSwap.forward)r   r   r   r   r   r   r    r    s    r  c                       r   )	CutCata$  This function combines segments (with equal length in time) of the time series contained in the batch.
    Proposed for EEG signals in https://doi.org/10.1016/j.neunet.2021.05.032.

    Arguments
    ---------
    min_num_segments : int
        The number of segments to combine.
    max_num_segments : int
        The maximum number of segments to combine. Default is 10.

    Example
    -------
    >>> signal = torch.ones((4, 256, 22)) * torch.arange(4).reshape((4, 1, 1,))
    >>> cutcat =  CutCat()
    >>> output_signal = cutcat(signal)
    r   r   c                    s.   t    || _|| _| j| jk rtdd S )Nz-max_num_segments must be  >= min_num_segments)r   r   min_num_segmentsmax_num_segmentsr   )r   r	  r
  r   r   r    r     s   
zCutCat.__init__c                 C   s   |j d dkr\tj|ddd}tj| j| jd dd}tjd|j d | d tjd}t	|j d d D ]&}|d dkr[|| }||d  }|dd||d	f |dd||d	f< q5|S )
r   r   r!   r   rP   r   )r   r   N.)
r0   r2   r   r_   r	  r
  linspacer   intr   )r   r:   waveforms_rollednum_segmentsidx_cutr   r   stopr   r   r    rG     s&   zCutCat.forward)r   r   r   r   r   r   r    r    s    r  r,   2   c                 C   s@  t | }t jj|dd}|| }t j| jd | jd| | }t jd|d t|jd d | jd}dt |	d|	d }	|	dddf |	dddf< t j
|	dd}
|jd d r||	ddt|jd d d f 	d}t j|	||
gdd}	n	t j|	|
gdd}	t|jd	kr|		d}	||	 }t jj|ddj}|S )
a5  Creates a sequence of pink noise (also known as 1/f). The pink noise
    is obtained by multiplying the spectrum of a white noise sequence by a
    factor (1/f^alpha).
    The alpha factor controls the decrease factor in the frequency domain
    (alpha=0 adds white noise, alpha>>0 adds low frequency noise). It is
    randomly sampled between alpha_low and alpha_high. With negative alpha this
    function generates blue noise.

    Arguments
    ---------
    waveforms : torch.Tensor
        The original waveform. It is just used to infer the shape.
    alpha_low : float
        The minimum value for the alpha spectral smoothing factor.
    alpha_high : float
        The maximum value for the alpha spectral smoothing factor.
    sample_rate : float
        The sample rate of the original signal.

    Returns
    -------
    pink_noise : torch.Tensor
        Pink noise in the shape of the input tensor.

    Example
    -------
    >>> waveforms = torch.randn(4,257,10)
    >>> noise = pink_noise_like(waveforms)
    >>> noise.shape
    torch.Size([4, 257, 10])
    r!   r'   r   r$   r   NrP   )r   r&   )r2   rx   fftr3   r0   r%   r  r  powr1   flipr5   r4   ifftreal)r:   	alpha_low
alpha_highsample_ratewhite_noisewhite_noise_fftr   alphafspectral_maskspectral_mask_upmid_elementpink_noise_fft
pink_noiser   r   r    pink_noise_like3  s>   
!

r#  c                       r   )DropBitResolutiona6  
    This class transforms a float32 tensor into a lower resolution one
    (e.g., int16, int8, float16) and then converts it back to a float32.
    This process loses information and can be used for data augmentation.

    Arguments:
    ---------
        target_dtype: str
            One of "int16", "int8", "float16". If "random", the bit resolution
            is randomly selected among the options listed above.

    Example:
        >>> dropper = DropBitResolution()
        >>> signal = torch.rand(4, 16000)
        >>> signal_dropped = dropper(signal)
    r
   c                    sf   t    || _dtjfdtjfdtjfd| _| jdkr/| j| jvr1tdt	| j
  d S d S )N      )int16int8float16r
   ztarget_dtype must be one of )r   r   target_dtyper2   r'  r(  r)  
bit_depthsr   listkeys)r   r*  r   r   r    r     s   

zDropBitResolution.__init__c                 C   s   | j dkrtt| j }| j| \}}n| j| j  \}}|tjkr:d|d  d | 	  }|| 
|}n| }d}|
tj| }|S )aC  
        Arguments:
        ---------
            float32_tensor: torch.Tensor
                Float32 tensor with shape `[batch, time]` or `[batch, time, channels]`.

        Returns:
        ---------
            torch.Tensor
                Tensor of shape `[batch, time]` or `[batch, time, channels]` (Float32)
        r
   r   r!   )r*  r
   choicer,  r+  r-  r2   r)  r8   r7   rZ   halffloat32)r   float32_tensor
random_keybitr*  r   quantized_tensordequantized_tensorr   r   r    rG     s   

zDropBitResolution.forward)r
   r   r   r   r   r    r$    s    r$  c                       r   )SignFlipa  Flip the sign of a signal.

    This module negates all the values in a tensor with a given probability.
    If the sign is not flipped, the original signal is returned
    unchanged. This technique is outlined in the paper:
    "CADDA: Class-wise Automatic Differentiable Data Augmentation for EEG Signals"
    https://arxiv.org/pdf/2106.13695

    Arguments
    ---------
    flip_prob : float
        The probability with which to flip the sign of the signal. Default is 0.5.

    Example
    -------
    >>> import torch
    >>> x = torch.tensor([1,2,3,4,5])
    >>> flip = SignFlip(flip_prob=1) # 100% chance to flip sign
    >>> flip(x)
    tensor([-1, -2, -3, -4, -5])
    r   c                    r   r   )r   r   	flip_prob)r   r7  r   r   r    r     r   zSignFlip.__init__c                 C   s   t d | jk r| S |S )az  
        Arguments
        ---------
        waveform : torch.Tensor
            Input tensor representaing waveform, shape does not matter.

        Returns
        -------
        torch.Tensor
            The output tensor with same shape as the input, where the
            sign of all values in the tensor has been flipped with
            probability `flip_prob`.

        r!   )r2   r3   r   r7  )r   r   r   r   r    rG     s   zSignFlip.forward)r   r   r   r   r   r    r6    s    r6  )r,   r,   r  )"rw   r
   r2   torch.nn.functionalr\   r]   r   r   speechbrain.dataio.dataloaderr   speechbrain.dataio.legacyr   (speechbrain.processing.signal_processingr   r   r   r   r   Moduler	   r{   r   rV   r   r   r   r   r   r   r  r  r#  r$  r6  r   r   r   r    <module>   s8    	   DTv  997&=
@SD