o
    7wiD4                     @   sT   d Z ddlZddlZG dd dejjZG dd dejjZG dd dejjZdS )	a\  Frequency-Domain Sequential Data Augmentation Classes

This module comprises classes tailored for augmenting sequential data in the
frequency domain, such as spectrograms and mel spectrograms.
Its primary purpose is to enhance the resilience of neural models during the training process.

Authors:
- Peter Plantinga (2020)
- Mirco Ravanelli (2023)
    Nc                       s6   e Zd ZdZ						d fdd	Zd	d
 Z  ZS )SpectrogramDropa  This class drops slices of the input spectrogram.

    Using `SpectrogramDrop` as an augmentation strategy helps a models learn to rely
    on all parts of the signal, since it can't expect a given part to be
    present.

    Reference:
        https://arxiv.org/abs/1904.08779

    Arguments
    ---------
    drop_length_low : int
        The low end of lengths for which to drop the
        spectrogram, in samples.
    drop_length_high : int
        The high end of lengths for which to drop the
        signal, in samples.
    drop_count_low : int
        The low end of number of times that the signal
        can be dropped.
    drop_count_high : int
        The high end of number of times that the signal
        can be dropped.
    replace: str
        - 'zeros': Masked values are replaced with zeros.
        - 'mean': Masked values are replaced with the mean value of the spectrogram.
        - 'rand': Masked values are replaced with random numbers ranging between
                  the maximum and minimum values of the spectrogram.
        - 'cutcat': Masked values are replaced with chunks from other signals in the batch.
        - 'swap': Masked values are replaced with other chunks from the same sentence.
        - 'random_selection': A random selection among the approaches above.
    dim : int
        Corresponding dimension to mask. If dim=1, we apply time masking.
        If dim=2, we apply frequency masking.

    Example
    -------
    >>> # time-masking
    >>> drop = SpectrogramDrop(dim=1)
    >>> spectrogram = torch.rand(4, 150, 40)
    >>> print(spectrogram.shape)
    torch.Size([4, 150, 40])
    >>> out = drop(spectrogram)
    >>> print(out.shape)
    torch.Size([4, 150, 40])
    >>> # frequency-masking
    >>> drop = SpectrogramDrop(dim=2)
    >>> spectrogram = torch.rand(4, 150, 40)
    >>> print(spectrogram.shape)
    torch.Size([4, 150, 40])
    >>> out = drop(spectrogram)
    >>> print(out.shape)
    torch.Size([4, 150, 40])
                zerosc                    s~   t    || _|| _|| _|| _|| _|| _||krtd||kr'tdg d| _	| j| j	vr=tdd
| j	 d S )Nz*Low limit must not be more than high limit)r   meanrandcutcatswaprandom_selectionz(Invalid 'replace' option. Select one of z, )super__init__drop_length_lowdrop_length_highdrop_count_lowdrop_count_highreplacedim
ValueErrorreplace_optsjoin)selfr   r   r   r   r   r   	__class__ \/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/speechbrain/augment/freq_domain.pyr   I   s"   
	
zSpectrogramDrop.__init__c                 C   sf  |  dkr|d|jd |jd }|j\}}}| j dkr!|}n|}tj| j| jd d|jd}|dkr7|S tj| j| j	||f|jd
d}tjdtd||  ||f|jd	
d}tj||jd	ddd}	||	k|	|| k  }
|
jdd
}
| j dkr|

dn|

d}
| jdkrt| jdd | _| jdkr||
d}n| jdkr|  }||
|}nu| jdkr|  }|  }t|}|||  | }|
 }
d|
 | |
|  }nH| jdkrtj|ddd}|
 }
d|
 | |
|  }n+| jdkr-tjd|jd d|jd}tj|| dd}|
 }
d|
 | |
|  }|j|j S )a  
        Apply the DropChunk augmentation to the input spectrogram.

        This method randomly drops chunks of the input spectrogram to augment the data.

        Arguments
        ---------
        spectrogram : torch.Tensor
            Input spectrogram of shape `[batch, time, fea]`.

        Returns
        -------
        torch.Tensor
            Augmented spectrogram of shape `[batch, time, fea]`.
              r   r   r   lowhighsizedevicer   )r%   )r   r   Nr           r   r	   r
   shiftsdimsr   )r   viewshapetorchrandintr   r   r%   r   r   	unsqueezemaxarangeanyr   randomchoicer   masked_fill_r   detachmin	rand_likefloatrollitem)r   spectrogram
batch_sizetime_durationfea_sizeDn_masksmask_lenmask_posr0   maskr   max_spectrogrammin_spectrogramrand_spectrogramrolled_spectrogramshiftr   r   r   forwardm   s   







zSpectrogramDrop.forward)r   r   r   r   r   r   __name__
__module____qualname____doc__r   rI   __classcell__r   r   r   r   r      s    9$r   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
Warpinga  
    Apply time or frequency warping to a spectrogram.

    If `dim=1`, time warping is applied; if `dim=2`, frequency warping is applied.
    This implementation selects a center and a window length to perform warping.
    It ensures that the temporal dimension remains unchanged by upsampling or
    downsampling the affected regions accordingly.

    Reference:
        https://arxiv.org/abs/1904.08779

    Arguments
    ---------
    warp_window : int, optional
        The width of the warping window. Default is 5.
    warp_mode : str, optional
        The interpolation mode for time warping. Default is "bicubic."
    dim : int, optional
        Dimension along which to apply warping (1 for time, 2 for frequency).
        Default is 1.

    Example
    -------
    >>> # Time-warping
    >>> warp = Warping()
    >>> spectrogram = torch.rand(4, 150, 40)
    >>> print(spectrogram.shape)
    torch.Size([4, 150, 40])
    >>> out = warp(spectrogram)
    >>> print(out.shape)
    torch.Size([4, 150, 40])
    >>> # Frequency-warping
    >>> warp = Warping(dim=2)
    >>> spectrogram = torch.rand(4, 150, 40)
    >>> print(spectrogram.shape)
    torch.Size([4, 150, 40])
    >>> out = warp(spectrogram)
    >>> print(out.shape)
    torch.Size([4, 150, 40])
    r   bicubicr   c                    s    t    || _|| _|| _d S )N)r   r   warp_window	warp_moder   )r   rR   rS   r   r   r   r   r      s   

zWarping.__init__c           	      C   sX  | j dkr|dd}|j}| j}|  dkr|d}|jd }|| |kr,|j| S t||| dd }t|| || dd d }tjj	j
|ddddd|f ||jd f| jdd}tjj	j
|dddd|df || |jd f| jdd}||ddddd|f< ||dddd|df< |j| }| j dkr|dd}|S )	a9  
        Apply warping to the input spectrogram.

        Arguments
        ---------
        spectrogram : torch.Tensor
            Input spectrogram with shape `[batch, time, fea]`.

        Returns
        -------
        torch.Tensor
            Augmented spectrogram with shape `[batch, time, fea]`.
        r   r   r   r    r   NT)modealign_corners)r   	transposer+   rR   r.   r*   r,   r-   nn
functionalinterpolaterS   )	r   r;   original_sizewindowlen_originalcwleftrightr   r   r   rI     s:   






zWarping.forward)r   rQ   r   rJ   r   r   r   r   rP      s    )rP   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	RandomShifta  Shifts the input tensor by a random amount, allowing for either a time
    or frequency (or channel) shift depending on the specified axis.
    It is crucial to calibrate the minimum and maximum shifts according to the
    requirements of your specific task.
    We recommend using small shifts to preserve information integrity.
    Using large shifts may result in the loss of significant data and could
    potentially lead to misalignments with corresponding labels.

    Arguments
    ---------
    min_shift : int
        The minimum channel shift.
    max_shift : int
        The maximum channel shift.
    dim: int
        The dimension to shift.

    Example
    -------
    >>> # time shift
    >>> signal = torch.zeros(4, 100, 80)
    >>> signal[0,50,:] = 1
    >>> rand_shift =  RandomShift(dim=1, min_shift=-10, max_shift=10)
    >>> lengths = torch.tensor([0.2, 0.8, 0.9,1.0])
    >>> output_signal, lengths = rand_shift(signal,lengths)

    >>> # frequency shift
    >>> signal = torch.zeros(4, 100, 80)
    >>> signal[0,:,40] = 1
    >>> rand_shift =  RandomShift(dim=2, min_shift=-10, max_shift=10)
    >>> lengths = torch.tensor([0.2, 0.8, 0.9,1.0])
    >>> output_signal, lengths = rand_shift(signal,lengths)
    r   r   c                    s4   t    || _|| _|| _| j| jk rtdd S )Nzmax_shift must be  >= min_shift)r   r   	min_shift	max_shiftr   r   )r   rb   rc   r   r   r   r   r   j  s   
zRandomShift.__init__c                 C   sh   t j| j| jd d|jd}t j|| | jd}| jdkr0|||j| j   }t j	|ddd}||fS )aL  
        Arguments
        ---------
        waveforms : tensor
            Shape should be `[batch, time]` or `[batch, time, channels]`.
        lengths : tensor
            Shape should be a single dimension, `[batch]`.

        Returns
        -------
        Tensor of shape `[batch, time]` or `[batch, time, channels]`
        r   r    r!   r'   r&   g      ?)r6   r/   )
r,   r-   rb   rc   r%   r9   r:   r   r+   clamp)r   	waveformslengthsN_shiftsr   r   r   rI   t  s   
zRandomShift.forward)r   r   r   rJ   r   r   r   r   ra   G  s    "
ra   )rN   r2   r,   rW   Moduler   rP   ra   r   r   r   r   <module>   s     Fq