o
    eiEw                     @   s>  d Z ddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZmZmZ ddlmZ ddlmZ G dd dejj Z!G dd dejj Z"G dd dejj Z#de$de$fddZ%eG dd dZ&G dd dejj Z'G dd dejj Z(d!dd Z)dS )"zBasic feature pipelines.

Authors
 * Mirco Ravanelli 2020
 * Peter Plantinga 2020
 * Sarthak Yadav 2020
 * Sylvain de Langen 2024
    )	dataclass)partial)OptionalN)GaborConv1d)PCEN)GaussianLowpassPooling)DCTSTFTContextWindowDeltas
Filterbankspectral_magnitude)PERIODIC_NEIGHBORScompute_autocorr_featurescompute_gnecompute_periodic_featurescompute_spectral_features)fwd_default_precision)FilterPropertiesc                       sb   e Zd ZdZ												
				d fdd	Zeejddd Zde	fddZ
  ZS )Fbanka 
  Generate features for input to the speech pipeline.

    Arguments
    ---------
    deltas : bool (default: False)
        Whether or not to append derivatives and second derivatives
        to the features.
    context : bool (default: False)
        Whether or not to append forward and backward contexts to
        the features.
    requires_grad : bool (default: False)
        Whether to allow parameters (i.e. fbank centers and
        spreads) to update during training.
    sample_rate : int (default: 160000)
        Sampling rate for the input waveforms.
    f_min : int (default: 0)
        Lowest frequency for the Mel filters.
    f_max : int (default: None)
        Highest frequency for the Mel filters. Note that if f_max is not
        specified it will be set to sample_rate // 2.
    n_fft : int (default: 400)
        Number of samples to use in each stft.
    n_mels : int (default: 40)
        Number of Mel filters.
    filter_shape : str (default: triangular)
        Shape of the filters ('triangular', 'rectangular', 'gaussian').
    param_change_factor : float (default: 1.0)
        If freeze=False, this parameter affects the speed at which the filter
        parameters (i.e., central_freqs and bands) can be changed.  When high
        (e.g., param_change_factor=1) the filters change a lot during training.
        When low (e.g. param_change_factor=0.1) the filter parameters are more
        stable during training.
    param_rand_factor : float (default: 0.0)
        This parameter can be used to randomly change the filter parameters
        (i.e, central frequencies and bands) during training.  It is thus a
        sort of regularization. param_rand_factor=0 does not affect, while
        param_rand_factor=0.15 allows random variations within +-15% of the
        standard values of the filter parameters (e.g., if the central freq
        is 100 Hz, we can randomly change it from 85 Hz to 115 Hz).
    left_frames : int (default: 5)
        Number of frames of left context to add.
    right_frames : int (default: 5)
        Number of frames of right context to add.
    win_length : float (default: 25)
        Length (in ms) of the sliding window used to compute the STFT.
    hop_length : float (default: 10)
        Length (in ms) of the hop of the sliding window used to compute
        the STFT.

    Example
    -------
    >>> import torch
    >>> inputs = torch.randn([10, 16000])
    >>> feature_maker = Fbank()
    >>> feats = feature_maker(inputs)
    >>> feats.shape
    torch.Size([10, 101, 40])
    F>  r   N  (   
triangular      ?              
   c                    sz   t    || _|| _|| _|d u r|d }t||||d| _t|||||| |	|
|d	| _t	|d| _
t||d| _d S )N   sample_raten_fft
win_length
hop_length	r!   r"   n_melsf_minf_maxfreezefilter_shapeparam_change_factorparam_rand_factor
input_sizeleft_framesright_frames)super__init__deltascontextrequires_gradr	   compute_STFTr   compute_fbanksr   compute_deltasr
   context_window)selfr4   r5   r6   r!   r'   r(   r"   r&   r*   r+   r,   r0   r1   r#   r$   	__class__ X/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/features.pyr3   b   s8   
zFbank.__init__cast_inputsc                 C   s^   |  |}t|}| |}| jr%| |}| |}tj|||gdd}| jr-| |}|S )a  Returns a set of features generated from the input waveforms.

        Arguments
        ---------
        wav : torch.Tensor
            A batch of audio signals to transform to features.

        Returns
        -------
        fbanks : torch.Tensor
        r   dim)	r7   r   r8   r4   r9   torchcatr5   r:   )r;   wavr	   magfbanksdelta1delta2r>   r>   r?   forward   s   




zFbank.forwardreturnc                 C   s
   | j  S N)r7   get_filter_propertiesr;   r>   r>   r?   rN      s   
zFbank.get_filter_properties)FFFr   r   Nr   r   r   r   r   r   r   r   r   )__name__
__module____qualname____doc__r3   r   rD   float32rK   r   rN   __classcell__r>   r>   r<   r?   r   &   s*    =
1
r   c                       sV   e Zd ZdZ											
						d fdd	Zeejddd Z  Z	S )MFCCaW
  Generate features for input to the speech pipeline.

    Arguments
    ---------
    deltas : bool (default: True)
        Whether or not to append derivatives and second derivatives
        to the features.
    context : bool (default: True)
        Whether or not to append forward and backward contexts to
        the features.
    requires_grad : bool (default: False)
        Whether to allow parameters (i.e. fbank centers and
        spreads) to update during training.
    sample_rate : int (default: 16000)
        Sampling rate for the input waveforms.
    f_min : int (default: 0)
        Lowest frequency for the Mel filters.
    f_max : int (default: None)
        Highest frequency for the Mel filters. Note that if f_max is not
        specified it will be set to sample_rate // 2.
    n_fft : int (default: 400)
        Number of samples to use in each stft.
    n_mels : int (default: 23)
        Number of filters to use for creating filterbank.
    n_mfcc : int (default: 20)
        Number of output coefficients
    filter_shape : str (default 'triangular')
        Shape of the filters ('triangular', 'rectangular', 'gaussian').
    param_change_factor: bool (default 1.0)
        If freeze=False, this parameter affects the speed at which the filter
        parameters (i.e., central_freqs and bands) can be changed.  When high
        (e.g., param_change_factor=1) the filters change a lot during training.
        When low (e.g. param_change_factor=0.1) the filter parameters are more
        stable during training.
    param_rand_factor: float (default 0.0)
        This parameter can be used to randomly change the filter parameters
        (i.e, central frequencies and bands) during training.  It is thus a
        sort of regularization. param_rand_factor=0 does not affect, while
        param_rand_factor=0.15 allows random variations within +-15% of the
        standard values of the filter parameters (e.g., if the central freq
        is 100 Hz, we can randomly change it from 85 Hz to 115 Hz).
    left_frames : int (default 5)
        Number of frames of left context to add.
    right_frames : int (default 5)
        Number of frames of right context to add.
    win_length : float (default: 25)
        Length (in ms) of the sliding window used to compute the STFT.
    hop_length : float (default: 10)
        Length (in ms) of the hop of the sliding window used to compute
        the STFT.

    Example
    -------
    >>> import torch
    >>> inputs = torch.randn([10, 16000])
    >>> feature_maker = MFCC()
    >>> feats = feature_maker(inputs)
    >>> feats.shape
    torch.Size([10, 101, 660])
    TFr   r   Nr         r   r   r   r   r   r   c                    s   t    || _|| _|| _|d u r|d }t||||d| _t|||||| |
||d	| _t	||	d| _
t|	d| _t||d| _d S )Nr   r    r%   r.   n_outr-   r/   )r2   r3   r4   r5   r6   r	   r7   r   r8   r   compute_dctr   r9   r
   r:   )r;   r4   r5   r6   r!   r'   r(   r"   r&   n_mfccr*   r+   r,   r0   r1   r#   r$   r<   r>   r?   r3      s:   
zMFCC.__init__r@   c                 C   sh   |  |}t|}| |}| |}| jr*| |}| |}tj|||gdd}| jr2| 	|}|S )a   Returns a set of mfccs generated from the input waveforms.

        Arguments
        ---------
        wav : torch.Tensor
            A batch of audio signals to transform to features.

        Returns
        -------
        mfccs : torch.Tensor
        r   rB   )
r7   r   r8   r[   r4   r9   rD   rE   r5   r:   )r;   rF   r	   rG   rH   mfccsrI   rJ   r>   r>   r?   rK   "  s   





zMFCC.forward)TTFr   r   Nr   rW   rX   r   r   r   r   r   r   r   )
rP   rQ   rR   rS   r3   r   rD   rT   rK   rU   r>   r>   r<   r?   rV      s*    ?
4rV   c                       sl   e Zd ZdZ													dd
ededef fddZeej	ddd Z
dd Zdd Z  ZS )Leafa  
    This class implements the LEAF audio frontend from

    Neil Zeghidour, Olivier Teboul, F{'e}lix de Chaumont Quitry & Marco Tagliasacchi, "LEAF: A LEARNABLE FRONTEND
    FOR AUDIO CLASSIFICATION", in Proc. of ICLR 2021 (https://arxiv.org/abs/2101.08596)

    Arguments
    ---------
    out_channels : int
        It is the number of output channels.
    window_len: float
        length of filter window in milliseconds
    window_stride : float
        Stride factor of the filters in milliseconds
    sample_rate : int,
        Sampling rate of the input signals. It is only used for sinc_conv.
    input_shape : tuple
        Expected shape of the inputs.
    in_channels : int
        Expected number of input channels.
    min_freq : float
        Lowest possible frequency (in Hz) for a filter
    max_freq : float
        Highest possible frequency (in Hz) for a filter
    use_pcen: bool
        If True (default), a per-channel energy normalization layer is used
    learnable_pcen: bool:
        If True (default), the per-channel energy normalization layer is learnable
    use_legacy_complex: bool
        If False, torch.complex64 data type is used for gabor impulse responses
        If True, computation is performed on two real-valued torch.Tensors
    skip_transpose: bool
        If False, uses batch x time x channel convention of speechbrain.
        If True, uses batch x channel x time convention.
    n_fft: int
        Number of FFT bins

    Example
    -------
    >>> inp_tensor = torch.rand([10, 8000])
    >>> leaf = Leaf(
    ...     out_channels=40, window_len=25., window_stride=10., in_channels=1
    ... )
    >>> out_tensor = leaf(inp_tensor)
    >>> out_tensor.shape
    torch.Size([10, 50, 40])
          9@      $@r   N      N@TF   
window_lenwindow_strider!   c                    s   t    || _t|| d d }t|| d }|d u r&|d u r&td|d u r/| |}td| ||ddd|||||dd| _t| j||dd	| _	|	r]t
| jd
ddd|
ddd| _nd | _|| _d S )Ni     z.Must provide one of input_shape or in_channelsr   sameFT)out_channelsin_channelskernel_sizestridepaddingbiasr"   r!   min_freqmax_frequse_legacy_complexskip_transpose)rh   ri   rj   rp   gQ?g{Gz?       @g-q=)alphasmooth_coefdeltafloor	trainableper_channel_smooth_coefrp   )r2   r3   rg   int
ValueError_check_input_shaper   complex_convr   poolingr   compressionrp   )r;   rg   rc   rd   r!   input_shaperh   rm   rn   use_pcenlearnable_pcenro   rp   r"   window_sizer<   r>   r?   r3   m  sR   



zLeaf.__init__r@   c                 C   s   | j s	|dd}|jdk}|r|d}| |}| |}| |}t|tj	d|j
d}| jr8| |}| j sA|dd}|S )a  
        Returns the learned LEAF features

        Arguments
        ---------
        x : torch.Tensor of shape (batch, time, 1) or (batch, time)
            batch of input signals. 2d or 3d tensors are expected.

        Returns
        -------
        outputs : torch.Tensor
        re   r   gh㈵>device)rp   	transposendim	unsqueezer{   _squared_modulus_activationr|   rD   maximumtensorr   r}   )r;   xr   outputsr>   r>   r?   rK     s    





zLeaf.forwardc                 C   s8   | dd}dtjjj|d ddd }| dd}|S )Nre   r   rq   )ri   rj   r   rD   nn
functional
avg_pool1d)r;   r   outputr>   r>   r?   r     s   

z Leaf._squared_modulus_activationc                 C   s<   t |dkr
d}|S t |dkrd}|S tdtt | )z@Checks the input shape and returns the number of input channels.r   re      z"Leaf expects 2d or 3d inputs. Got )lenry   str)r;   shaperh   r>   r>   r?   rz     s   zLeaf._check_input_shape)r_   r`   r   NNra   NTTFFrb   )rP   rQ   rR   rS   floatrx   r3   r   rD   rT   rK   r   rz   rU   r>   r>   r<   r?   r^   <  s2    3
?
!r^   torL   c                 C   s,   | dksJ | | dkr| S | | | |  S )zMIf `x` cannot evenly divide `to`, round it up to the next value that
    can.r   r>   )r   r   r>   r>   r?   upalign_value  s   r   c                   @   s    e Zd ZU dZeej ed< dS )StreamingFeatureWrapperContextzQStreaming metadata for the feature extractor. Holds some past context
    frames.left_contextN)rP   rQ   rR   rS   r   rD   Tensor__annotations__r>   r>   r>   r?   r     s   
 r   c                       s   e Zd ZdZdejjdef fddZde	fddZ
de	fd	d
Zde	de	fddZdejdedejfddZdefddZdefddZ  ZS )StreamingFeatureWrappera  Wraps an arbitrary filter so that it can be used in a streaming fashion
    (i.e. on a per-chunk basis), by remembering context and making "clever" use
    of padding.

    Arguments
    ---------
    module : torch.nn.Module
        The filter to wrap; e.g. a module list that constitutes a sequential
        feature extraction pipeline.
        The module is assumed to pad its inputs, e.g. the output of a
        convolution with a stride of 1 would end up with the same frame count
        as the input.
    properties : FilterProperties
        The effective filter properties of the provided module. This is used to
        determine padding and caching.
    module
propertiesc                    s>   t    || _|| _| jjrtd| jjdkrtdd S )Nz5Causal streaming feature wrapper is not yet supportedre   z7Dilation not yet supported in streaming feature wrapper)r2   r3   r   r   causalry   dilation)r;   r   r   r<   r>   r?   r3     s   
z StreamingFeatureWrapper.__init__rL   c                 C   s   t | jjd d | jjS )zComputes the number of padding/context frames that need to be
        injected at the past and future of the input signal in the forward pass.
        re   r   )r   r   r   rj   rO   r>   r>   r?   get_required_padding  s   z,StreamingFeatureWrapper.get_required_paddingc                 C   s   |   | jj S )zdComputes the exact number of produced frames (along the time
        dimension) per input pad frame.)r   r   rj   rO   r>   r>   r?   get_output_count_per_pad_frame'  s   z6StreamingFeatureWrapper.get_output_count_per_pad_frameframes_per_chunkc                 C   s   t |  || S )au  Get the recommended number of zero chunks to inject at the end of an
        input stream depending on the filter properties of the extractor.

        The number of injected chunks is chosen to ensure that the filter has
        output frames centered on the last input frames.
        See also :meth:`~StreamingFeatureWrapper.forward`.

        Arguments
        ---------
        frames_per_chunk : int
            The number of frames per chunk, i.e. the size of the time dimension
            passed to :meth:`~StreamingFeatureWrapper.forward`.

        Returns
        -------
        Recommended number of chunks.
        )r   r   )r;   r   r>   r>   r?   !get_recommended_final_chunk_count-  s   z9StreamingFeatureWrapper.get_recommended_final_chunk_countchunkr5   c                 O   s   |   }|  }|jdu rtjj||d df}n	t|j|fd}|dd| d df |_| j|g|R i |}|dd|| df }|S )a~  Forward pass for the streaming feature wrapper.

        For the first chunk, 0-padding is inserted at the past of the input.
        For any chunk (including the first), some future frames get truncated
        and cached to be inserted as left context for the next chunk in time.

        For further explanations, see the comments in the code.

        Note that due to how the padding is implemented, you may want to call
        this with a chunk worth full of zeros (potentially more for filters with
        large windows) at the end of your input so that the final frames have a
        chance to get processed by the filter.
        See :meth:`~StreamingFeatureWrapper.get_recommended_final_chunk_count`.
        This is not really an issue when processing endless streams, but when
        processing files, it could otherwise result in truncated outputs.

        Arguments
        ---------
        chunk : torch.Tensor
            Chunk of input of shape [batch size, time]; typically a raw
            waveform. Normally, in a chunkwise streaming scenario,
            `time = (stride-1) * chunk_size` where `chunk_size` is the desired
            **output** frame count.
        context : StreamingFeatureWrapperContext
            Mutable streaming context object; should be reused for subsequent
            calls in the same streaming session.
        *extra_args : tuple
        **extra_kwargs : dict
            Args to be passed to he module.

        Returns
        -------
        torch.Tensor
            Processed chunk of shape [batch size, output frames]. This shape is
            equivalent to the shape of `module(chunk)`.
        Nr   r   re   .)	r   r   r   rD   r   r   padrE   r   )r;   r   r5   
extra_argsextra_kwargsfeat_pad_sizenum_outputs_per_padfeatsr>   r>   r?   rK   E  s   ,

zStreamingFeatureWrapper.forwardc                 C   s   | j S rM   )r   rO   r>   r>   r?   rN     s   z-StreamingFeatureWrapper.get_filter_propertiesc                 C   s   t d S rM   )r   rO   r>   r>   r?   make_streaming_context  s   z.StreamingFeatureWrapper.make_streaming_context)rP   rQ   rR   rS   rD   r   Moduler   r3   rx   r   r   r   r   r   rK   rN   r   rU   r>   r>   r<   r?   r     s    	
Ur   c                       sp   e Zd ZdZ										
	ddedededededededededef fddZdej	fddZ
  ZS )VocalFeaturesa  Estimates the vocal characteristics of a signal in four categories of features:
     * Autocorrelation-based
     * Period-based (jitter/shimmer)
     * Spectrum-based
     * MFCCs

    Arguments
    ---------
    min_f0_Hz: int
        The minimum allowed fundamental frequency, to reduce octave errors.
        Default is 80 Hz, based on human voice standard frequency range.
    max_f0_Hz: int
        The maximum allowed fundamental frequency, to reduce octave errors.
        Default is 300 Hz, based on human voice standard frequency range.
    step_size: float
        The time between analysis windows (in seconds).
    window_size: float
        The size of the analysis window (in seconds). Must be long enough
        to contain at least 4 periods at the minimum frequency.
    sample_rate: int
        The number of samples in a second.
    log_scores: bool
        Whether to represent the jitter/shimmer/hnr/gne on a log scale,
        as these features are typically close to zero.
    eps: float
        The minimum value before log transformation, default of
        1e-3 results in a maximum value of 30 dB.
    sma_neighbors: int
        Number of frames to average -- default 3
    n_mels: int (default: 23)
        Number of filters to use for creating filterbank.
    n_mfcc: int (default: 4)
        Number of output coefficients

    Example
    -------
    >>> audio = torch.rand(1, 16000)
    >>> feature_maker = VocalFeatures()
    >>> vocal_features = feature_maker(audio)
    >>> vocal_features.shape
    torch.Size([1, 96, 17])
    P   ,  {Gz?皙?r   TMbP?r   rW      	min_f0_Hz	max_f0_Hz	step_sizer   r!   
log_scoresepssma_neighborsr&   r\   c                    s   t    t|| | _t|| | _t|| | _t|| | _|| _|| _|| _	|| _
| jt | jks=J dt dt|| j|	d| _t|	|
d| _tt||d| _d S )NzNeed at least z periods in a window)r!   r"   r&   rY   )	frame_lenhop_len)r2   r3   rx   step_sampleswindow_samplesmax_lagmin_lagr!   r   r   r   r   r   r8   r   r[   r   r   )r;   r   r   r   r   r!   r   r   r   r&   r\   r<   r>   r?   r3     s*   

zVocalFeatures.__init__audioc              
   C   s  |  dks
J d|jd| j| jd}t|| j| j\}}| j| }d| }t||\}}| 	|| j}	|	
d|
dkrL|	ddd|
df }	| jr}d|j| jd  }d|j| jd  }d|j| jd  }dd|	 j| jd  }	tj| j|jd	}
ttj||
ddd }t|}| | |}tj|||||	fdd
}tj|||fdd
}| jdkrt|d| jd}|S )a  Compute voice features.

        Arguments
        ---------
        audio: torch.Tensor
            The audio signal to be converted to voice features.

        Returns
        -------
        features: torch.Tensor
            A [batch, frame, 13+n_mfcc] tensor with the following features per-frame.
             * autocorr_f0: A per-frame estimate of the f0 in Hz.
             * autocorr_hnr: harmonicity-to-noise ratio for each frame.
             * periodic_jitter: Average deviation in period length.
             * periodic_shimmer: Average deviation in amplitude per period.
             * gne: The glottal-to-noise-excitation ratio.
             * spectral_centroid: "center-of-mass" for spectral frames.
             * spectral_spread: avg distance from centroid for spectral frames.
             * spectral_skew: asymmetry of spectrum about the centroid.
             * spectral_kurtosis: tailedness of spectrum.
             * spectral_entropy: The peakiness of the spectrum.
             * spectral_flatness: The ratio of geometric mean to arithmetic mean.
             * spectral_crest: The ratio of spectral maximum to arithmetic mean.
             * spectral_flux: The 2-normed diff between successive spectral values.
             * mfcc_{0-n_mfcc}: The mel cepstral coefficients.
        r   z4Expected audio to be 2-dimensional, [batch, samples]r   )	dimensionsizestepre   Ni)minr   rB   )rC   n)rC   unfoldr   r   r   r   r   r!   r   r   r   r   clampr   log10rD   hann_windowr   absfftrfftviewr   r[   r8   stackrE   r   moving_average)r;   r   framesharmonicity	best_lagsf0hnrjittershimmergnehannspectrumspectral_featuresr]   featuresr>   r>   r?   rK     s:   


 
zVocalFeatures.forward)
r   r   r   r   r   Tr   r   rW   r   )rP   rQ   rR   rS   rx   r   boolr3   rD   r   rK   rU   r>   r>   r<   r?   r     sD    -	
'r   re   r   c                 C   s8   |  |d} |d }tjjj| ||ddd} |  |dS )a=  Computes moving average on a given dimension.

    Arguments
    ---------
    features: torch.Tensor
        The feature tensor to smooth out.
    dim: int
        The time dimension (for smoothing).
    n: int
        The number of points in the moving average

    Returns
    -------
    smoothed_features: torch.Tensor
        The features after the moving average is applied.

    Example
    -------
    >>> feats = torch.tensor([[0., 1., 0., 1., 0., 1., 0.]])
    >>> moving_average(feats)
    tensor([[0.5000, 0.3333, 0.6667, 0.3333, 0.6667, 0.3333, 0.5000]])
    r   r   re   F)ri   rk   rj   count_include_padr   )r   rC   r   r   r>   r>   r?   r   @  s   
r   )re   r   )*rS   dataclassesr   	functoolsr   typingr   rD   speechbrain.nnet.CNNr   speechbrain.nnet.normalizationr   speechbrain.nnet.poolingr   speechbrain.processing.featuresr   r	   r
   r   r   r   %speechbrain.processing.vocal_featuresr   r   r   r   r   speechbrain.utils.autocastr   !speechbrain.utils.filter_analysisr   r   r   r   rV   r^   rx   r   r   r   r   r   r>   r>   r>   r?   <module>   s4    	    ) &  