o
    Si                      @   s  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl	m
Z
 z)ddlmZ de	jde	jfdd	Zde	jde	jfd
dZde	jde	jfddZW n* eyu   de	jde	jfdd	Zde	jde	jfddZde	jde	jfddZY nw ddlmZmZ G dd de
jZG dd de
jZG dd deZG dd deZG dd deZG dd deZde	jded ed!ede	jf
d"d#Z		$dJde	jd eded%ee	j d!edee	je	jf fd&d'Z de	jd(e!de	jfd)d*Z"			+dKd,ed-ed.ed/e!d0ee! d1ede	jfd2d3Z#dee$ fd4d5Z%d6Z&d7Z'd8Z(d9Z)d:Z*dLd<e$fd=d>Z+d?d@ Z,dAdB Z-dedefdCdDZ.dEedFedGe!d/e!d0e!dee	je	jf fdHdIZ/dS )MuK  
 Copyright 2019 Johns Hopkins University  (Author: Jesus Villalba)
           2021 Johns Hopkins University  (Author: Piotr Żelasko)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

This whole module is authored and contributed by Jesus Villalba,
with minor changes by Piotr Żelasko to make it more consistent with Lhotse.

It contains a PyTorch implementation of feature extractors that is very close to Kaldi's
-- notably, it differs in that the preemphasis and DC offset removal are applied in the
time, rather than frequency domain. This should not significantly affect any results, as
confirmed by Jesus.

This implementation works well with autograd and batching, and can be used neural network
layers.

Update January 2022:
These modules now expose a new API function called "online_inference" that
may be used to compute the features when the audio is streaming.
The implementation is stateless, and passes the waveform remainders
back to the user to feed them to the modules once new data becomes available.
The implementation is compatible with JIT scripting via TorchScript.
    N)ListOptionalTuple)nn)rfftxreturnc                 C   s   t | ddS Ndim)
torch_rfftr    r   P/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/features/kaldi/layers.py_rfft#      r   c                 C   s   |   d S )N   absr   r   r   r   _pow_spectrogram&   r   r   c                 C      |   S Nr   r   r   r   r   _spectrogram)      r   c                 C   s   t j| ddddS )N   FT)
normalizedonesided)torchr   r   r   r   r   r   .   s   c                 C   s   |  ddS Nr   r
   )powsumr   r   r   r   r   1   s   c                 C   s   |  dd S r   )r    r!   sqrtr   r   r   r   r   4   s   )EPSILONSecondsc                       s  e Zd ZdZdddddddd	d
edd
fdedededee dede	de
de	dede	dededdf fddZdd Zdd Zdejdeejeej f fdd Zd!ejdeejeej f fd"d#Zejj	d'd!ejd$eej deeejeej f ejf fd%d&Z  ZS )(Wav2Wina  
    Apply standard Kaldi preprocessing (dithering, removing DC offset, pre-emphasis, etc.)
    on the input waveforms and partition them into overlapping frames (of audio samples).
    Note: no feature extraction happens in here, the output is still a time-domain signal.

    Example::

        >>> x = torch.randn(1, 16000, dtype=torch.float32)
        >>> x.shape
        torch.Size([1, 16000])
        >>> t = Wav2Win()
        >>> t(x).shape
        torch.Size([1, 100, 400])

    The input is a tensor of shape ``(batch_size, num_samples)``.
    The output is a tensor of shape ``(batch_size, num_frames, window_length)``.
    When ``return_log_energy==True``, returns a tuple where the second element
    is a log-energy tensor of shape ``(batch_size, num_frames)``.
    >  皙?{Gz?NT
ףp=
?povey        Fsampling_rateframe_lengthframe_shift
pad_lengthremove_dc_offsetpreemph_coeffwindow_typedither
snip_edgesenergy_floor
raw_energyreturn_log_energyr   c                    s   t    || _|| _|| _|| _|| _|| _|| _|	| _	|
| _
|| _|| _|	r-td tt|| }|| _tt|| | _tjt||ddd| _|d u rU|n|| _| j|ksgJ d| d| d S )Nz|Setting snip_edges=True is generally incompatible with Lhotse -- you might experience mismatched duration/num_frames errors.)r2   Frequires_gradzpad_length (or fft_length) = z cannot be smaller than N = )super__init__r,   r-   r.   r0   r1   r2   r3   r4   r5   r6   r7   warningswarnintmathfloor_length_shiftr   	Parametercreate_frame_window_windowr/   )selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   N	__class__r   r   r;   P   s4   
zWav2Win.__init__c                 C   r   r   )__str__rF   r   r   r   __repr__~   r   zWav2Win.__repr__c                 C   sB   d | jj| j| j| j| j| j| j| j	| j
| j| j| j| j}|S )Nz{}(sampling_rate={}, frame_length={}, frame_shift={}, pad_length={}, remove_dc_offset={}, preemph_coeff={}, window_type={} dither={}, snip_edges={}, energy_floor={}, raw_energy={}, return_log_energy={}))formatrI   __name__r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   )rF   sr   r   r   rJ      s$   zWav2Win.__str__	x_stridedc                 C   s   | j rtj|ddd}|| }d }| jr| jrt|| j}| jdkr>tjj	j
|ddd}|| j|d d d d d df   }|| j }| j| jkrb| j| j }tjj	j
|d	d
|gdddd	}| jrn| jsnt|| j}||fS )Nr   T)r   keepdimr+   )r   r   	replicate)moder
   r   r   constantrS   value)r0   r   meanr7   r6   _get_log_energyr5   r1   r   
functionalpadrE   r/   rA   	unsqueezesqueeze)rF   rP   mu
log_energyx_offsetrZ   r   r   r   _forward_strided   s.   
$
zWav2Win._forward_stridedr   c                 C   sH   | j dkrtj|j|jd}|| j |  }t|| j| j| j}| 	|S )Nr+   device)
r3   r   randnshaperb   _get_strided_batchrA   rB   r4   r`   )rF   r   nrP   r   r   r   forward   s
   

zWav2Win.forwardcontextc                 C   s`   | j dkrtj|j|jd}|| j |  }t|| j| j|| jd\}}| 	|\}}||f|fS )z
        The same as the ``forward()`` method, except it accepts an extra argument with the
        remainder waveform from the previous call of ``online_inference()``, and returns
        a tuple of ``((frames, log_energy), remainder)``.
        r+   ra   )window_lengthwindow_shiftprev_remainderr4   )
r3   r   rc   rd   rb   _get_strided_batch_streamingrA   rB   r4   r`   )rF   r   rh   rf   rP   	remainderr^   r   r   r   online_inference   s   

zWav2Win.online_inferencer   )rN   
__module____qualname____doc__r#   r>   r$   r   boolfloatstrr;   rL   rJ   r   Tensorr   r`   rg   jitexportrn   __classcell__r   r   rH   r   r%   ;   st    	
.
$&
r%   c                       sZ  e Zd ZdZddddddddd	eddfd
edededededede	dedededededdf fddZ
edefddZedefddZedefddZedefd d!Zedefd"d#Zede	fd$d%Zedefd&d'Zd(ejd)eej dejfd*d+Zd,ejdejfd-d.Zejj	d2d,ejd/eej deejejf fd0d1Z  ZS )3Wav2FFTad  
    Apply standard Kaldi preprocessing (dithering, removing DC offset, pre-emphasis, etc.)
    on the input waveforms and compute their Short-Time Fourier Transform (STFT).
    The output is a complex-valued tensor.

    Example::

        >>> x = torch.randn(1, 16000, dtype=torch.float32)
        >>> x.shape
        torch.Size([1, 16000])
        >>> t = Wav2FFT()
        >>> t(x).shape
        torch.Size([1, 100, 257])

    The input is a tensor of shape ``(batch_size, num_samples)``.
    The output is a tensor of shape ``(batch_size, num_frames, num_fft_bins)``
    with dtype ``torch.complex64``.
    r&   r'   r(   Tr)   r*   r+   Fr,   r-   r.   round_to_power_of_twor0   r1   r2   r3   r4   r5   r6   
use_energyr   Nc                    s\   t    || _tt|| }|rt|n|| _t|||| j|||||	|
||d| _	d S )N)	r/   r0   r1   r2   r3   r4   r5   r6   r7   )
r:   r;   r{   r>   r?   r@   next_power_of_2
fft_lengthr%   wav2win)rF   r,   r-   r.   rz   r0   r1   r2   r3   r4   r5   r6   r{   rG   rH   r   r   r;      s$   
zWav2FFT.__init__c                 C      | j jS r   )r~   r,   rK   r   r   r   r,        zWav2FFT.sampling_ratec                 C   r   r   )r~   r-   rK   r   r   r   r-     r   zWav2FFT.frame_lengthc                 C   r   r   )r~   r.   rK   r   r   r   r.   !  r   zWav2FFT.frame_shiftc                 C   r   r   )r~   r0   rK   r   r   r   r0   %  r   zWav2FFT.remove_dc_offsetc                 C   r   r   )r~   r1   rK   r   r   r   r1   )  r   zWav2FFT.preemph_coeffc                 C   r   r   )r~   r2   rK   r   r   r   r2   -  r   zWav2FFT.window_typec                 C   r   r   )r~   r3   rK   r   r   r   r3   1  r   zWav2FFT.ditherrP   log_ec                 C   s0   t |}| jr|d ur||d d d d df< |S Nr   )r   r{   )rF   rP   r   Xr   r   r   r`   5  s   zWav2FFT._forward_stridedr   c                 C   s   |  |\}}| j||dS )NrP   r   )r~   r`   )rF   r   rP   r   r   r   r   rg   B  s   zWav2FFT.forwardrh   c                 C   s*   | j j||d\\}}}| j||d|fS )N)rh   r   )r~   rn   r`   )rF   r   rh   rP   r   rm   r   r   r   rn   F  s   zWav2FFT.online_inferencer   )rN   ro   rp   rq   r#   r>   r$   rr   rs   rt   r;   propertyr,   r-   r.   r0   r1   r2   r3   r   ru   r   r`   rg   rv   rw   r   rn   rx   r   r   rH   r   ry      s    	
"
ry   c                          e Zd ZdZddddddddd	eddd	fd
edededededede	dededededededdf fddZ
dejdeej dejfddZ  ZS )Wav2Speca  
    Apply standard Kaldi preprocessing (dithering, removing DC offset, pre-emphasis, etc.)
    on the input waveforms and compute their Short-Time Fourier Transform (STFT).
    The STFT is transformed either to a magnitude spectrum (``use_fft_mag=True``)
    or a power spectrum (``use_fft_mag=False``).

    Example::

        >>> x = torch.randn(1, 16000, dtype=torch.float32)
        >>> x.shape
        torch.Size([1, 16000])
        >>> t = Wav2Spec()
        >>> t(x).shape
        torch.Size([1, 100, 257])

    The input is a tensor of shape ``(batch_size, num_samples)``.
    The output is a tensor of shape ``(batch_size, num_frames, num_fft_bins)``.
    r&   r'   r(   Tr)   r*   r+   Fr,   r-   r.   rz   r0   r1   r2   r3   r4   r5   r6   r{   use_fft_magr   Nc                    B   t  j|||||||||	|
||d || _|rt| _d S t| _d S N	rz   r0   r1   r2   r3   r4   r5   r6   r{   r:   r;   r   r   _to_specr   rF   r,   r-   r.   rz   r0   r1   r2   r3   r4   r5   r6   r{   r   rH   r   r   r;   d  $   

zWav2Spec.__init__rP   r   c                 C   s:   t |}| |}| jr|d ur||d d d d df< |S r   )r   r   r{   rF   rP   r   r   pow_specr   r   r   r`     s
   
zWav2Spec._forward_stridedrN   ro   rp   rq   r#   r>   r$   rr   rs   rt   r;   r   ru   r   r`   rx   r   r   rH   r   r   P  f    	
$r   c                       r   )Wav2LogSpeca  
    Apply standard Kaldi preprocessing (dithering, removing DC offset, pre-emphasis, etc.)
    on the input waveforms and compute their Short-Time Fourier Transform (STFT).
    The STFT is transformed either to a log-magnitude spectrum (``use_fft_mag=True``)
    or a log-power spectrum (``use_fft_mag=False``).

    Example::

        >>> x = torch.randn(1, 16000, dtype=torch.float32)
        >>> x.shape
        torch.Size([1, 16000])
        >>> t = Wav2LogSpec()
        >>> t(x).shape
        torch.Size([1, 100, 257])

    The input is a tensor of shape ``(batch_size, num_samples)``.
    The output is a tensor of shape ``(batch_size, num_frames, num_fft_bins)``.
    r&   r'   r(   Tr)   r*   r+   Fr,   r-   r.   rz   r0   r1   r2   r3   r4   r5   r6   r{   r   r   Nc                    r   r   r   r   rH   r   r   r;     r   zWav2LogSpec.__init__rP   r   c                 C   sF   t |}| |}|d  }| jr!|d ur!||d d d d df< |S )NV瞯<r   )r   r   logr{   r   r   r   r   r`     s   
zWav2LogSpec._forward_stridedr   r   r   rH   r   r     r   r   c                %       s   e Zd ZdZddddddddd	edd	d	d
ddd	dfdedededededede	dededededededededededef$ fdd Z
d!ejd"eej d#ejfd$d%Z  ZS )&Wav2LogFilterBanka/  
    Apply standard Kaldi preprocessing (dithering, removing DC offset, pre-emphasis, etc.)
    on the input waveforms and compute their log-Mel filter bank energies (also known as "fbank").

    Example::

        >>> x = torch.randn(1, 16000, dtype=torch.float32)
        >>> x.shape
        torch.Size([1, 16000])
        >>> t = Wav2LogFilterBank()
        >>> t(x).shape
        torch.Size([1, 100, 80])

    The input is a tensor of shape ``(batch_size, num_samples)``.
    The output is a tensor of shape ``(batch_size, num_frames, num_filters)``.
    r&   r'   r(   Tr)   r*   r+   F      4@      yP   r,   r-   r.   rz   r0   r1   r2   r3   r4   r5   r6   r{   r   low_freq	high_freqnum_filtersnorm_filterstorchaudio_compatible_mel_scalec                    s   t  j|||||||||	|
||d || _|| _|| _|| _|| _tjt	
t	t	jjdd| _|r7t| _nt| _|rUt|| j|||d\}}t	jjj|ddddj}nt|| j||||d	}tj|dd| _d S 
Nr   Fr8   )num_binswindow_length_paddedsample_freqr   r   )r   r   rT   r   rU   )r   r}   r,   r   r   r   )r:   r;   r   r   r   r   r   r   rC   r   tensorfinfors   eps_epsr   r   r   get_mel_banksr}   rY   rZ   Tcreate_mel_scale_fb)rF   r,   r-   r.   rz   r0   r1   r2   r3   r4   r5   r6   r{   r   r   r   r   r   r   fb_rH   r   r   r;     sV   
zWav2LogFilterBank.__init__rP   r   r   c                 C   s\   t |}| |}t|| j}t|| j }| jr,|d ur,tj	|
d|fdd}|S r	   )r   r   r   matmulr   maxr   r   r{   catr[   r   r   r   r   r`   5  s   
z"Wav2LogFilterBank._forward_stridedr   r   r   rH   r   r     s    	
Gr   c                )       s   e Zd ZdZddddddddd	edd	d	d
ddd	dddfdedededededede	dedededededededededed ed!ed"ef( fd#d$Z
ed%d& Zed'd( Zd)ejd*eej d+ejfd,d-Z  ZS ).Wav2MFCCa  
    Apply standard Kaldi preprocessing (dithering, removing DC offset, pre-emphasis, etc.)
    on the input waveforms and compute their Mel-Frequency Cepstral Coefficients (MFCC).

    Example::

        >>> x = torch.randn(1, 16000, dtype=torch.float32)
        >>> x.shape
        torch.Size([1, 16000])
        >>> t = Wav2MFCC()
        >>> t(x).shape
        torch.Size([1, 100, 13])

    The input is a tensor of shape ``(batch_size, num_samples)``.
    The output is a tensor of shape ``(batch_size, num_frames, num_ceps)``.
    r&   r'   r(   Tr)   r*   r+   Fr   r            r,   r-   r.   rz   r0   r1   r2   r3   r4   r5   r6   r{   r   r   r   r   r   num_cepscepstral_lifterr   c                    s  t  j|||||||||	|
||d || _|| _|| _|| _|| _|| _|| _t	j
tttjjdd| _|r=t| _nt| _|r[t|| j|||d\}}tj	jj|ddddj}nt|| j||||d	}t	j
|dd| _t	j
| | j| jdd| _t	j
| | j| jdd| _d S r   )r:   r;   r   r   r   r   r   r   r   r   rC   r   r   r   rs   r   r   r   r   r   r   r}   rY   rZ   r   r   r   make_dct_matrix_dctmake_lifter_lifter)rF   r,   r-   r.   rz   r0   r1   r2   r3   r4   r5   r6   r{   r   r   r   r   r   r   r   r   r   r   rH   r   r   r;   W  sf   
zWav2MFCC.__init__c              	   C   s:   |dkrdS dd| t tjt j| t  d |   S )zMakes the liftering function

        Args:
          N: Number of cepstral coefficients.
          Q: Liftering parameter
        Returns:
          Liftering vector.
        r   r         ?dtype)r   sinr?   piarangeget_default_dtype)rG   Qr   r   r   r     s
   
zWav2MFCC.make_lifterc                 C   s~   t t|d}t t| }t tjt| |d  | }|d d df  dtd 9  < |tdt| 9 }|S )Nr   r   r         ?       @)r   r   rs   r[   cosr?   r   r"   )r   r   rf   kdctr   r   r   r     s   "zWav2MFCC.make_dct_matrixrP   r   r   c                 C   sv   t |}| |}t|| j}t|| j }t|| j}| j	dkr*|| j
9 }| jr9|d ur9||d d df< |S r   )r   r   r   r   r   r   r   r   r   r   r   r{   )rF   rP   r   r   r   mfccr   r   r   r`     s   


zWav2MFCC._forward_strided)rN   ro   rp   rq   r#   r>   r$   rr   rs   rt   r;   staticmethodr   r   r   ru   r   r`   rx   r   r   rH   r   r   E  s    	
R


r   waveformri   rj   r4   c                 C   s.  |   dksJ | d}| d}|r&||k rtdS d|| |  }nV||d  | }|d | | }|| }t|| d }	||	 }
t| ddd|	f d}|
dkrht| dd|
 df d}n
tjd| j| jd}tj	|| |fdd	} | 
d|| 
d | 
df}|||g}| ||S )
a  Given a waveform (2D tensor of size ``(batch_size, num_samples)``,
    it returns a 2D tensor ``(batch_size, num_frames, window_length)``
    representing how the window is shifted along the waveform. Each row is a frame.
    Args:
        waveform (torch.Tensor): Tensor of size ``(batch_size, num_samples)``
        window_size (int): Frame length
        window_shift (int): Frame shift
        snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
            in the file, and the number of frames depends on the frame_length.  If False, the number of frames
            depends only on the frame_shift, and we reflect the data at the ends.
    Returns:
        torch.Tensor: 3D tensor of size (m, ``window_size``) where each row is a frame
    r   r   r
   )r   r   r   r   Nr   )r   rb   r   )r   sizer   emptyr>   flipzerosr   rb   r   stride
as_strided)r   ri   rj   r4   
batch_sizenum_samples
num_framesnew_num_samplesnpad	npad_left
npad_rightpad_left	pad_rightstridessizesr   r   r   re     s.   


 
re   Frk   c                 C   sH  ||ksJ |   dksJ | d}|du r9|s8t|| d }t| ddd|f d}tj|| fdd} n|  dksAJ |d|ksJJ tj|| fdd} | d}|rq||k rht|ddf| fS d|| |  }	n
|| }
||
 | }	| dd|	| df }| d|| d | df}||	|g}| |||fS )a  
    A variant of _get_strided_batch that creates short frames of a batch of audio signals
    in a way suitable for streaming. It accepts a waveform, window size parameters, and
    an optional buffer of previously unused samples. It returns a pair of waveform windows tensor,
    and unused part of the waveform to be passed as ``prev_remainder`` in the next call to this
    function.

    Example usage::

        >>> # get the first buffer of audio and make frames
        >>> waveform = get_incoming_audio_from_mic()
        >>> frames, remainder = _get_strided_batch_streaming(
        ...     waveform,
        ...     window_shift=160,
        ...     window_length=200,
        ... )
        >>>
        >>> process(frames)  # do sth with the frames
        >>>
        >>> # get the next buffer and use previous remainder to make frames
        >>> waveform = get_incoming_audio_from_mic()
        >>> frames, remainder = _get_strided_batch_streaming(
        ...     waveform,
        ...     window_shift=160,
        ...     window_length=200,
        ...     prev_remainder=prev_remainder,
        ... )

    :param waveform: A waveform tensor of shape ``(batch_size, num_samples)``.
    :param window_shift: The shift between frames measured in the number of samples.
    :param window_length: The number of samples in each window (frame).
    :param prev_remainder: An optional waveform tensor of shape ``(batch_size, num_samples)``.
        Can be ``None`` which indicates the start of a recording.
    :param snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
        in the file, and the number of frames depends on the frame_length.  If False, the number of frames
        depends only on the frame_shift, and we reflect the data at the ends.
    :return: a pair of tensors with shapes ``(batch_size, num_frames, window_length)`` and
        ``(batch_size, remainder_len)``.
    r   r   Nr   r   r   r
   )	r   r   r>   r   r   r   r   r   r   )r   rj   ri   rk   r4   r   r   r   r   r   window_remainderrm   r   r   r   r   r   rl     s4   /


rl   r5   c                 C   sB   |  ddd  }|dkrt|tjt||jd}|S )zF
    Returns the log energy of size (m) for a strided_input (m,*)
    r   r
   r   r+   r   )r    r!   r   r   r   r   r?   r   )r   r5   r^   r   r   r   rX   [  s   rX   Tr   r}   r,   r   r   r   c                 C   s@  |d u s|dkr|d }|dk r|d | }t |}t |}t||| d }t td||}	tjt|d d | ftjd}
t| D ]J}|| }||d  }||d  }tt|d D ]/}|	| }||  k rn|k rn q^||kr|| ||  |
||f< q^|| ||  |
||f< q^qD|r|
tj|
ddd }
t	|
S )Nr   r   r   r   T)axiskeepdims)
lin2melnplinspacer   r>   float32ranger!   r   
from_numpy)r   r}   r,   r   r   r   mel_low_freqmel_high_freqmelfcmelsBr   left_mel
center_mel	right_meljmel_jr   r   r   r   i  s0    
r   c                   C   s   t ttttgS r   )HAMMINGHANNINGPOVEYRECTANGULARBLACKMANr   r   r   r   available_windows  s   r   hamminghanningr*   rectangularblackmanzG?r2   c                 C   s   |t krtj| ddS |tkrtj| ddddS |tkr&tj| dddS |tkr3tj| t	 dS |t
kr_dtj |  }tj| t	 d}|d	t||   d	| td| |   S td
| )z6Returns a window function with the given type and sizeF)periodicgHzG?gq=
ףp?)r   alphabetag333333?r   r   r   zInvalid window type: )r   r   hann_windowr   hamming_windowr   r    r   onesr   r   r?   r   r   r   	Exception)window_sizer2   blackman_coeffawindow_functionr   r   r   rD     s$   rD   c                 C   s   dt d| d   S )N     @r     )r   r   r   r   r   r   r        r   c                 C   s   dt | d d  S )Nr  r  r   )r   expr   r   r   r   mel2lin  r  r
  c                 C   s   | dkrdS d| d    S )z
    Returns the smallest power of 2 that is greater than x.

    Original source: TorchAudio (torchaudio/compliance/kaldi.py)
    r   r   r   )
bit_lengthr   r   r   r   r|     s   r|   r   r   r   c                 C   sP  | dksJ d|d dksJ |d }d| }|dkr ||7 }d|  kr*|k r<n nd|  k r6|kr<n n||k sEJ d ||||| }t|}t|}	|	| | d  }
t| d}|||
  }||d	 |
  }||d
 |
  }t|}t|t| d}|| ||  }|| ||  }ttdt||}||fS )a|  
    Ported from:
    https://github.com/pytorch/audio/blob/ea5de17755d657508c84c4dce8970b614008adcf/src/torchaudio/compliance/kaldi.py#L436-L511

    Returns:
        (Tensor, Tensor): The tuple consists of ``bins`` (which is
        melbank of size (``num_bins``, ``num_fft_bins``)) and ``center_freqs`` (which is
        center frequencies of bins of size (``num_bins``)).
       zMust have at least 3 mel binsr   r   r   r+   zBBad values in options: low-freq {} and high-freq {} vs. nyquist {}r   r   r   )	rM   r   r   r   r[   r
  r   r   min)r   r   r   r   r   num_fft_binsnyquistfft_bin_widthr   r   mel_freq_deltabinr   r   r   center_freqsmelup_slope
down_slopebinsr   r   r   r     s6   
r   )NF)r   NT)r*   r   )0rq   r?   r<   typingr   r   r   numpyr   r   r   	torch.fftr   r   ru   r   r   r   ImportErrorlhotse.utilsr#   r$   Moduler%   ry   r   r   r   r   r>   rr   re   rl   rs   rX   r   rt   r   r   r   r   r   r   rD   r   r
  r|   r   r   r   r   r   <module>   s     )mEGi 
4
T
%	