o
    Si                     @   sf   d dl Zd dlZd dlZd dlmZmZmZ G dd dejZG dd dejZ	G dd dejZ
dS )	    N)nnview_as_realview_as_complexc                	       sJ   e Zd ZdZddedededef fddZd	ejd
ejfddZ	  Z
S )ISTFTa  
    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
    See issue: https://github.com/pytorch/pytorch/issues/62323
    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
    The NOLA constraint is met as we trim padded samples anyway.

    Args:
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames.
        win_length (int): The size of window frame and STFT filter.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    samen_fft
hop_length
win_lengthpaddingc                    sL   t    |dvrtd|| _|| _|| _|| _t|}| 	d| d S )Ncenterr   #Padding must be 'center' or 'same'.window)
super__init__
ValueErrorr
   r   r   r	   torchhann_windowregister_buffer)selfr   r   r	   r
   r   	__class__ F/home/ubuntu/.local/lib/python3.10/site-packages/vocos/spectral_ops.pyr      s   

zISTFT.__init__specreturnc                 C   sV  | j dkrtj|| j| j| j| jddS | j dkr"| j| j d }ntd| dks0J d|j	\}}}tj
j|| jd	d
d}|| jddddf  }|d	 | j | j }tjjj|d	|fd	| jfd	| jfddddd|| f }| j d	|dd	d}	tjjj|	d	|fd	| jfd	| jfd ||  }
|
dk sJ ||
 }|S )a  
        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.

        Args:
            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
                            N is the number of frequency bins, and T is the number of time frames.

        Returns:
            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
        r   T)r   r      r      zExpected a 3D tensor as input   backward)dimnormNoutput_sizekernel_sizestrider   gdy=)r
   r   istftr   r   r	   r   r   r    shapefftirfftr   
functionalfoldsquareexpand	transposesqueezeall)r   r   padBNTifftr#   y	window_sqwindow_enveloper   r   r   forward!   s0   

zISTFT.forwardr   __name__
__module____qualname____doc__intstrr   r   Tensorr:   __classcell__r   r   r   r   r      s     r   c                       B   e Zd ZdZddedef fddZdejdejfd	d
Z	  Z
S )MDCTz
    Modified Discrete Cosine Transform (MDCT) module.

    Args:
        frame_len (int): Length of the MDCT frame.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    r   	frame_lenr
   c                    s   t    |dvrtd|| _|| _|d }|d d }ttj	|
 }| d| tdtj t| | }tdtj | t|d  | }| dt| | d	t| d S )
Nr   r   r   r   r   y             g      ?pre_twiddlepost_twiddler   r   r   r
   rG   r   
from_numpyscipysignalcosinefloatr   exppiaranger   r   rG   r
   r4   n0r   rH   rI   r   r   r   r   W   s   
&zMDCT.__init__audior   c                 C   s   | j dkrtjj|| jd | jd f}n| j dkr,tjj|| jd | jd f}ntd|d| j| jd }| jd }|| j	|j
 }tjj|t| j	|j
 dddd	|f }|t| j	|j
 td
|  }t|td S )a  
        Apply the Modified Discrete Cosine Transform (MDCT) to the input audio.

        Args:
            audio (Tensor): Input audio waveform of shape (B, T), where B is the batch size
                and T is the length of the audio.

        Returns:
            Tensor: MDCT coefficients of shape (B, L, N), where L is the number of output frames
                and N is the number of frequency bins.
        r   r   r      r   r&   r    .Nr   )r
   r   r   r+   r2   rG   r   unfoldr   r.   r(   r)   r   rH   rI   npsqrtreal)r   rU   xr4   Xresr   r   r   r:   i   s   
"
"
.$zMDCT.forwardr;   r<   r   r   r   r   rF   N   s    rF   c                       rE   )IMDCTz
    Inverse Modified Discrete Cosine Transform (IMDCT) module.

    Args:
        frame_len (int): Length of the MDCT frame.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    r   rG   r
   c                    s   t    |dvrtd|| _|| _|d }|d d }ttj	|
 }| d| tdtj | t|d  | }tdtj t|d |  |d  }| dt| | dt| d S )	Nr   r   r   r   r   y              ?rH   rI   rJ   rS   r   r   r   r      s   
&*zIMDCT.__init__r]   r   c                 C   sb  |j \}}}tj|||d f|j|jd}||dd|f< dttj|dd |d|df< tjj|t	| j
|j  dd}t|t	| j|j  t| td }|| j|j  }d	|d	 | f}tjjj|d	d|d	| jfd	| jd fd
ddddddf }	| jdkr| jd }
n| jdkr| jd }
ntd|	dd|
|
 f }	|	S )a  
        Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients.

        Args:
            X (Tensor): Input MDCT coefficients of shape (B, L, N), where B is the batch size,
                L is the number of frames, and N is the number of frequency bins.

        Returns:
            Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio.
        r   )dtypedevice.Nr&   )r&   )dimsrW   r   r"   r   r   r   rV   r   )r(   r   zerosr`   ra   conjflipr)   r6   r   rH   r.   r[   rI   rY   rZ   r   r   r+   r,   r/   rG   r
   r   )r   r]   r3   Lr4   Yr7   resultr#   rU   r2   r   r   r   r:      s.   $"0


zIMDCT.forwardr;   r<   r   r   r   r   r_      s    r_   )numpyrY   rL   r   r   r   r   Moduler   rF   r_   r   r   r   r   <module>   s    G7