o
    i                     @   s*   d dl Z d dl mZ G dd dejZdS )    N)nnc                	       sJ   e Zd ZdZddedededef fddZd	ejd
ejfddZ	  Z
S )ISTFTa  
    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
    See issue: https://github.com/pytorch/pytorch/issues/62323
    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
    The NOLA constraint is met as we trim padded samples anyway.

    Args:
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames.
        win_length (int): The size of window frame and STFT filter.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    samen_fft
hop_length
win_lengthpaddingc                    sL   t    |dvrtd|| _|| _|| _|| _t|}| 	d| d S )N)centerr   #Padding must be 'center' or 'same'.window)
super__init__
ValueErrorr   r   r   r   torchhann_windowregister_buffer)selfr   r   r   r   r   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/soprano/vocos/spectral_ops.pyr      s   

zISTFT.__init__specreturnc                 C   sv  | j dkr$d|dddf< d|dddf< tj|| j| j| j| jddS | j dkr2| j| j d }ntd	| d
ks@J d|j	\}}}tj
j|| jddd}|| jddddf  }|d | j | j }tjjj|d|fd| jfd| jfddddd|| f }| j d|ddd}	tjjj|	d|fd| jfd| jfd ||  }
|
dk sJ ||
 }|S )a  
        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.

        Args:
            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
                            N is the number of frequency bins, and T is the number of time frames.

        Returns:
            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
        r	   r   NT)r	   r      r
      zExpected a 3D tensor as input   backward)dimnorm)output_sizekernel_sizestridegdy=)r   r   istftr   r   r   r   r   r   shapefftirfftr   
functionalfoldsquareexpand	transposesqueezeall)r   r   padBNTifftr    y	window_sqwindow_enveloper   r   r   forward   s4   

zISTFT.forward)r   )__name__
__module____qualname____doc__intstrr   r   Tensorr6   __classcell__r   r   r   r   r      s     r   )r   r   Moduler   r   r   r   r   <module>   s    