o
    Si                     @   s   d dl mZ d dlZd dlmZ d dlmZmZ d dlmZm	Z	 d dl
mZ G dd dejZG d	d
 d
eZG dd deZG dd deZdS )    )OptionalN)nn)
_hz_to_mel
_mel_to_hz)IMDCTISTFT)symexpc                   @   s&   e Zd ZdZdejdejfddZdS )FourierHeadz'Base class for inverse fourier modules.xreturnc                 C   s   t d)aJ  
        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        z-Subclasses must implement the forward method.)NotImplementedError)selfr
    r   ?/home/ubuntu/.local/lib/python3.10/site-packages/vocos/heads.pyforward   s   	zFourierHead.forwardN)__name__
__module____qualname____doc__torchTensorr   r   r   r   r   r	      s    r	   c                	       sJ   e Zd ZdZddedededef fddZd	ejd
ejfddZ	  Z
S )	ISTFTHeada  
    ISTFT Head module for predicting STFT complex coefficients.

    Args:
        dim (int): Hidden dimension of the model.
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames, which should align with
                          the resolution of the input features.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    samedimn_fft
hop_lengthpaddingc                    s8   t    |d }tj||| _t||||d| _d S )N   )r   r   
win_lengthr   )super__init__r   r   Linearoutr   istft)r   r   r   r   r   out_dim	__class__r   r   r    &   s   
zISTFTHead.__init__r
   r   c                 C   sn   |  |dd}|jddd\}}t|}tj|dd}t|}t|}||d|   }| |}|S )ay  
        Forward pass of the ISTFTHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
           r   r         Y@maxy              ?)	r"   	transposechunkr   expclipcossinr#   )r   r
   magpySaudior   r   r   r   ,   s   



zISTFTHead.forward)r   )r   r   r   r   intstrr    r   r   r   __classcell__r   r   r%   r   r      s     r   c                       sX   e Zd ZdZ			ddedededee d	ef
 fd
dZde	j
de	j
fddZ  ZS )IMDCTSymExpHeada|  
    IMDCT Head module for predicting MDCT coefficients with symmetric exponential function

    Args:
        dim (int): Hidden dimension of the model.
        mdct_frame_len (int): Length of the MDCT frame.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
        sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized
                                     based on perceptual scaling. Defaults to None.
        clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
    r   NFr   mdct_frame_lenr   sample_rate
clip_audioc                    s   t    |d }t||| _t||d| _|| _|d urYt|d }t	
d||}t|}	d|	|	   }
t	  | jj|
dd W d    d S 1 sRw   Y  d S d S )Nr   	frame_lenr   r   r'   )r   r    r   r!   r"   r   imdctr=   r   r   linspacer   r+   no_gradweightmul_view)r   r   r;   r   r<   r=   r$   m_maxm_ptsf_ptsscaler%   r   r   r    U   s   

"zIMDCTSymExpHead.__init__r
   r   c                 C   sF   |  |}t|}tj|ddd}| |}| jr!tj|ddd}|S )a  
        Forward pass of the IMDCTSymExpHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        g      Yr)   minr+               ?)r"   r   r   r/   rA   r=   )r   r
   r6   r   r   r   r   m   s   

zIMDCTSymExpHead.forward)r   NF)r   r   r   r   r7   r8   r   boolr    r   r   r   r9   r   r   r%   r   r:   H   s"    r:   c                	       sJ   e Zd ZdZddedededef fdd	Zd
ej	dej	fddZ
  ZS )IMDCTCosHeadu  
    IMDCT Head module for predicting MDCT coefficients with parametrizing MDCT = exp(m) · cos(p)

    Args:
        dim (int): Hidden dimension of the model.
        mdct_frame_len (int): Length of the MDCT frame.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
        clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
    r   Fr   r;   r   r=   c                    s0   t    || _t||| _t||d| _d S )Nr>   )r   r    r=   r   r!   r"   r   rA   )r   r   r;   r   r=   r%   r   r   r       s   
zIMDCTCosHead.__init__r
   r   c                 C   s\   |  |}|jddd\}}t|jdd}| |t| }| jr,tj|ddd}|S )a|  
        Forward pass of the IMDCTCosHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        r   r(   r)   r*   rM   rN   rK   )r"   r-   r   r.   r/   rA   r0   r=   )r   r
   mr3   r6   r   r   r   r      s   
zIMDCTCosHead.forward)r   F)r   r   r   r   r7   r8   rO   r    r   r   r   r9   r   r   r%   r   rP      s     
rP   )typingr   r   r    torchaudio.functional.functionalr   r   vocos.spectral_opsr   r   vocos.modulesr   Moduler	   r   r:   rP   r   r   r   r   <module>   s    .: