o
    ziC                     @   sz   d dl Z d dl mZ d dlZd dlmZ ddlmZ ddlmZ G dd dej	Z
G d	d
 d
ej	ZG dd dej	ZdS )    N)List)nn   )AudioSignal
STFTParamsc                       sz   e Zd ZdZddge dddddddf	d	ee d
ej	de
de
de
de
de
dedef fddZdedefddZ  ZS )MultiScaleSTFTLossa%  Computes the multi-scale STFT loss from [1].

    Parameters
    ----------
    window_lengths : List[int], optional
        Length of each window of each STFT, by default [2048, 512]
    loss_fn : typing.Callable, optional
        How to compare each loss, by default nn.L1Loss()
    clamp_eps : float, optional
        Clamp on the log magnitude, below, by default 1e-5
    mag_weight : float, optional
        Weight of raw magnitude portion of loss, by default 1.0
    log_weight : float, optional
        Weight of log magnitude portion of loss, by default 1.0
    pow : float, optional
        Power to raise magnitude to before taking log, by default 2.0
    weight : float, optional
        Weight of this loss, by default 1.0
    match_stride : bool, optional
        Whether to match the stride of convolutional layers, by default False

    References
    ----------

    1.  Engel, Jesse, Chenjie Gu, and Adam Roberts.
        "DDSP: Differentiable Digital Signal Processing."
        International Conference on Learning Representations. 2019.
          h㈵>      ?       @FNwindow_lengthsloss_fn	clamp_eps
mag_weight
log_weightpowweightmatch_stridewindow_typec
           
         sH   t     fdd|D | _|| _|| _|| _|| _|| _|| _d S )Nc                        g | ]}t ||d   dqS    )window_length
hop_lengthr   r   r   .0wr   r    O/home/ubuntu/.local/lib/python3.10/site-packages/audiotools/metrics/spectral.py
<listcomp>6       z/MultiScaleSTFTLoss.__init__.<locals>.<listcomp>)	super__init__stft_paramsr   r   r   r   r   r   )
selfr   r   r   r   r   r   r   r   r   	__class__r   r!   r%   )   s   
	
zMultiScaleSTFTLoss.__init__xyc              	   C   s   d}| j D ]D}||j|j|j ||j|j|j || j| |j| j	
| j
 |j| j	
| j
  7 }|| j| |j|j 7 }q|S )a?  Computes multi-scale STFT between an estimate and a reference
        signal.

        Parameters
        ----------
        x : AudioSignal
            Estimate signal
        y : AudioSignal
            Reference signal

        Returns
        -------
        torch.Tensor
            Multi-scale STFT loss.
                )r&   stftr   r   r   r   r   	magnitudeclampr   r   log10r   )r'   r*   r+   losssr    r    r!   forwardF   s   

zMultiScaleSTFTLoss.forward__name__
__module____qualname____doc__r   L1Lossr   inttypingCallablefloatboolstrr%   r   r3   __classcell__r    r    r(   r!   r      s>    	
r   c                       s   e Zd ZdZddgddge dddddd	d
d
gddgdfdee dee dej	de
de
de
de
de
dedee
 dee
 def fddZdedefddZ  ZS )MelSpectrogramLossa  Compute distance between mel spectrograms. Can be used
    in a multi-scale way.

    Parameters
    ----------
    n_mels : List[int]
        Number of mels per STFT, by default [150, 80],
    window_lengths : List[int], optional
        Length of each window of each STFT, by default [2048, 512]
    loss_fn : typing.Callable, optional
        How to compare each loss, by default nn.L1Loss()
    clamp_eps : float, optional
        Clamp on the log magnitude, below, by default 1e-5
    mag_weight : float, optional
        Weight of raw magnitude portion of loss, by default 1.0
    log_weight : float, optional
        Weight of log magnitude portion of loss, by default 1.0
    pow : float, optional
        Power to raise magnitude to before taking log, by default 2.0
    weight : float, optional
        Weight of this loss, by default 1.0
    match_stride : bool, optional
        Whether to match the stride of convolutional layers, by default False
       P   r	   r
   r   r   r   Fr,   Nn_melsr   r   r   r   r   r   r   r   mel_fminmel_fmaxr   c                    sZ   t     fdd|D | _|| _|| _|| _|| _|| _|| _|
| _	|| _
|| _d S )Nc                    r   r   r   r   r   r    r!   r"      r#   z/MelSpectrogramLoss.__init__.<locals>.<listcomp>)r$   r%   r&   rD   r   r   r   r   r   rE   rF   r   )r'   rD   r   r   r   r   r   r   r   r   rE   rF   r   r(   r   r!   r%   |   s   
	
zMelSpectrogramLoss.__init__r*   r+   c              	   C   s   d}t | j| j| j| jD ]Q\}}}}|j|j|jd}|j|f||d|}	|j|f||d|}
|| j	| 
|	| j| j |
| j| j  7 }|| j| 
|	|
 7 }q|S )a*  Computes mel loss between an estimate and a reference
        signal.

        Parameters
        ----------
        x : AudioSignal
            Estimate signal
        y : AudioSignal
            Reference signal

        Returns
        -------
        torch.Tensor
            Mel loss.
        r,   )r   r   r   )rE   rF   )ziprD   rE   rF   r&   r   r   r   mel_spectrogramr   r   r/   r   r   r0   r   )r'   r*   r+   r1   rD   fminfmaxr2   kwargsx_melsy_melsr    r    r!   r3      s    
zMelSpectrogramLoss.forwardr4   r    r    r(   r!   rA   b   sP    	
#rA   c                       sD   e Zd ZdZ	ddededef fdd	Zd
edefddZ  Z	S )	PhaseLossa0  Difference between phase spectrograms.

    Parameters
    ----------
    window_length : int, optional
        Length of STFT window, by default 2048
    hop_length : int, optional
        Hop length of STFT window, by default 512
    weight : float, optional
        Weight of loss, by default 1.0
    r	   r
   r   r   r   r   c                    s    t    || _t||| _d S )N)r$   r%   r   r   r&   )r'   r   r   r   r(   r    r!   r%      s   
zPhaseLoss.__init__r*   r+   c           	      C   s   | j }||j|j|j ||j|j|j |j|j }||tj k   dtj 7  < ||tjk  dtj 8  < |j	 |j
 }}|j| ||  }|| d  }|S )a.  Computes phase loss between an estimate and a reference
        signal.

        Parameters
        ----------
        x : AudioSignal
            Estimate signal
        y : AudioSignal
            Reference signal

        Returns
        -------
        torch.Tensor
            Phase loss.
        r   )r&   r-   r   r   r   phasenppir.   minmaxmean)	r'   r*   r+   r2   diffx_minx_maxweightsr1   r    r    r!   r3      s   zPhaseLoss.forward)r	   r
   r   )
r5   r6   r7   r8   r:   r=   r%   r   r3   r@   r    r    r(   r!   rN      s    rN   )r;   r   numpyrQ   torchr    r   r   Moduler   rA   rN   r    r    r    r!   <module>   s    Wa