o
    ϯiV/                     @   s   d dl Z d dl mZ d dlZd dlm  mZ d dlmZ d dlm	Z	 d dlmZ G dd dej
Z
G dd	 d	ejZG d
d dejZG dd dejZG dd dejZdS )    N)List)AudioSignal
STFTParams)nnc                       sB   e Zd ZdZddedef fddZded	ef fd
dZ  Z	S )L1Lossa  L1 Loss between AudioSignals. Defaults
    to comparing ``audio_data``, but any
    attribute of an AudioSignal can be used.

    Parameters
    ----------
    attribute : str, optional
        Attribute of signal to compare, defaults to ``audio_data``.
    weight : float, optional
        Weight of this loss, defaults to 1.0.

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/distance.py
    
audio_data      ?	attributeweightc                    s"   || _ || _t jdi | d S )N )r
   r   super__init__)selfr
   r   kwargs	__class__r   B/home/ubuntu/.local/lib/python3.10/site-packages/dacvae/nn/loss.pyr      s   zL1Loss.__init__xyc                    s0   t |trt|| j}t|| j}t ||S )a  
        Parameters
        ----------
        x : AudioSignal
            Estimate AudioSignal
        y : AudioSignal
            Reference AudioSignal

        Returns
        -------
        torch.Tensor
            L1 loss between AudioSignal attributes.
        )
isinstancer   getattrr
   r   forward)r   r   r   r   r   r   r   !   s   
zL1Loss.forward)r   r	   )
__name__
__module____qualname____doc__strfloatr   r   r   __classcell__r   r   r   r   r      s    r   c                       sT   e Zd ZdZ					ddededed	ed
ef
 fddZdedefddZ	  Z
S )	SISDRLossa  
    Computes the Scale-Invariant Source-to-Distortion Ratio between a batch
    of estimated and reference audio signals or aligned features.

    Parameters
    ----------
    scaling : int, optional
        Whether to use scale-invariant (True) or
        signal-to-noise ratio (False), by default True
    reduction : str, optional
        How to reduce across the batch (either 'mean',
        'sum', or none).], by default ' mean'
    zero_mean : int, optional
        Zero mean the references and estimates before
        computing the loss, by default True
    clip_min : int, optional
        The minimum possible loss value. Helps network
        to not focus on making already good examples better, by default None
    weight : float, optional
        Weight of this loss, defaults to 1.0.

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/distance.py
    TmeanNr	   scaling	reduction	zero_meanclip_minr   c                    s,   || _ || _|| _|| _|| _t   d S N)r"   r#   r$   r%   r   r   r   )r   r"   r#   r$   r%   r   r   r   r   r   N   s   zSISDRLoss.__init__r   r   c                 C   sf  d}t |tr|j}|j}n|}|}|jd }||ddddd}||ddddd}| jrA|jddd}|jddd}nd}d}|| }	|| }
|	d jdd	| }|
|	 jdd	| }| j	rk|| 
dnd}||	 }|
| }|d jdd	}|d jdd	}d
t|| |  }| jd urtj|| jd}| jdkr| }|S | jdkr| }|S )Ng:0yE>r         T)dimkeepdim)r*   i)minr!   sum)r   r   r   shapereshapepermuter$   r!   r.   r"   	unsqueezetorchlog10r%   clampr#   )r   r   r   eps
references	estimatesnbmean_referencemean_estimate_references
_estimatesreferences_projectionreferences_on_estimatesscalee_truee_ressignalnoisesdrr   r   r   r   ]   sF   




zSISDRLoss.forward)Tr!   TNr	   )r   r   r   r   intr   r   r   r   r   r   r   r   r   r   r    5   s&    r    c                       sz   e Zd ZdZddge dddddddf	d	ee d
ej	de
de
de
de
de
dedef fddZdedefddZ  ZS )MultiScaleSTFTLossa  Computes the multi-scale STFT loss from [1].

    Parameters
    ----------
    window_lengths : List[int], optional
        Length of each window of each STFT, by default [2048, 512]
    loss_fn : typing.Callable, optional
        How to compare each loss, by default nn.L1Loss()
    clamp_eps : float, optional
        Clamp on the log magnitude, below, by default 1e-5
    mag_weight : float, optional
        Weight of raw magnitude portion of loss, by default 1.0
    log_weight : float, optional
        Weight of log magnitude portion of loss, by default 1.0
    pow : float, optional
        Power to raise magnitude to before taking log, by default 2.0
    weight : float, optional
        Weight of this loss, by default 1.0
    match_stride : bool, optional
        Whether to match the stride of convolutional layers, by default False

    References
    ----------

    1.  Engel, Jesse, Chenjie Gu, and Adam Roberts.
        "DDSP: Differentiable Digital Signal Processing."
        International Conference on Learning Representations. 2019.

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
          h㈵>r	          @FNwindow_lengthsloss_fn	clamp_eps
mag_weight
log_weightpowr   match_stridewindow_typec
           
         sH   t     fdd|D | _|| _|| _|| _|| _|| _|| _d S )Nc                        g | ]}t ||d   dqS    )window_length
hop_lengthrR   rS   r   .0wrR   rS   r   r   
<listcomp>       z/MultiScaleSTFTLoss.__init__.<locals>.<listcomp>)	r   r   stft_paramsrM   rP   rO   rN   r   rQ   )
r   rL   rM   rN   rO   rP   rQ   r   rR   rS   r   r\   r   r      s   
	
zMultiScaleSTFTLoss.__init__r   r   c              	   C   s   d}| j D ]D}||j|j|j ||j|j|j || j| |j| j	
| j
 |j| j	
| j
  7 }|| j| |j|j 7 }q|S )a?  Computes multi-scale STFT between an estimate and a reference
        signal.

        Parameters
        ----------
        x : AudioSignal
            Estimate signal
        y : AudioSignal
            Reference signal

        Returns
        -------
        torch.Tensor
            Multi-scale STFT loss.
                )r_   stftrW   rX   rS   rP   rM   	magnituder5   rN   rQ   r4   rO   )r   r   r   losssr   r   r   r      s   

zMultiScaleSTFTLoss.forwardr   r   r   r   r   r   r   rF   typingCallabler   boolr   r   r   r   r   r   r   r   r   rG      s>    !	
rG   c                       s   e Zd ZdZddgddge dddddd	d
d
gddgdfdee dee dej	de
de
de
de
de
dedee
 dee
 def fddZdedefddZ  ZS )MelSpectrogramLossaf  Compute distance between mel spectrograms. Can be used
    in a multi-scale way.

    Parameters
    ----------
    n_mels : List[int]
        Number of mels per STFT, by default [150, 80],
    window_lengths : List[int], optional
        Length of each window of each STFT, by default [2048, 512]
    loss_fn : typing.Callable, optional
        How to compare each loss, by default nn.L1Loss()
    clamp_eps : float, optional
        Clamp on the log magnitude, below, by default 1e-5
    mag_weight : float, optional
        Weight of raw magnitude portion of loss, by default 1.0
    log_weight : float, optional
        Weight of log magnitude portion of loss, by default 1.0
    pow : float, optional
        Power to raise magnitude to before taking log, by default 2.0
    weight : float, optional
        Weight of this loss, by default 1.0
    match_stride : bool, optional
        Whether to match the stride of convolutional layers, by default False

    Implementation copied from: https://github.com/descriptinc/lyrebird-audiotools/blob/961786aa1a9d628cca0c0486e5885a457fe70c1a/audiotools/metrics/spectral.py
       P   rH   rI   rJ   r	   rK   Fr`   Nn_melsrL   rM   rN   rO   rP   rQ   r   rR   mel_fminmel_fmaxrS   c                    sZ   t     fdd|D | _|| _|| _|| _|| _|| _|| _|
| _	|| _
|| _d S )Nc                    rT   rU   r   rY   r\   r   r   r]     r^   z/MelSpectrogramLoss.__init__.<locals>.<listcomp>)r   r   r_   rl   rM   rN   rP   rO   r   rm   rn   rQ   )r   rl   rL   rM   rN   rO   rP   rQ   r   rR   rm   rn   rS   r   r\   r   r     s   
	
zMelSpectrogramLoss.__init__r   r   c              	   C   s   d}t | j| j| j| jD ]Q\}}}}|j|j|jd}|j|f||d|}	|j|f||d|}
|| j	| 
|	| j| j |
| j| j  7 }|| j| 
|	|
 7 }q|S )a*  Computes mel loss between an estimate and a reference
        signal.

        Parameters
        ----------
        x : AudioSignal
            Estimate signal
        y : AudioSignal
            Reference signal

        Returns
        -------
        torch.Tensor
            Mel loss.
        r`   )rW   rX   rS   )rm   rn   )ziprl   rm   rn   r_   rW   rX   rS   mel_spectrogramrP   rM   r5   rN   rQ   r4   rO   )r   r   r   rc   rl   fminfmaxrd   r   x_melsy_melsr   r   r   r   (  s    
zMelSpectrogramLoss.forwardre   r   r   r   r   ri      sP    	
#ri   c                       s8   e Zd ZdZ fddZdd Zdd Zdd	 Z  ZS )
GANLossz
    Computes a discriminator loss, given a discriminator on
    generated waveforms/spectrograms compared to ground truth
    waveforms/spectrograms. Computes the loss for both the
    discriminator and the generator in separate functions.
    c                    s   t    || _d S r&   )r   r   discriminator)r   rv   r   r   r   r   T  s   

zGANLoss.__init__c                 C   s    |  |j}|  |j}||fS r&   )rv   r   )r   fakereald_faked_realr   r   r   r   X  s   zGANLoss.forwardc                 C   sd   |  |  |\}}d}t||D ]\}}|t|d d 7 }|td|d  d 7 }q|S )Nr   r(   r)   r'   )r   clonedetachro   r3   r!   )r   rw   rx   ry   rz   loss_dx_fakex_realr   r   r   discriminator_loss]  s   zGANLoss.discriminator_lossc           
      C   s   |  ||\}}d}|D ]}|td|d  d 7 }qd}tt|D ]!}tt|| d D ]}	|t|| |	 || |	  7 }q0q$||fS )Nr   r'   r(   r)   )r   r3   r!   rangelenFl1_lossr|   )
r   rw   rx   ry   rz   loss_gr~   loss_featureijr   r   r   generator_lossf  s   &zGANLoss.generator_loss)	r   r   r   r   r   r   r   r   r   r   r   r   r   ru   L  s    	ru   )rf   r   r3   torch.nn.functionalr   
functionalr   
audiotoolsr   r   r   Moduler    rG   ri   ru   r   r   r   r   <module>   s   ([Yc