o
    }oi.B                     @   sn  d dl mZ d dlZd dlm  mZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZ G d	d
 d
eZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd  d eZ%G d!d" d"eZ&G d#d$ d$eZ'dS )%    )ListN)	rearrange)FilterbankFeatures)mask_sequence_tensor)get_mask_from_lengths)Loss	typecheck)AudioSignalLengthsTypeLossType
NeuralTypePredictionsTypeRegressionValuesTypeVoidTypec                       sJ   e Zd Zddef fddZedd Zedd Ze d	d
 Z	  Z
S )
MaskedLoss      ?
loss_scalec                    s   t t|   || _|| _d S N)superr   __init__r   loss_fn)selfr   r   	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/losses/audio_codec_loss.pyr   %   s   
zMaskedLoss.__init__c                 C   s(   t dt t dt t tdt dS )N)BDTr   	predictedtarget
target_len)r   r   r   tupler
   r   r   r   r   input_types*      

zMaskedLoss.input_typesc                 C      dt t diS Nlosselements_typer   r   r$   r   r   r   output_types2      zMaskedLoss.output_typesc                 C   sh   |j d |j d ksJ | j||d}tj|dd}tj|ddtj|dd }t|}| j| }|S )N   )inputr!      dimr   )min)shaper   torchmeansumclampr   )r   r    r!   r"   r)   r   r   r   forward8   s   

zMaskedLoss.forwardr   )__name__
__module____qualname__floatr   propertyr%   r-   r   r:   __classcell__r   r   r   r   r   $   s    

r   c                       $   e Zd Zddef fddZ  ZS )MaskedMAELossr   r   c                    &   t jjdd}tt| j||d d S Nnone)	reduction)r   r   )r6   nnL1Lossr   rC   r   r   r   r   r   r   r   r   K      zMaskedMAELoss.__init__r;   r<   r=   r>   r?   r   rA   r   r   r   r   rC   J       rC   c                       rB   )MaskedMSELossr   r   c                    rD   rE   )r6   rH   MSELossr   rN   r   rJ   r   r   r   r   Q   rK   zMaskedMSELoss.__init__r;   rL   r   r   r   r   rN   P   rM   rN   c                       sB   e Zd Z fddZedd Zedd Ze dd Z  Z	S )	TimeDomainLossc                    s   t t|   t | _d S r   )r   rP   r   rC   r   r$   r   r   r   r   W   s   zTimeDomainLoss.__init__c                 C   (   t dt t dt t tdt dS N)r   r   r   
audio_real	audio_gen	audio_lenr   r	   r#   r
   r$   r   r   r   r%   [   r&   zTimeDomainLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-   c   r.   zTimeDomainLoss.output_typesc                 C   s(   t |d}t |d}| j|||d}|S )NzB T -> B 1 T)r!   r    r"   )r   r   )r   rT   rU   rV   r)   r   r   r   r:   i   s   

zTimeDomainLoss.forward)
r<   r=   r>   r   r@   r%   r-   r   r:   rA   r   r   r   r   rP   V   s    

rP   c                	       sb   e Zd ZdZddedee dee def fddZed	d
 Z	edd Z
e dd Z  ZS )MultiResolutionMelLossa  
    Multi-resolution log mel spectrogram loss.

    Args:
        sample_rate: Sample rate of audio.
        resolutions: List of resolutions, each being 3 integers ordered [num_fft, hop_length, window_length]
        mel_dims: Dimension of mel spectrogram to compute for each resolution. Should be same length as 'resolutions'.
        log_guard: Value to add to mel spectrogram to avoid taking log of 0.
    r   sample_rateresolutionsmel_dims	log_guardc           
         s   t t|   t|t|ksJ t | _t | _tj	
 | _t||D ]\}\}}}t|||||ddd|d d d ddd}	| j|	 q$d S )Nr1   r   add        T)rY   nfiltn_window_sizen_window_striden_fftpad_to	mag_powerlog_zero_guard_typelog_zero_guard_valuemel_norm	normalizepreemphdither	use_grads)r   rX   r   lenrC   
l1_loss_fnrN   
l2_loss_fnr6   rH   
ModuleListmel_featureszipr   append)
r   rY   rZ   r[   r\   mel_dimrb   hop_lenwin_lenmel_featurer   r   r   r   |   s0   zMultiResolutionMelLoss.__init__c                 C   rQ   rR   rW   r$   r   r   r   r%      r&   z"MultiResolutionMelLoss.input_typesc                 C   s   t t dt t ddS )Nr*   )l1_lossl2_lossr,   r$   r   r   r   r-      s   

z#MultiResolutionMelLoss.output_typesc                 C   s   d}d}| j D ]&}|||d\}}|||d\}	}
|| j|	||d7 }|| j|	||d7 }q|t| j  }|t| j  }||fS )Nr^   )xseq_lenr   )rp   rm   rn   rl   )r   rT   rU   rV   rw   rx   rv   mel_realmel_real_lenmel_gen_r   r   r   r:      s   
zMultiResolutionMelLoss.forwardr;   )r<   r=   r>   __doc__intr   r?   r   r@   r%   r-   r   r:   rA   r   r   r   r   rX   q   s    (


rX   c                       sb   e Zd ZdZddee dedef fddZd	d
 Ze	dd Z
e	dd Ze dd Z  ZS )STFTLossaM  
    Log magnitude STFT loss.

    Args:
        resolution: Resolution of spectrogram, a list of 3 numbers ordered [num_fft, hop_length, window_length]
        log_guard: Value to add to magnitude spectrogram to avoid taking log of 0.
        sqrt_guard: Value to add to when computing absolute value of STFT to avoid NaN loss.
    r   h㈵>
resolutionr\   
sqrt_guardc                    sN   t t|   t | _|\| _| _| _| dt	j
| jdd || _|| _d S )NwindowF)periodic)r   r   r   rC   r   rb   
hop_length
win_lengthregister_bufferr6   hann_windowr\   r   )r   r   r\   r   r   r   r   r      s   
zSTFTLoss.__init__c                 C   sb   t j|| j| j| j| jdd}t |}t |d	d| j
 }t || j }t||}|S )NT)rb   r   r   r   return_complexr/   )r6   stftrb   r   r   r   view_as_realsqrtpowr8   r   logr\   r   )r   audiospec_lenspecspec_magspec_logr   r   r   _compute_spectrogram   s   
	
zSTFTLoss._compute_spectrogramc                 C   rQ   rR   rW   r$   r   r   r   r%      r&   zSTFTLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-         zSTFTLoss.output_typesc                 C   s>   || j  d }| j||d}| j||d}| j|||d}|S )Nr1   )r   r   r   )r   r   r   )r   rT   rU   rV   r   	spec_realspec_genr)   r   r   r   r:      s
   zSTFTLoss.forwardr   r   )r<   r=   r>   r   r   r   r?   r   r   r@   r%   r-   r   r:   rA   r   r   r   r   r      s     	

r   c                       sZ   e Zd ZdZddee dedef fddZed	d
 Zedd Z	e
 dd Z  ZS )MultiResolutionSTFTLossa[  
    Multi-resolution log magnitude STFT loss.

    Args:
        resolutions: List of resolutions, each being 3 integers ordered [num_fft, hop_length, window_length]
        log_guard: Value to add to magnitude spectrogram to avoid taking log of 0.
        sqrt_guard: Value to add to when computing absolute value of STFT to avoid NaN loss.
    r   r   rZ   r\   r   c                    s0   t t|   tj fdd|D | _d S )Nc                    s   g | ]	}t | d qS ))r   r\   r   )r   ).0r   r\   r   r   r   
<listcomp>   s    z4MultiResolutionSTFTLoss.__init__.<locals>.<listcomp>)r   r   r   r6   rH   ro   loss_fns)r   rZ   r\   r   r   r   r   r      s   
z MultiResolutionSTFTLoss.__init__c                 C   rQ   rR   rW   r$   r   r   r   r%      r&   z#MultiResolutionSTFTLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-     r   z$MultiResolutionSTFTLoss.output_typesc                 C   s4   d}| j D ]}|||||d7 }q|t| j  }|S )Nr^   rS   )r   rl   )r   rT   rU   rV   r)   r   r   r   r   r:     s
   
zMultiResolutionSTFTLoss.forwardr   )r<   r=   r>   r   r   r?   r   r@   r%   r-   r   r:   rA   r   r   r   r   r      s     	

r   c                       sN   e Zd ZdZddef fddZedd Zedd	 Ze	 d
d Z
  ZS )	SISDRLossz
    SI-SDR loss based off of torchmetrics.functional.audio.sdr.scale_invariant_signal_distortion_ratio
    with added support for masking.
    :0yE>epsilonc                       t t|   || _d S r   )r   r   r   r   )r   r   r   r   r   r        
zSISDRLoss.__init__c                 C   rQ   rR   rW   r$   r   r   r   r%     r&   zSISDRLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-   &  r   zSISDRLoss.output_typesc                 C   s   t ||d}t|d}tj|ddd| }tj|ddd| }|| }|| }|| }|| }tj|| ddd}	tj|d ddd}
|	| j |
| j  }|| }|| }tj|d dd}tj|d dd}|| j || j  }dt| }t| }|S )	N)ry   lengthszB -> B 1r   T)r3   keepdimr/   r2   
   )r   r   r6   r8   r   log10r7   )r   rT   rU   rV   masktarget_mean	pred_meanr!   predref_pred
ref_targetalphatarget_scaled
distortiontarget_scaled_powerdistortion_powerratiosi_sdrr)   r   r   r   r:   *  s&   
zSISDRLoss.forward)r   )r<   r=   r>   r   r?   r   r@   r%   r-   r   r:   rA   r   r   r   r   r     s    

r   c                       sF   e Zd ZdZ fddZedd Zedd Ze dd	 Z	  Z
S )
FeatureMatchingLossz
    Standard feature matching loss measuring the difference in the internal discriminator layer outputs
    (usually leaky relu activations) between real and generated audio, scaled down by the total number of
    discriminators and layers.
    c                    s   t t|   d S r   )r   r   r   r$   r   r   r   r   V  s   zFeatureMatchingLoss.__init__c                 C   "   t t dggt t dggdS Nr*   )
fmaps_real	fmaps_genr   r   r$   r   r   r   r%   Y     zFeatureMatchingLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-   `  r.   z FeatureMatchingLoss.output_typesc           
      C   sd   d}t ||D ]"\}}t ||D ]\}}t|| }t|t| }	||	7 }qq|t| }|S )Nr^   )rq   r6   absr7   rl   )
r   r   r   r)   	fmap_realfmap_gen	feat_realfeat_gendiff	feat_lossr   r   r   r:   f  s   
zFeatureMatchingLoss.forwardr<   r=   r>   r   r   r@   r%   r-   r   r:   rA   r   r   r   r   r   O  s    

r   c                       sH   e Zd ZdZd fdd	Zedd Zedd Ze d	d
 Z	  Z
S )RelativeFeatureMatchingLossa  
    Relative feature matching loss as described in https://arxiv.org/pdf/2210.13438.pdf.

    This is similar to standard feature matching loss, but it scales the loss by the absolute value of
    each feature averaged across time. This might be slightly different from the paper which says the
    "mean is computed over all dimensions", which could imply taking the average across both time and
    features.

    Args:
        div_guard: Value to add when dividing by mean to avoid large/NaN values.
    MbP?c                    r   r   )r   r   r   	div_guard)r   r   r   r   r   r     r   z$RelativeFeatureMatchingLoss.__init__c                 C   r   r   r   r$   r   r   r   r%     r   z'RelativeFeatureMatchingLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-     r.   z(RelativeFeatureMatchingLoss.output_typesc                 C   s   d}t ||D ]8\}}t ||D ].\}}tjt|dd}tjt|| dd}	|	|| j  }
t|
t| }
||
7 }qq|t| }|S )Nr^   r   r2   )rq   r6   r7   r   r   rl   )r   r   r   r)   r   r   r   r   	feat_meanr   r   r   r   r   r:     s   
	z#RelativeFeatureMatchingLoss.forward)r   r   r   r   r   r   r   v  s    

r   c                   @   2   e Zd Zedd Zedd Ze dd ZdS )GeneratorHingedLossc                 C      dt dt giS Ndisc_scores_genr   Cr   r   r$   r   r   r   r%        zGeneratorHingedLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-     r   z GeneratorHingedLoss.output_typesc                 C   s6   d}|D ]}|t td| 7 }q|t| }|S )Nr^   r1   )r6   r7   Frelurl   r   r   r)   disc_score_genr   r   r   r:     s
   zGeneratorHingedLoss.forwardNr<   r=   r>   r@   r%   r-   r   r:   r   r   r   r   r         

r   c                   @   r   )GeneratorSquaredLossc                 C   r   r   r   r$   r   r   r   r%     r   z GeneratorSquaredLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-     r   z!GeneratorSquaredLoss.output_typesc                 C   s4   d}|D ]}|t d| d 7 }q|t| }|S Nr^   r1   r/   )r6   r7   rl   r   r   r   r   r:     s
   zGeneratorSquaredLoss.forwardNr   r   r   r   r   r     r   r   c                   @   r   )DiscriminatorHingedLossc                 C      t dt gt dt gdS Nr   )disc_scores_realr   r   r$   r   r   r   r%        z#DiscriminatorHingedLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-     r   z$DiscriminatorHingedLoss.output_typesc                 C   s`   d}t ||D ] \}}ttd| }ttd| }||| d 7 }q|t| }|S r   )rq   r6   r7   r   r   rl   r   r   r   r)   disc_score_realr   	loss_realloss_genr   r   r   r:     s   zDiscriminatorHingedLoss.forwardNr   r   r   r   r   r         

r   c                   @   r   )DiscriminatorSquaredLossc                 C   r   r   r   r$   r   r   r   r%     r   z$DiscriminatorSquaredLoss.input_typesc                 C   r'   r(   r,   r$   r   r   r   r-     r   z%DiscriminatorSquaredLoss.output_typesc                 C   sX   d}t ||D ]\}}td| d }t|d }||| d 7 }q|t| }|S r   )rq   r6   r7   rl   r   r   r   r   r:     s   z DiscriminatorSquaredLoss.forwardNr   r   r   r   r   r     r   r   )(typingr   r6   torch.nn.functionalrH   
functionalr   einopsr   1nemo.collections.asr.parts.preprocessing.featuresr   #nemo.collections.common.parts.utilsr   (nemo.collections.tts.parts.utils.helpersr   nemo.core.classesr   r   nemo.core.neural_typesr	   r
   r   r   r   r   r   r   rC   rN   rP   rX   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s.   $&E9%;'1