o
    wid                     @   sj  d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZ d dlmZmZmZmZmZmZ d dlmZ dd	gZ			
		d1dejdeej deej deeee f dededejfddZ			d2dejdejdeej deej dedejfddZ 					d3dejdejdeej deej dedededejfdd Z!						!		d4dejdejdeej deej d"ed#ed$ee d%ed&ee dedejfd'd(Z"G d)d deeZ#		d5dejdejdeej deej dejf
d*d+Z$G d,d	 d	eeZ%		d5dejdejdeej deej dejf
d-d.Z&G d/d0 d0eeZ'dS )6    N)ListOptionalTupleUnion)make_seq_mask_like)toeplitz)LossTyping	typecheck)AudioSignalLengthsTypeLossTypeMaskType
NeuralTypeVoidType)loggingSDRLossMSELossF绽|=inputinput_lengthmaskdimkeepdimepsreturnc                 C   s   |dur|durt dt|| ddd}|| }|du r'tj| ||d}|S ||  }tj|||d}tj|||d}|||  }|S )aG  Calculate mean along dimension `dim` with optionally
    averaging only over valid samples (based on the input length).

    Args:
        input: signal, for example (B, C, T) or (B, C, D, T)
        input_length: Optional, length of each example in the batch, shape (B,)
        mask: Optional, temporal mask for each example in the batch, same shape as the input signal
        dim: dimension or dimensions to reduce
        keepdim: Whether to keep the temporal dimension
        eps: Regularization to avoid division by zero

    Returns:
        Mean over dimensions `dim`.
    N`Argument `input_length` is mutually exclusive with `mask`. Both cannot be used at the same time.r   Tlengthsliketime_dim
valid_ones)r   r   )RuntimeErrorr   	expand_astorchmeansum)r   r   r   r   r   r   r&   normalization r)   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/audio/losses/audio.pycalculate_mean   s   
r+   estimatetargetc           	      C   s   |dur|durt dt|| ddd}|| }t| | |dd|d}tt|d |dd|d}|||  }|| }|durD|| }|S )aC  Calculate optimal scale-invariant target.
    Assumes time dimension is the last dimension in the array.

    Calculate scaled target obtained by solving

        min_scale || scale * target - estimate ||^2

    for each example in batch and each channel (b, c).

    Args:
        estimate: tensor, shape (B, C, T)
        target: tensor, shape (B, C, T)
        input_length: optional, length of valid samples, shape (B,)
        mask: optional, mask for input samples, shape (B, T)
        eps: regularization constant

    Returns:
        Scaled target, shape (B, C, T)
    Nr   r   Tr   r   r   r   r      )r#   r   r$   r+   r%   abs)	r,   r-   r   r   r   estimate_dot_target
target_powscaletarget_scaledr)   r)   r*   scale_invariant_targetJ   s   
r5      ư>:0yE>filter_lengthdiag_regc                 C   s  |dur*|durt dt||k rtdt|| t|| ddd}|| }|dur6||  } || }| j}| d|d } |d|d }dt	
t	d|d  d  }tjj||d	}	tjj| |d	}
tjjt|	d |d	}tjj|	 |
 |d	}|d
d|f }|d
d|f }|dur|d  ||d  | 7  < t|}tj||}|	tjj||d	 }tjj||d	}|d
d|d f j| }|dur|| }|S )a  Calculate optimal convolution-invariant target for a given estimate.
    Assumes time dimension is the last dimension in the array.

    Calculate target filtered with a linear f obtained by solving

        min_filter || conv(filter, target) - estimate ||^2

    for each example in batch and each channel (b, c).

    Args:
        estimate: tensor, shape (B, C, T)
        target: tensor, shape (B, C, T)
        input_length: optional, length of valid samples, shape (B,)
        mask: optional, mask for input samples, shape (B, T)
        filter_length: length of the (convolutional) filter for target
        diag_reg: relative diagonal regularization for the linear system
        eps: absolute regularization for the diagonal

    Returns:
        Filtered target, shape (B, C, T)

    Reference:
        C. Boeddeker et al., Convolutive Transfer Function Invariant SDR training criteria for Multi-Channel Reverberant Speech Separation, 2021
    Nr   zoCurrent min input_length (%d) is smaller than filter_length (%d). This will result in a singular linear system.r   Tr   r/      )n.).r   )r#   r%   minr   warningr   r$   shapeviewmathceillog2fftrfftirfftr0   conjr   linalgsolve)r,   r-   r   r   r9   r:   r   input_shapen_fftTEtt_corrte_corrTTfiltT_filttarget_filtr)   r)   r*   convolution_invariant_targetz   sH   !
 rT   Tscale_invariantconvolution_invariantconvolution_filter_lengthremove_meansdr_maxc
                 C   sL  |r|rt d| j|jksJ d| j d|j d|dur4|dur'tdt|| ddd	}|| }|rL| t| |dd|	d
 } |t||dd|	d
 }|sT|r]|dkr]t| |||	d}n|rht| ||||	d}| | }
tt	|d |d|	d}tt	|
d |d|	d}|dur|d| d  |  }|||	  }dt
||	  }|S )aE  Calculate signal-to-distortion ratio per channel.

        SDR = 10 * log10( ||t||_2^2 / (||e-t||_2^2 + alpha * ||t||^2)

    where
        alpha = 10^(-sdr_max/10)

    Optionally, use scale- or convolution- invariant target signal.

    Args:
        estimate: estimated signal, shape (B, C, T)
        target: target signal, shape (B, C, T)
        input_length: Optional, length of valid samples, shape (B,)
        mask: Optional, temporal mask, shape (B, T)
        scale_invariant: Use scale invariant SDR
        convolution_invariant: Use convolution invariant SDR
        convolution_filter_length: Filter length for convolution invariant SDR
        remove_mean: If True, mean will be removed before calculating SDR
        eps: Small regularization constant

    Returns:
        SDR in dB for each channel, shape (B, C)
    zRArguments scale_invariant and convolution_invariant cannot be used simultaneously.Estimate shape () not matching target shape ()Nr   r   Tr   r.   r;   )r,   r-   r   r   )r,   r-   r   r9   r   r/   )r   r   r   
   )
ValueErrorr?   r#   r   r$   r+   r5   rT   r%   r0   log10)r,   r-   r   r   rU   rV   rW   rX   rY   r   
distortionr2   distortion_powsdrr)   r)   r*   calculate_sdr_batch   sB   #
rc   c                       s   e Zd ZdZ								ddeee  d	ed
ededee	 dedee def fddZ
edd Zedd Ze 		ddejdejdeej deej dejf
ddZ  ZS )r   aA  
    Computes signal-to-distortion ratio (SDR) loss with weighted average across channels.

    Args:
        weight: weight for SDR of each output channel, used for averaging the loss across channels. Defaults to `None` (averaging).
        reduction: batch reduction. Defaults to `mean` over the batch.
        scale_invariant: If `True`, use scale-invariant SDR. Defaults to `False`.
        remove_mean: Remove mean before calculating the loss. Defaults to `True`.
        sdr_max: Soft thresholding of the loss to SDR_max.
        eps: Small value for regularization.
    Nr&   Fr6   Tr8   weight	reductionrU   rV   rW   rX   rY   r   c	           	         s   t    |d ur9tdd |D rtd| tjt|ddds*td| t|	dd}t
d	| | d
| |  || _|dkrMtj| _ntd| d|rb|rbt| jj d|| _|| _|| _|| _|| _|| _d S )Nc                 S      g | ]}|d kqS r   r)   .0wr)   r)   r*   
<listcomp>I      z$SDRLoss.__init__.<locals>.<listcomp>(Weight must be positive! Current value: r;   r7   atol*Weight should add to one, current weight: r   Channel weight set to %srd   r&   Unexpected reduction mode .zT: arguments scale_invariant and convolution_invariant cannot be used simultaneously.)super__init__anyr^   npiscloser'   r%   tensorreshaper   inforegister_bufferre   r&   reduce	__class____name__rU   rV   rW   rX   rY   r   )	selfrd   re   rU   rV   rW   rX   rY   r   r~   r)   r*   ru   :  s0   


zSDRLoss.__init__c                 C   s>   d}t |t t |t t tdt ddt dt dddS )$Input types definitions for SDRLoss.BCrL   r   Toptionalr,   r-   r   r   )r   r   tupler   r   )r   signal_shaper)   r)   r*   input_typese  s   

zSDRLoss.input_typesc                 C      dt t diS )z%Output types definitions for SDRLoss.losselements_typer   r   r   r)   r)   r*   output_typesp     zSDRLoss.output_typesr,   r-   r   r   r   c                 C   sj   t ||||| j| j| j| j| j| jd
}| jdu r!tj	|dd}n|| j }tj
|dd}| |}| S )a  For input batch of multi-channel signals, calculate SDR between estimate and target for each channel,
        perform averaging across channels (weighting optional), and apply reduction across the batch.

        Args:
            estimate: Batch of signals, shape (B, C, T)
            target: Batch of signals, shape (B, C, T)
            input_length: Batch of lengths, shape (B,)
            mask: Batch of temporal masks for each channel, shape (B, C, T)

        Returns:
            Scalar loss.
        )
r,   r-   r   r   rU   rV   rW   rX   rY   r   Nr;   r   )rc   rU   rV   rW   rX   rY   r   rd   r%   r&   r'   r}   )r   r,   r-   r   r   rb   r)   r)   r*   forwardu  s$   


zSDRLoss.forward)Nr&   FFr6   TNr8   NN)r   
__module____qualname____doc__r   r   floatstrboolintru   propertyr   r   r
   r%   Tensorr   __classcell__r)   r)   r   r*   r   -  sZ    
	+


c                 C   s   | j |j ksJ d| j  d|j  d|dur,|durtdt|| ddd}|| }| | }| jd	kr8d}n| jd
kr@d}ntd| j  tt|d ||d}|S )a  Calculate MSE per channel.

        MSE = ||estimate - target||_2^2 / input_length

    Args:
        estimate: estimated signal, shape (B, C, T) or (B, C, D, T)
        target: target signal, shape (B, C, T) or (B, C, D, T)
        input_length: Optional, length of valid samples, shape (B,)
        mask: Optional, temporal mask, same shape as signals

    Returns:
        MSE for each channel, shape (B, C)
    rZ   r[   r\   Nr   r   Tr         r   #Unexpected dimension of the input: r/   r   r   r?   r#   r   r$   ndimr+   r%   r0   r,   r-   r   r   errr   mser)   r)   r*   calculate_mse_batch  s$   


r   c                          e Zd ZdZ			ddeee  dedef fdd	Z	e
d
d Ze
dd Ze 		ddejdejdeej deej dejf
ddZ  ZS )r   aQ  
    Computes MSE loss with weighted average across channels.

    Args:
        weight: weight for loss of each output channel, used for averaging the loss across channels. Defaults to `None` (averaging).
        reduction: batch reduction. Defaults to `mean` over the batch.
        ndim: Number of dimensions for the input signal
    Nr&   r   rd   re   r   c                    0  t    |d ur9tdd |D rtd| tjt|ddds*td| t|	dd}t
d	| | d
| |  || _|dkrMtj| _ntd| d|| _| jdkrad| _n| jdkrjd| _ntd| j t
d| jj t
d| j t
d| j t
d| j t
d| j d S )Nc                 S   rf   rg   r)   rh   r)   r)   r*   rk     rl   z$MSELoss.__init__.<locals>.<listcomp>rm   r;   r7   rn   rp   r   rq   rd   r&   rr   rs   r   r   r   r   r   DrL   Unexpected input dimension: Initialized %s with	weight:       %s	reduction:    %s	ndim:         %s	signal_shape: %srt   ru   rv   r^   rw   rx   r'   r%   ry   rz   r   r{   r|   re   r&   r}   r   r   debugr~   r   rd   r   rd   re   r   r   r)   r*   ru     2   



zMSELoss.__init__c                 C   @   t | jt t | jt t tdt ddt | jt dddS )r   r   Tr   r   r   r   r   r   r   r   r   r)   r)   r*   r     
   zMSELoss.input_typesc                 C   r   )z%Output types definitions for MSELoss.r   r   r   r   r)   r)   r*   r     r   zMSELoss.output_typesr,   r-   r   r   r   c                 C   P   t ||||d}| jdu rtj|dd}n|| j }tj|dd}| |}|S )a  For input batch of multi-channel signals, calculate SDR between estimate and target for each channel,
        perform averaging across channels (weighting optional), and apply reduction across the batch.

        Args:
            estimate: Estimate of the target signal
            target: Target signal
            input_length: Length of each example in the batch
            mask: Mask for each signal

        Returns:
            Scalar loss.
        r   Nr;   r   )r   rd   r%   r&   r'   r}   )r   r,   r-   r   r   r   r)   r)   r*   r        


zMSELoss.forwardNr&   r   r   r   r   r   r   r   r   r   r   r   ru   r   r   r   r
   r%   r   r   r   r)   r)   r   r*   r     <    
,
	
c                 C   s   | j |j ksJ d| j  d|j  d|dur,|durtdt|| ddd}|| }| | }| jd	kr8d}n| jd
kr@d}ntd| j  tt|||d}|S )a  Calculate mean absolute error (MAE) per channel.

        MAE = ||estimate - target||_1 / input_length

    Args:
        estimate: estimated signal, shape (B, C, T) or (B, C, D, T)
        target: target signal, shape (B, C, T) or (B, C, D, T)
        input_length: Optional, length of valid samples, shape (B,)
        mask: Optional, temporal mask, same shape as signals

    Returns:
        MAE for each channel, shape (B, C)
    rZ   r[   r\   Nr   r   Tr   r   r   r   r   r   r   r   r)   r)   r*   calculate_mae_batchG  s$   


r   c                       r   )MAELossak  
    Computes the mean absolute error (MAE) loss with weighted average across channels.

    Args:
        weight: weight for loss of each output channel, used for averaging the loss across channels. Defaults to `None` (averaging).
        reduction: batch reduction. Defaults to `mean` over the batch.
        ndim: Number of dimensions for the input signal
    Nr&   r   rd   re   r   c                    r   )Nc                 S   rf   rg   r)   rh   r)   r)   r*   rk     rl   z$MAELoss.__init__.<locals>.<listcomp>rm   r;   r7   rn   rp   r   rq   rd   r&   rr   rs   r   r   r   r   r   r   r   r   r   r   r   r   r   r)   r*   ru     r   zMAELoss.__init__c                 C   r   )z$Input types definitions for MAELoss.r   Tr   r   r   r   r)   r)   r*   r     r   zMAELoss.input_typesc                 C   r   )z%Output types definitions for MAELoss.r   r   r   r   r)   r)   r*   r     r   zMAELoss.output_typesr,   r-   r   r   r   c                 C   r   )a  For input batch of multi-channel signals, calculate MAE between estimate and target for each channel,
        perform averaging across channels (weighting optional), and apply reduction across the batch.

        Args:
            estimate: Estimate of the target signal
            target: Target signal
            input_length: Length of each example in the batch
            mask: Mask for each signal

        Returns:
            Scalar loss.
        r   Nr;   r   )r   rd   r%   r&   r'   r}   )r   r,   r-   r   r   maer)   r)   r*   r     r   zMAELoss.forwardr   r   r   r)   r)   r   r*   r   {  r   r   )NNr   Fr   )NNr   )NNr6   r7   r8   )NNFFr6   TNr8   r   )(rA   typingr   r   r   r   numpyrw   r%   1nemo.collections.asr.parts.preprocessing.featuresr   (nemo.collections.audio.parts.utils.audior   nemo.core.classesr   r	   r
   nemo.core.neural_typesr   r   r   r   r   r   
nemo.utilsr   __all__r   r   r   r   r+   r5   rT   rc   r   r   r   r   r   r)   r)   r)   r*   <module>   s    
/
3
e	

Q{
4q
4