o
    ei                     @   sZ   d Z ddlZddlZddlZddlmZ e  edjZ	dd Z
dd
dZdddZdS )zLibrary for computing STOI computation.
Reference: "End-to-End Waveform Utterance Enhancement for Direct Evaluation
Metrics Optimization by Fully Convolutional Neural Networks", TASLP, 2018

Authors:
    Szu-Wei, Fu 2020
    N)check_torchaudio_backendfloatc                 C   s  t d| |d }|dt|d d  }t tt|t}t 	d|| }|t 	dd| d d  }|t 	dd| d d  }t 
|t|}	tt|D ]2}
t t |||
  }|| ||
< |}t t |||
  }|| ||
< |}d|	|
||f< qT|	S )aS  Returns the 1/3 octave band matrix.

    Arguments
    ---------
    fs : int
        Sampling rate.
    nfft : int
        FFT size.
    num_bands : int
        Number of 1/3 octave bands.
    min_freq : int
        Center frequency of the lowest 1/3 octave band.

    Returns
    -------
    obm : tensor
        Octave Band Matrix.
    r      N   gr(?g       @   )torchlinspaceint
from_numpynparrayrangeastyper   powzeroslenargminsquare)fsnfft	num_bandsmin_freqfkcffreq_low	freq_highobmif_binfl_iifh_ii r"   ]/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/nnet/loss/stoi_loss.pythirdoct   s    r$   (         c              
   C   s   t t t|dt j}| dt| jd | |  	t| jd | |j
}| |t| jd | | | |  	t| jd | | |j
}t ||jd |jd  }||dddddf< ||dddddf< dt t t |d |d d t  }	t |	}
t |	|
 | dk}|dt|jd | |  	t|jd | |j
}||t|jd | | | |  	t|jd | | |j
}t ||jd |jd  }||dddddf< ||dddddf< |j
d|dd|f jd |dd|f  }|j
d|dd|f jd |dd|f  }t j|d|df |d|ddf ||dddf  j
 |||df fdd}t j|d|df |d|ddf ||dddf  j
 |||df fdd}||gS )	a  Removes silent frames from the STOI computation.

    This function can be used as a loss function for training
    with SGD-based updates.

    Arguments
    ---------
    x: torch.Tensor
        The clean (reference) waveforms.
    y: torch.Tensor
        The degraded (enhanced) waveforms.
    dyn_range: int
        Dynamic range used for mask computation.
    N: int
        Window length.
    K: int
        Step size.

    Returns
    -------
    list with 2 elements, x and y with silence removed.
    r   r   Nr      g      0@)axis)r   	unsqueezer
   r   hanningtor   r	   shapereshapeTr   log10sqrtmatmulsmallValmaxsqueezerepeatcatflatten)xy	dyn_rangeNKwX1X2Xenergy
Max_energymskY1Y2Yx_sily_silr"   r"   r#   removeSilentFrames<   sN    6$ 
6$00,,	rK   meanc                 C   s  t j| dd} t j|dd}| jd }d}d}d}tddd	d
d}d}	t |}
tj|d| j	}t
d|D ]}||dt|| | jd  f }| |dt|| | jd  f }||||}}t||\}}tjjddddd|}tjjddddd|}t t ||d }t t ||d }t|jd |d  }t d	| d}t d	| d}t
d|D ]6}|dd||| f ||d	 |d d	 ddf< |dd||| f ||d	 |d d	 ddf< qt j|dddt j|dddt  }|| }t ||||	  }|t j|ddd }|t j|dddt  }|t j|ddd }|t j|dddt  }t || }|||  |
|< q8|dkrX|
  S |
 S )a  Compute the STOI score and return -1 * that score.

    This function can be used as a loss function for training
    with SGD-based updates.

    Arguments
    ---------
    y_pred_batch : torch.Tensor
        The degraded (enhanced) waveforms.
    y_true_batch : torch.Tensor
        The clean (reference) waveforms.
    lens : torch.Tensor
        The relative lengths of the waveforms within the batch.
    reduction : str
        The type of reduction ("mean" or "batch") to use.

    Returns
    -------
    The computed STOI loss.

    Example
    -------
    >>> a = torch.sin(torch.arange(16000, dtype=torch.float32)).unsqueeze(0)
    >>> b = a + 0.001
    >>> -stoi_loss(b, a, torch.ones(1))
    tensor(0.7...)
    r)   )dimr   i>     g      .@i'  i         )r   r   r   r   g`~@r   r&   r'   r   )n_fft
win_length
hop_lengthpowerg+=NT)rM   keepdimrL   )r   r6   r.   r$   r   
torchaudio
transformsResampler-   devicer   r	   rK   Spectrogramr2   r3   normr4   minrL   sum)y_pred_batchy_true_batchlens	reduction
batch_sizer   r=   Joctave_bandcD	resamplerr   y_truey_pred
y_sil_true
y_sil_pred	stft_true	stft_predOCT_trueOCT_predMrB   rH   malphaayr;   xnyndr"   r"   r#   	stoi_loss   sf   

""46

rw   )r%   r&   r'   )rL   )__doc__numpyr   r   rV   %speechbrain.utils.torch_audio_backendr   finfoepsr4   r$   rK   rw   r"   r"   r"   r#   <module>   s    
)K