o
    oiN4                     @   s   d dl Z d dl mZ d dlZd dlmZmZ d dlZd dlm	Z	m
Z
mZmZmZmZmZ d dlmZ dZG dd dejZd	d
 Zdd ZdS )    N)nn)unfoldpad)FSN_FRAMENUMBANDMINFREQNBETA	DYN_RANGE)thirdoctg:0yE>c                	       s|   e Zd ZdZ			ddedededef fdd	Zd
ejdejdejfddZ	e
dd Ze
dddZe
dd Z  ZS )NegSTOILossa  Negated Short Term Objective Intelligibility (STOI) metric, to be used
        as a loss function.
        Inspired from [1, 2, 3] but not exactly the same due to a different
        resampling technique. Use pystoi when evaluating your system.

    Args:
        sample_rate (int): sample rate of audio input
        use_vad (bool): Whether to use simple VAD (see Notes)
        extended (bool): Whether to compute extended version [3].
        do_resample (bool): Whether to resample audio input to `FS`

    Shapes:
        (time,) --> (1, )
        (batch, time) --> (batch, )
        (batch, n_src, time) --> (batch, n_src)

    Returns:
        torch.Tensor of shape (batch, *, ), only the time dimension has
        been reduced.

    Warnings:
        This function does not exactly match the "real" STOI metric due to a
        different resampling technique. Use pystoi when evaluating your system.

    Notes:
        `use_vad` can be set to `False` to skip the VAD for efficiency. However
        results can become substantially different compared to the "real" STOI.
        When `True` (default), results are very close but still slightly
        different due to a different resampling technique.
        Compared against mpariente/pystoi@84b1bd8.

    References
        [1] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'A Short-Time
            Objective Intelligibility Measure for Time-Frequency Weighted Noisy
            Speech', ICASSP 2010, Texas, Dallas.
        [2] C.H.Taal, R.C.Hendriks, R.Heusdens, J.Jensen 'An Algorithm for
            Intelligibility Prediction of Time-Frequency Weighted Noisy Speech',
            IEEE Transactions on Audio, Speech, and Language Processing, 2011.
        [3] Jesper Jensen and Cees H. Taal, 'An Algorithm for Predicting the
            Intelligibility of Speech Masked by Modulated Noise Maskers',
            IEEE Transactions on Audio, Speech and Language Processing, 2016.
    TFsample_rateuse_vadextendeddo_resamplec                    s   t    || _|| _|| _t| _t| _t	| _
|| _| jr*t}tjj| jtdd| _t| t | _d| j | _tt| jd dd  }tj|dd| _t|| jttd }tjt| dd| _d S )	Nsinc_interpolation)	orig_freqnew_freqresampling_method      F)requires_gradr   ) super__init__r   r   r   r	   intel_framesr
   betar   	dyn_ranger   r   
torchaudio
transformsResampleresampler   win_lennffttorch
from_numpynphanningfloatr   	Parameterwinr   r   r   OBM)selfr   r   r   r   r+   obm_mat	__class__ M/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/torch_stoi/stoi.pyr   9   s*   
"zNegSTOILoss.__init__est_targetstargetsreturnc              	   C   sR  |j |j krtd|j |j |jdkr!| |d |d d S |jdkr=|j ^ }}| |d||d||S |j d }| jrT| jtkrT| 	|}| 	|}| j
r{| ||| j| j| j| jd \}}}|jddd\}}|d	ddf }| j|| j| jdd
}| j|| j| jdd
}	| j
r||d9 }|	|d9 }		 t| j| dt  }
t| j|	 dt  }t|
dd| jfdd||
j d | jd}t|dd| jfdd||j d | jd}| j
r|d	| jd df }||dd9 }||dd9 }| jr,| |}| |}|| }|d}n[|jdddd|jddddt  }|| }d| j  d  }t!||d|  }||j"ddd }||j"ddd }||jddddt  }||jddddt  }|| }|d}|"d}| j
r|d|dt  }	 | S |"d}	 | S )a  Compute negative (E)STOI loss.

        Args:
            est_targets (torch.Tensor): Tensor containing target estimates.
            targets (torch.Tensor): Tensor containing clean targets.

        Shapes:
            (time,) --> (1, )
            (batch, time) --> (batch, )
            (batch, n_src, time) --> (batch, n_src)

        Returns:
            torch.Tensor, the batch of negative STOI loss
        zCtargets and est_targets should have the same shape, found {} and {}r   Nr   r   r   T)
descending.)overlap)r   r   kernel_sizestridepdimkeepdim
      r>   )#shapeRuntimeErrorformatndimforwardviewr   r   r   r"   r   remove_silent_framesr   r+   r#   sortstftr$   	unsqueezer%   matmulr,   abspowEPSsqrtr   r   r   rowcol_normsumnormr   minmean)r-   r3   r4   innerwav_len
batch_sizemask_x_specy_specx_toby_tobx_segy_segx_ny_n	corr_comp
norm_consty_seg_normedclip_valy_primoutputr1   r1   r2   rF   Y   s   







	  



	
	zNegSTOILoss.forwardc                 C   s,  t | ddddddf d|fd|fd}t |ddddddf d|fd|fd}||dddf 9 }||dddf 9 }dttj|dddt  }|jddd| | d	k }	|	d}	|d	dd}|d	dd}t||	}t||	}t	||}
t	||}|d	dd}|d	dd}|
||	
 fS )
a  Detects silent frames on input tensor.
        A frame is excluded if its energy is lower than max(energy) - dyn_range

        Args:
            x (torch.Tensor): batch of original speech wav file  (batch, time)
            dyn_range : Energy range to determine which frame is silent
            framelen : Window size for energy evaluation
            hop : Hop size for energy evaluation

        Returns:
            torch.BoolTensor, framewise mask.
        Nr   r8   r@   T)r=   r>   r   rA   r   )r   r%   log10rS   rO   amaxsqueezepermute_mask_audio_overlap_and_addlong)xyr   windowframelenhopx_framesy_frames
x_energiesrY   x_sily_silr1   r1   r2   rH      s(   ""




z NegSTOILoss.remove_silent_frames   c                 C   sn   |j d }t|| }t| ddddddf d|fd|fddddf }tjj||dddf  |ddS )a*  We can't use torch.stft:
        - It's buggy with center=False as it discards the last frame
        - It pads the frame left and right before taking the fft instead
        of padding right
        Instead we unfold and take rfft. This gives the same result as
        pystoi.utils.stft.
        r   Nr   r8   .r   )nr=   )rB   intr   r%   fftrfft)rp   r+   fft_sizer7   r#   rt   framesr1   r1   r2   rJ     s   
	(
"zNegSTOILoss.stftc                 C   s8   dD ]}| | j |dd } | | jd|ddt  } q| S )z4Mean/variance normalize axis 2 and 1 of input vector)r   r   TrA   r   r;   )rU   rS   rO   )rp   r=   r1   r1   r2   rQ   $  s   zNegSTOILoss.rowcol_norm)TFT)rz   )__name__
__module____qualname____doc__r|   boolr   r%   TensorrF   staticmethodrH   rJ   rQ   __classcell__r1   r1   r/   r2   r      s8    . 
 
*r   c                 C   s   | j \}}}| |  }t| d|| | d|f}|||| ||f}|dddd}||d|f}|d d d | f }||||| d |f}|jdd}|d | | }||dfd | }|S )Nr   r   r      r   )axis)rB   r   reshaperl   rR   )ru   rt   rX   
num_framesrs   segmentssignalendr1   r1   r2   rn   -  s   rn   c                 C   s   t dd t| |D }|S )Nc              
   S   s2   g | ]\}}t || d d d t||  fqS )r   )r   lenrR   ).0ximir1   r1   r2   
<listcomp>K  s   2 z_mask_audio.<locals>.<listcomp>)r%   stackzip)rp   rY   masked_audior1   r1   r2   rm   I  s   rm   )r%   r   numpyr'   torch.nn.functionalr   r   r   pystoi.stoir   r   r   r   r	   r
   r   pystoi.utilsr   rO   Moduler   rn   rm   r1   r1   r1   r2   <module>   s    $  "