o
    oi%                     @   s   d dl mZmZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z d dlmZ edjZdd	ed
ededefddZefd	ededededededeee ee f fddZdd ZdddZdd ZdS )    )ListTupleN)logger)Tensor)
functional)resamplefloat   xwindow_lengthstepreturnc                 C   sz   | j d | | | |f| j dd  }g }t|  D ]
}|| | q|d|d  |d | |d< | ||S )a  Returns a tensor with chunks of overlapping windows of the first dim of x.

    Args:
        x (Tensor): Input of shape [N, B, H, W]
        window_length (int): Length of each window
        step (int): Step/hop of each window w.r.t. the original signal x

    Returns:
        windowed tensor (Tensor): Output tensor with shape
            [(N - window_length + step) // step, window_length, B, H, W]
    r   r	   N)shaperangedimappendstrideinsert
as_strided)r
   r   r   r   r   i r   E/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/df/stoi.pyas_windowed   s   (r   y	dyn_rangeframelenhopepsc                    s  j d   }t|d\}}|| }	t||	ft|||	f}j \}
} | }tjd djjddd 	ddt
 t
| 	dtjddt |  }tj|ddd | | ddk fd	d
t|
D 	fdd
t|
D 	dd
 D fdd
D fdd
t|
D }fdd
t|
D }dd
 D dd
 	D 	tjjjdd  fdd
D } fdd
	D }fdd
t|
D  fdd
D dd
 t|D }dd
 t|D }t|
D ]A}ddd|f r=|| |d ||< || |d ||< ddd|f r]|| d|	  ||< || d|	  ||< q||fS )a  Remove the silent frames from each signal in the batch.

    Note:
        This implementation is based on https://github.com/mpariente/pystoi
        The overlap add code is based on https://github.com/pytorch/pytorchaudio

    Args:
        x (Tensor): Reference signal of shape [batch, samples].
        y (Tensor): Second signal of the same shape where the same frames are removed.
        dyn_range (int): Dynamic range / energy in [dB] to determin which frames to
            remove.
        framelen (int): Length for a frame that might be removed.
        hop (int): Hop between to subsequent frames.

    Returns:
        x (List[Tensor]): x without silent frames. Since each signal might have a
            different number of removed frames, the resulting signals will have
            different lengths. Thus a batch is return as list of tensors.
        y (List[Tensor]): y without silent frames.
    r	      Fperiodicdevicedtype   )r   r   c                    .   g | ]}d |f  d |f d qS .r#   masked_selectview.0r   )r   maskx_wr   r   
<listcomp>Q      . z(remove_silent_frames.<locals>.<listcomp>c                    r%   r&   r'   r*   )r   r,   y_wr   r   r.   R   r/   c                 S   s   g | ]}|j d  qS r   r   r+   r
   r   r   r   r.   S   s    c                    s"   g | ]}|j d  d    qS )r   r	   r2   r3   )r   r   r   r   r.   U   s   " c                        g | ]}t j | jd qS )r!   torchzerosr!   r*   n_no_silr
   r   r   r.   V        c                    r4   r5   r6   r*   r9   r   r   r.   W   r;   c                 S      g | ]	}|  d qS r1   t	unsqueezer3   r   r   r   r.   Y       c                 S   r<   r1   r=   r+   r   r   r   r   r.   Z   r@   r!   r"   c                        g | ]}t j| d  qS )r   Fconv_transpose1dsqueezer3   eyer   r   r   r.   ]   r;   c                    rC   rD   rE   rA   rI   r   r   r.   ^   r;   c                    s$   g | ]} d  | fdqS )r	   r   )repeatr?   r*   )
n_no_sil_wwr   r   r.   `   s   $ c                    rC   rD   rE   )r+   w_rI   r   r   r.   b   r;   c                 S      g | ]\}}|| qS r   r   )r+   r
   rM   r   r   r   r.   c       c                 S   rO   r   r   )r+   r   rM   r   r   r   r.   d   rP   N)r   divmodrF   padr>   r7   hann_windowr!   r"   r)   r   log10normnpsqrtmaxr?   r   rJ   zip)r
   r   r   r   r   r   rR   	pad_frontmodpad_endB_
x_energiesx_no_sily_no_silr   r   )
rJ   r   r   r,   r:   rL   rM   r
   r-   r0   r   remove_silent_frames#   sJ   
$$rb   c                 C   s  t d| |d }|dt|d d  }t t|t}t d|| }|t dd| d d  }|t dd| d d  }t |t	|f}	tt	|D ]2}
t 
t |||
  }|| ||
< |}t 
t |||
  }|| ||
< |}d|	|
||f< qR|	|fS )a{  Returns the 1/3 octave band matrix and its center frequencies
    # Arguments :
        fs : sampling rate
        nfft : FFT size
        num_bands : number of 1/3 octave bands
        min_freq : center frequency of the lowest 1/3 octave band
    # Returns :
        obm : Octave Band Matrix
        cf : center frequencies
    # Credit: https://github.com/mpariente/pystoi
    r   r	   Nr   gr(?g       @   )rV   linspaceintarrayr   astyper   powerr8   lenargminsquare)fsnfft	num_bandsmin_freqfkcffreq_low	freq_highobmr   f_binfl_iifh_iir   r   r   thirdoctq   s    ry   Tc           	   	   C   s   |d u r|d }t j|d| j| jddd }|| }t| |d |d f} t j| ||||ddd}|r@|| d	  }|S )Nr   Fr   r	   r#   )centerreturn_complex)
r7   rS   r!   r"   rF   rR   stftsumpowrW   )	r
   win_sizefft_sizehop_size
normalizedwindowwsmissing_lenspecr   r   r   _stft   s   r   c                 C   s  | j |j ks
J d|  dksJ d| j  d}d}d}d}d}d	}d
}	d}
| j d }tj|| j| jd}t||||\}}t|| }t	| ||} t	|||}t
| ||||d \}}t|D ]}||  |k rttd qdt|| |||d d} t|| |||d d}| dd} |dd}t||  } t|| }| j d |	kr| d|	dddd} |d|	dddd}n| ddd} |ddd}tj| dddtj|dddt  }|| }d|
 d  }t|| d|  }||jddd }| | jddd } | tj| dddt  } |tj|dddt  }| | }| j d }|}t|||  ||< qd|S )a  Pytorch STOI implementation. Should only used for validation/developement, use pystoi for reporting test results.

    Arguments:
        x (Tensor): Target signal
        y (Tensor): Degraded signal
        fs_source (int): Sampling rate of input signals
    zInputs must have the same shaper   z7Expected input shape of [batch_size, samples], but got i'  (      i            g      .r   rB   z<Could not calculate STOI (not enough frames left). Skipping.)r   r   r   r#   r	   T)r   keepdim
   r$   )r   r   r7   emptyr!   r"   ry   
from_numpytor   rb   r   numelr   warningr   r~   r}   matmulrW   unfoldpermute	transposer?   rU   EPSminmean)r
   r   	fs_sourcerl   r   N_frameN_fftN_bandsro   NBetar]   outru   r^   x_y_r   rU   ccorrJMr   r   r   stoi   sZ   

$
r   )r	   )TN)typingr   r   numpyrV   r7   logurur   r   torch.nnr   rF   df.ior   finfor   r   re   r   r   rb   ry   r   r   r   r   r   r   <module>   s8    
N
!