o
    ؉ii                     @   s   d dl mZ d dlT d dlmZ d dlmZmZ d dlZ	d dl
Z
d dlZd dlZdZddeeee	jf dee fd	d
Zdd Zdd ZdddZdS )    )binary_dilation)*)Path)OptionalUnionNi  fpath_or_wav	source_src                 C   sb   t | ts
t | trtjt| dd\}}n| }|dur$tj||td}t|tdd}t	|}|S )a  
    Applies preprocessing operations to a waveform either on disk or in memory such that  
    The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 
    just .wav), either the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 
    preprocessing. After preprocessing, the waveform'speaker sampling rate will match the data 
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 
    this argument will be ignored.
    N)sr)orig_sr	target_srT)increase_only)

isinstancestrr   librosaloadresamplesampling_ratenormalize_volumeaudio_norm_target_dBFStrim_long_silences)r   r   wav r   E/home/ubuntu/.local/lib/python3.10/site-packages/resemblyzer/audio.pypreprocess_wav   s   r   c                 C   s<   t jj| tttt d ttt d td}|t	j
jS )z
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this not a log-mel spectrogram.
      )yr	   n_fft
hop_lengthn_mels)r   featuremelspectrogramr   intmel_window_lengthmel_window_stepmel_n_channelsastypenpfloat32T)r   framesr   r   r   wav_to_mel_spectrogram*   s   r*   c           	      C   s   t t d }| dt| t| |   } tjdt|  gt| t tj	R  }g }t
jdd}tdt| |D ]}|| }||j||d |d  td q:t|}d	d
 }||t}t|t}t|ttd }t||}| |dk S )a+  
    Ensures that segments without voice in the waveform remain no longer than a 
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats 
    :return: the same waveform with silences trimmed away (length <= original wav length)
    r   Nz%dh   )moder      )sample_ratec                 S   sl   t t |d d | t |d f}t j|td}||d  |d |   ||d < ||d d  | S )N   r-   )dtype)r&   concatenatezeroscumsumfloat)arraywidtharray_paddedretr   r   r   moving_averageT   s   ("z*trim_long_silences.<locals>.moving_averager/   T)vad_window_lengthr   lenstructpackr&   round	int16_maxr%   int16	webrtcvadVadrangeappend	is_speechr5   vad_moving_average_widthboolr   onesvad_max_silence_lengthrepeat)	r   samples_per_windowpcm_wavevoice_flagsvadwindow_start
window_endr9   
audio_maskr   r   r   r   9   s"   	,


r   Fc                 C   sn   |r|rt dtt| t d }dt|t  }|| }|dk r'|s-|dkr/|r/| S | d|d   S )Nz,Both increase only and decrease only are setr-      r   
   )
ValueErrorr&   sqrtmeanr?   log10)r   target_dBFSr   decrease_onlyrms	wave_dBFSdBFS_changer   r   r   r   d   s   r   )N)FF)scipy.ndimage.morphologyr   resemblyzer.hparamspathlibr   typingr   r   numpyr&   rA   r   r<   r?   r   ndarrayr!   r   r*   r   r   r   r   r   r   <module>   s    $+