o
    wiS                     @   s
  d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ eG d	d
 d
eZdeeef dedededef
ddZde	e
e  deeef dededef
ddZde
e dejdeeef dededdfddZ 	d)deeef de
e deeef dedede
ee!e!f  ddfddZ"deeef de!de!de
ee!e!f  fddZ#deeef de
e d e	eeef  ddfd!d"Z$G d#d$ d$eZ%G d%d& d&eZ&G d'd( d(eZ'dS )*    N)ABCabstractmethod)Path)AnyDictListOptionalTupleUnion)Tensor)!AudioToMelSpectrogramPreprocessor)get_audio_filepathsnormalize_volumestack_tensors)experimentalc                   @   s   e Zd Zeddeeef dedededdf
dd	Z	edeeef dededeee
f fd
dZedeeee
f  deee
f fddZdS )
FeaturizerTmanifest_entry	audio_dirfeature_dir	overwritereturnNc                 C      dS )a\  
        Save feature value to disk for given manifest entry.

        Args:
            manifest_entry: Manifest entry dictionary.
            audio_dir: base directory where audio is stored.
            feature_dir: base directory where features will be stored.
            overwrite: whether to overwrite features if they already exist.
        N )selfr   r   r   r   r   r   n/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/parts/preprocessing/features.pysave!       zFeaturizer.savec                 C   r   )aZ  
        Read saved feature value for given manifest entry.

        Args:
            manifest_entry: Manifest entry dictionary.
            audio_dir: base directory where audio is stored.
            feature_dir: base directory where features were stored by save().

        Returns:
            Dictionary of feature names to Tensors
        Nr   )r   r   r   r   r   r   r   load-   r   zFeaturizer.loadtrain_batchc                 C   r   )zK
        Combine list/batch of features into a feature dictionary.
        Nr   )r   r   r   r   r   
collate_fn;   r   zFeaturizer.collate_fnT)__name__
__module____qualname__r   r   strr   r   boolr   r   r   r   r   r   r   r   r   r      s    *,,r   r   r   r   feature_namer   c                 C   s&   t | |d\}}|| |d }|S )z
    Get the absolute path for the feature file corresponding to the input manifest entry

    Example: audio_filepath "<audio_dir>/speaker1/audio1.wav" becomes
        feature_filepath "<feature_dir>/<feature_name>/speaker1/audio1.npy"
    r   r   z.npy)r   with_suffix)r   r   r   r&   _audio_filepath_relfeature_filepathr   r   r   _get_feature_filepathB   s   	r,   feature_namesc                 C   s6   | D ]}|d u r	qt ||||d}| s dS qdS )Nr   r   r   r&   FT)r,   exists)r-   r   r   r   r&   r+   r   r   r   _features_existsP   s   r0   featuresc                 C   sB   | du rdS t |||| d}|jjddd tjt||d dS )zA
    If feature_name is provided, save feature as .npy file.
    Nr.   T)exist_okparents)filearr)r,   parentmkdirnpr   r$   )r&   r1   r   r   r   r+   r   r   r   _save_feature^   s   
r9   feature_dictindicesc           
      C   sv   |du rdS t ||||d}t|}|r+tj|dd}||d |d  }t|}nt|}t|}	|	| |< dS )zU
    If feature_name is provided, load feature into feature_dict from .npy file.
    Nr.   r)	mmap_moder      )r,   r$   r8   r   copytorch
from_numpy)
r:   r&   r   r   r   r;   r+   feature_mmapfeature_arrayfeature_tensorr   r   r   _load_featurer   s   

rE   sample_rate
hop_lengthc                 C   sP   d| vrd S | d }| d }t jj|||d}d| t jj|||d }||fS )Noffsetduration)srrG   r>   )librosacoretime_to_frames)r   rF   rG   rH   rI   start_iend_ir   r   r   _get_frame_indices   s   rP   r   c                 C   sX   |d u rd S g }|D ]}|| }| | q
tdd |D }t||gd}|| |< d S )Nc                 S   s   g | ]}|j d  qS )r   )shape).0fr   r   r   
<listcomp>   s    z$_collate_feature.<locals>.<listcomp>)max_lens)appendmaxr   )r:   r&   r   feature_tensorsexamplerD   max_lenstacked_featuresr   r   r   _collate_feature   s   r\   c                   @   s  e Zd Z											
		d(dededededededededededeeeef  deddfddZ	de
eef dedejfddZd)de
eef deded eddf
d!d"Zde
eef dedede
eef fd#d$Zd%ee
eef  de
eef fd&d'ZdS )*MelSpectrogramFeaturizermel_spec"V  P         r   @  Tadd      ?Nr&   rF   mel_dim
win_lengthrG   lowfreqhighfreqloglog_zero_guard_typelog_zero_guard_valuemel_normvolume_normr   c                 C   s   || _ || _|| _|| _|| _tdi d|d|ddd|d|ddd	dd
|d|d|ddd|d|	d|
d|dd dd dd| _d S )NrF   r1   pad_tor>   n_window_sizen_window_stridewindow_sizeFwindow_striden_fftrh   ri   	mag_powerre   rj   rk   rl   rm   	normalizepreemphdither        r   )r&   rF   rg   rG   rn   r   preprocessor)r   r&   rF   rf   rg   rG   rh   ri   rj   rk   rl   rm   rn   r   r   r   __init__   sT   	

z!MelSpectrogramFeaturizer.__init__r   r   c           
      C   s   t ||d\}}tj|| jd\}}| jrt|}tj|tj	ddf tj
d}tj|jd gtjd}| j||d\}}| d }| }	|	S )a(  
        Computes mel spectrogram for the input manifest entry.

        Args:
            manifest_entry: Manifest entry dictionary.
            audio_dir: base directory where audio is store

        Returns:
            [spec_dim, T_spec] float tensor containing spectrogram features.
        r'   rJ   N)dtyper   )input_signallength)r   rK   r   rF   rn   r   r@   tensorr8   newaxisfloat32rQ   int32rz   detachnumpy)
r   r   r   audio_filepath_absr)   audioaudio_tensoraudio_len_tensorspec_tensor
spec_arrayr   r   r   compute_mel_spec   s   z)MelSpectrogramFeaturizer.compute_mel_specr   r   c                 C   B   |st | jg|||drd S | j||d}t| j||||d d S N)r-   r   r   r   r'   )r&   r1   r   r   r   )r0   r&   r   r9   )r   r   r   r   r   specr   r   r   r         
zMelSpectrogramFeaturizer.savec                 C   s0   i }t || j| jd}t|| j||||d |S N)r   rF   rG   )r:   r&   r   r   r   r;   )rP   rF   rG   rE   r&   r   r   r   r   r:   r;   r   r   r   r     s   
zMelSpectrogramFeaturizer.loadr   c                 C      i }t || j|d |S N)r:   r&   r   r\   r&   r   r   r:   r   r   r   r        z#MelSpectrogramFeaturizer.collate_fn)r^   r_   r`   ra   rb   r   rc   Trd   re   NTr    )r!   r"   r#   r$   intr%   floatr   r
   r{   r   r   r   r8   ndarrayr   r   r   r   r   r   r   r   r   r   r]      sX    	

 *(**r]   c                   @   s   e Zd ZddededdfddZdeeef d	ede	j
fd
dZddeeef d	edededdf
ddZdeeef d	ededeeef fddZdeeeef  deeef fddZdS )EnergyFeaturizerenergyspec_featurizerr&   r   Nc                 C   s   || _ || _d S N)r&   r   )r   r   r&   r   r   r   r{     s   
zEnergyFeaturizer.__init__r   r   c                 C   s$   | j j||d}tjj|dd}|S )a  
        Computes energy for the input manifest entry.

        Args:
            manifest_entry: Manifest entry dictionary.
            audio_dir: base directory where audio is store

        Returns:
            [T_spec] float tensor containing energy features.
        r'   r   axis)r   r   r8   linalgnorm)r   r   r   r   r   r   r   r   compute_energy  s   zEnergyFeaturizer.compute_energyTr   r   c                 C   r   r   )r0   r&   r   r9   )r   r   r   r   r   r   r   r   r   r   /  r   zEnergyFeaturizer.savec                 C   s4   i }t || jj| jjd}t|| j||||d |S r   )rP   r   rF   rG   rE   r&   r   r   r   r   r   A  s   zEnergyFeaturizer.loadr   c                 C   r   r   r   r   r   r   r   r   R  r   zEnergyFeaturizer.collate_fn)r   r    )r!   r"   r#   r]   r$   r{   r   r   r   r8   r   r   r%   r   r   r   r   r   r   r   r   r   r     s     (**r   c                   @   s$  e Zd ZdZddddddeded	d
ddfdee dee dee dededededede	dee
 deddfddZdeeef dedeejejejf fddZd(deeef deded e	ddf
d!d"Zdeeef dededeeef fd#d$Zd%eeeef  deeef fd&d'ZdS ))PitchFeaturizera  
    Class for computing pitch features.

    Args:
        pitch_name: Optional directory name to save pitch features under.
            If None, then pitch will not be saved.
        voiced_mask_name: Optional directory name to save voiced mask under.
            If None, then voiced mask will not be saved.
        voiced_prob_name: Optional directory name to save voiced probabilities under.
            If None, then voiced probabilities will not be saved.
        sample_rate: Sample rate to use when loading audio.
        win_length: Audio frame length to use for pitch computation.
        hop_length: Audio hop length to use for pitch computation.
        pitch_fmin: Minimum pitch value to compute. Defaults to librosa.note_to_hz('C2') = 65.41 Hz.
        pitch_fmax: Maximum pitch value to compute. Defaults to librosa.note_to_hz('C7') = 2093.00 Hz.
            Setting this to a lower value will speed up computation, but may lose some pitch information.
        volume_norm: Whether to apply volume normalization to the audio.
        batch_seconds: Optional float, if provided then long audio files will have their pitch computed after
            splitting them into segments batch_seconds seconds long, to avoid running out of memory.
        batch_padding: If batch_seconds is provided, then this determines how many audio frames will be padded on
            both sides of each segment to ensure that the pitch values at the boundary are correct.
            If batch_seconds is not provided then this parameter is ignored.
    pitchvoiced_maskNr_   ra   rb   C2C7Tg      >@
   
pitch_namevoiced_mask_namevoiced_prob_namerF   rg   rG   
pitch_fmin
pitch_fmaxrn   batch_secondsbatch_paddingr   c                 C   s   || _ || _|| _|| _|| _|| _|	| _|| _|| _|
rG|d us#J t	|
| }t	t
|| j | _| j| j | _|| _| j| j | _d S d | _d | _d S r   )r   r   r   rF   rg   rG   rn   r   r   r   mathceilbatch_framesbatch_samplesbatch_padding_framesbatch_padding_samplesr   )r   r   r   r   rF   rg   rG   r   r   rn   r   r   r   r   r   r   r{   q  s$   
zPitchFeaturizer.__init__r   r   c              
   C   s  t ||d\}}tj|| jd\}}| jrt|}| jr$|jd | jk r9tj|| j	| j
| j| j| jdd\}}}ntt|jd | j }	g }
g }g }t|	D ]}|| j }|d | j }|dkrg|| j8 }||	d krr|| j7 }||| }tj|| j	| j
| j| j| jdd\}}}|dkr|| jd }|| jd }|| jd }||	d kr|d| j }|d| j }|d| j }|
| || || qPtj|
dd}tj|dd}tj|dd}|tj}|tj}|||fS )	a  
        Computes pitch and optional voiced mask for the input manifest entry.

        Args:
            manifest_entry: Manifest entry dictionary.
            audio_dir: base directory where audio is store

        Returns:
            pitch: [T_spec] float tensor containing pitch for each audio frame.
            voiced_mask: [T_spec] bool tensor indicating whether each audio frame is voiced.
            voiced_prob: [T_spec] float array with [0, 1] probability that each audio frame is voiced.
        r'   r|   r   ry   )fminfmaxrJ   frame_lengthrG   fill_nar>   Nr   )r   rK   r   rF   rn   r   r   rQ   pyinr   r   rg   rG   r   r8   r   ranger   r   r   rV   concatenateastyper   )r   r   r   r   r)   r   r   r   voiced_prob
num_chunks
pitch_listvoiced_mask_listvoiced_prob_listirN   rO   audio_chunkpitch_ivoiced_mask_ivoiced_prob_ir   r   r   compute_pitch  sh   







zPitchFeaturizer.compute_pitchr   r   c                 C   sx   |st | j| j| jg|||drd S | j||d\}}}t| j||||d t| j||||d t| j||||d d S r   )r0   r   r   r   r   r9   )r   r   r   r   r   r   r   r   r   r   r   r     s:   
zPitchFeaturizer.savec                 C   s\   i }t || j| jd}t|| j||||d t|| j||||d t|| j||||d |S r   )rP   rF   rG   rE   r   r   r   r   r   r   r   r     s:   
zPitchFeaturizer.loadr   c                 C   s8   i }t || j|d t || j|d t || j|d |S r   )r\   r   r   r   r   r   r   r   r   $  s
   zPitchFeaturizer.collate_fnr    )r!   r"   r#   __doc__rK   
note_to_hzr   r$   r   r%   r   r{   r   r   r   r	   r8   r   r   r   r   r   r   r   r   r   r   r   r   X  s`    	

#

(Q* *r   r   )(r   abcr   r   pathlibr   typingr   r   r   r   r	   r
   rK   r   r8   r@   r   nemo.collections.asr.modulesr   2nemo.collections.tts.parts.utils.tts_dataset_utilsr   r   r   nemo.utils.decoratorsr   r   r$   r,   r%   r0   r   r9   r   rE   rP   r\   r]   r   r   r   r   r   r   <module>   s    "









.

p?