o
    wiq                     @   s>  d dl Z d dlZd dlmZmZmZ d dlZd dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ zd dlZdZW n eyI   dZY nw dZd	d
 Zd"de	jde	jde	jfddZdd Ze	jj	d#de	jde	jdedede	jf
ddZG dd deZ G dd deZ!G dd dej"Z#G d d! d!ej"Z$dS )$    N)OptionalTupleUnion)AudioAugmentor)AudioSegment)loggingTFh㈵>c              
   C   sx  d }d }|dkr| j d }| j d }tj r)tj s)t|dk r)tdtj|| j	d
d||}||
dk }t|
d| djdd}	|jdd}
|	|

d }ttjt|
d| |
d dd dd|

dd	  }|| d}|t7 }| |
d |
d ||fS |d
krtj|j | j| j	d}tj|j | j| j	d}t| j d D ](}| |d d d ||  f  ||< | |d d d ||  f  ||< q|t7 }| |ddd |ddd ||fS d|v r7d|v r7tj|d | j	d}tj|d | j	d}| || j d | j d 
d || j d | j d 
d ||fS | ||fS )Nper_featurer         znormalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result in torch.std() returning nan. Make sure your audio length has enough samples for a single feature (ex. at least `hop_length` for Mel Spectrograms).device        )axis      ?all_featuresdtyper   
fixed_mean	fixed_std)shapetorchcudais_availableis_current_stream_capturinganyitem
ValueErroraranger   	unsqueezeexpandwheresumsqrtmasked_fillisnanCONSTANTzerosr   rangemeanstdviewtensor)xseq_lennormalize_typex_meanx_std
batch_sizemax_time
time_steps
valid_maskx_mean_numeratorx_mean_denominatori r:   n/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/parts/preprocessing/features.pynormalize_batch;   sV   

*&(&>
r<   r   spectrogramspectrogram_lenreturnc                 C   sX   | j }| j\}}}tj||ddddf |dddf k}|d| }| ||S )a  
    Fill spectrogram values outside the length with `fill_value`

    Args:
        spectrogram: Tensor with shape [B, C, L] containing batched spectrograms
        spectrogram_len: Tensor with shape [B] containing the sequence length of each batch element
        fill_value: value to fill with, 0.0 by default

    Returns:
        cleaned spectrogram, tensor with shape equal to `spectrogram`
    r   Nr   )r   r   r   r   r    	expand_asr%   )r=   r>   
fill_valuer   r3   _max_lenmaskr:   r:   r;   clean_spectrogram_batchs   s
   *rE   c              
   C   sh   | g}t d|D ]$}|tj| ddddd|f | dddd|df gdd qtj|ddS )zStacks frames together across feature dim

    input is batch_size, feature_dim, num_frames
    output is batch_size, feature_dim*frame_splicing, num_frames

    r   Nr
   dim)r)   appendr   cat)r.   frame_splicingseqnr:   r:   r;   splice_frames   s   FrM   r   lengthsliketime_dim
valid_onesc                 C   s   t j|j| |jd| jd d| dd}t| |  D ]}|	d}q#|dkr=|| d kr=|
d|}|sB| }|S )am  

    Args:
        lengths: Tensor with shape [B] containing the sequence length of each batch element
        like: The mask will contain the same number of dimensions as this Tensor, and will have the same max
            length in the time dimension of this Tensor.
        time_dim: Time dimension of the `shape_tensor` and the resulting mask. Zero-based.
        valid_ones: If True, valid tokens will contain value `1` and padding will be `0`. Else, invert.

    Returns:
        A :class:`torch.Tensor` containing 1's and 0's for valid and invalid tokens, respectively, if `valid_ones`, else
        vice-versa. Mask will have the same number of dimensions as `like`. Batch and time dimensions will match
        the `like`. All other dimensions will be singletons. E.g., if `like.shape == [3, 4, 5]` and
        `time_dim == -1', mask will have shape `[3, 1, 5]`.
    r   r   r   r   )r   r   r   r   repeatltr,   r)   rG   r    	transpose)rN   rO   rP   rQ   rD   rB   r:   r:   r;   make_seq_mask_like   s   2rU   c                
   @   sT   e Zd ZdddZdd Zdddejd	d
ddddf
ddZdd Ze	dddZ
dS )WaveformFeaturizer>  FNc                 C   s$   |d ur|nt  | _|| _|| _d S N)r   	augmentorsample_rate
int_values)selfrZ   r[   rY   r:   r:   r;   __init__   s   
zWaveformFeaturizer.__init__c                 C   s   | j |S rX   )rY   max_augmentation_length)r\   lengthr:   r:   r;   r^      s   z*WaveformFeaturizer.max_augmentation_lengthr   <   i   i   c                 C   s2   t j|| j| j||||||||	|
|d}| |S )N)	target_srr[   offsetdurationtrimtrim_reftrim_top_dbtrim_frame_lengthtrim_hop_lengthorig_srchannel_selectornormalize_db)r   	from_filerZ   r[   process_segment)r\   	file_pathrb   rc   rd   re   rf   rg   rh   ri   rj   rk   audior:   r:   r;   process   s    
zWaveformFeaturizer.processc                 C   s   | j | tj|jtjdS )Nr   )rY   perturbr   r-   samplesfloat)r\   audio_segmentr:   r:   r;   rm      s   z"WaveformFeaturizer.process_segmentc                 C   s>   |d ur
t |}nd }|dd}|dd}| |||dS )NrZ   rW   r[   F)rZ   r[   rY   )r   from_configget)clsinput_configperturbation_configsaarZ   r[   r:   r:   r;   rv      s   zWaveformFeaturizer.from_config)rW   FNrX   )__name__
__module____qualname__r]   r^   npmaxrp   rm   classmethodrv   r:   r:   r:   r;   rV      s"    

rV   c                   @   s"   e Zd Zdd ZedddZdS )FeaturizerFactoryc                 C   s   d S rX   r:   r\   r:   r:   r;   r]      s   zFeaturizerFactory.__init__Nc                 C   s   t j||dS )N)rz   )rV   rv   )rx   	input_cfgrz   r:   r:   r;   rv      s   zFeaturizerFactory.from_configrX   )r|   r}   r~   r]   r   rv   r:   r:   r:   r;   r      s    r   c                       s   e Zd ZdZdddddddd	d
ddddeddddd
ddddddddf fdd	Zdd Zdd Zdd Ze	dd Z
d"d d!Z  ZS )#FilterbankFeatureszkFeaturizer that converts wavs to Mel Spectrograms.
    See AudioToMelSpectrogramPreprocessor for args.
    rW   @     hannr	   N
ףp=
?@   r   Tadd      p>   333330@r   F       @r     slaneyc           "   	      s  t    |s	|rtd |r|d dkrt|  d|| _|d u s:|d u s:t|tr:t|tr:|dks:|dkrAt|  dt	d|  || _
|| _|| _|p^dtt| j | _|rj| j| j d nd | _|| _|| _
|rzt	d tjtjtjtjd d	}||d }|r|| jd
dnd }| d| || _|| _|| _|| _|| _|| _|| _ |
p|d }
tj!t"j#j$|| j||	|
|dtj%d&d}| d| | 'tj!|| tj%d} |dkr|| |  nd}!| |! | _(|| _)|| _*|dvrt|  d| d|| _+|st, | j-| _-|d u rt./ n|| _0|| _1| j1dkr>||d kr5d| _1n	t|| | | _2|| _3t4d|  t4d| j  t4d| j  t4d| j  t4d|  t4d|	  t4d|
  t4d|  t4d|  d S )NzUsing torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.r
   r   z received exact_pad == True, but hop_size was odd. If audio_length % hop_size == 0. Then the returned spectrogram would not be of length audio_length // hop_size. Please use an even hop_size.r   z^ got an invalid value for either n_window_size or n_window_stride. Both must be positive ints.z	PADDING: zSTFT using exact pad)r   hammingblackmanbartlettnoneF)periodicwindow)srn_fftn_melsfminfmaxnormrq   fb)r   clamp
 received zK for the log_zero_guard_type parameter. It must be either 'add' or 'clamp'.r   zsr: zn_fft: zwin_length: zhop_length: zn_mels: zfmin: zfmax: zusing grads: znb_augmentation_prob: )5superr]   r   warningNotImplementedErrorlog_zero_guard_value
isinstanceintr   inforZ   
win_length
hop_lengthmathceillog2r   stft_pad_amount	exact_padr   hann_windowhamming_windowblackman_windowbartlett_windowrw   register_buffer	normalizelogditherrJ   nfiltpreemphpad_tor-   librosafiltersmelrt   r    get_seq_len
max_length	pad_value	mag_power	use_gradsno_gradforwardrandomRandom_rngnb_augmentation_prob_nb_max_fft_binlog_zero_guard_typedebug)"r\   rZ   n_window_sizen_window_strider   r   r   r   r   lowfreqhighfreqr   r   r   r   r   max_durationrJ   r   r   r   r   rngr   nb_max_freqmel_normstft_exact_pad	stft_convtorch_windows	window_fnwindow_tensorfilterbanksr   max_pad	__class__r:   r;   r]      s   



zFilterbankFeatures.__init__c              
   C   s:   t j|| j| j| j| jrdnd| jjt j|j	ddddS )NFTr   constant)r   r   r   centerr   return_complexpad_mode)
r   stftr   r   r   r   r   tort   r   r\   r.   r:   r:   r;   r     s   zFilterbankFeatures.stftc                 C   sX   t | jtr)| jdkrt|jjS | jdkrt|jjS t|  d| j d| jS )Ntinyepsr   zT for the log_zero_guard_type parameter. It must be either a number, 'tiny', or 'eps')	r   r   strr   finfor   r   r   r   r   r:   r:   r;   log_zero_guard_value_fn  s   

z*FilterbankFeatures.log_zero_guard_value_fnc                 C   sH   | j d ur
| j d n| jd d }t|| | j | j}|jtjdS )Nr
   rq   )r   r   r   floor_divider   r   long)r\   r/   
pad_amountr:   r:   r;   r     s   "zFilterbankFeatures.get_seq_lenc                 C   s   | j S rX   )r   r   r:   r:   r;   filter_banks  s   zFilterbankFeatures.filter_banksc                 C   s  |}|  |}t|dkt||}| jd ur+tjj|d| j| jfd	d}| j
r=| jdkr=|| jt| 7 }| jd urtj|jd |jdd|dk }tj|d d df d|d d dd f | j|d d d df   fdd}|| d}tjj|jjdd	 | |}W d    n1 sw   Y  | jsdnt}t|}t|d
d| }| j
r| jdkrt|jd D ]}| j  | jk rd||| j!d d d f< q| j"dkr|| j"}|r||fS tjj|jjdd	 t#| j$%|j&|}W d    n	1 sw   Y  | j'rE| j(dkr-t'|| )| }n| j(dkrAt'tj*|| )|d}nt+d| j,dkrQt-|| j,}| j.r`t/||| j.d\}}	}	|0d}
tj|
|jd}|1|0dd|dk}||dtj2j%|jd| j3}~| j4}|dkrtjj|d| j5|0d f| j3d}||fS |dkr|0d| }|dkrtjj|d|| f| j3d}||fS )Nr   r   r   r   r   rF   r   Fenabledr
   r   r   r   minz&log_zero_guard_type was not understood)r0   r   )value)6r   r   r"   
zeros_liker   nn
functionalpadr    squeezetrainingr   
randn_liker   r   r   r   rI   r%   ampautocasttyper   r   r'   view_as_realr$   powr#   r   r)   r   r   r   r   matmulr   r   r   r   r   r   r   r   rJ   rM   r   r<   sizerR   boolr   r   r   )r\   r.   r/   linear_specseq_len_timeseq_len_unfixedtimemaskguardidxrB   rC   rD   r   pad_amtr:   r:   r;   r     sr   


&N


&
$

zFilterbankFeatures.forward)F)r|   r}   r~   __doc__r'   r]   r   r   r   propertyr   r   __classcell__r:   r:   r   r;   r      sJ     
r   c                5       s  e Zd ZdZ											
																		dOdedededee dedee dededee ded ed!e	eef d"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ee
j d.ed/ef4 fd0d1Zed2d3 Zd4ejd5efd6d7Zd8ejd5ejfd9d:Zd8ejd5ejfd;d<Zd=ejd5ejfd>d?Zd@ejd5ejfdAdBZd@ejd5ejfdCdDZd8ejd5ejfdEdFZdPd@ejdGejdHed5ejfdIdJZdKejdLejd5eejejf fdMdNZ  ZS )QFilterbankFeaturesTAz
    Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction.

    See `AudioToMelSpectrogramPreprocessor` for args.

    rW   r   r   r	   r   Nr   r   Tr   r   r   r   r   r   Fr   r   r   r   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rJ   r   r   r   r   r   r   r   c                    s  t    tstd| jj ddh}t|tr$||vr$td| tj	tj
tjtjtjtjd| _|| jvrFtd| d| j  || _|| _|| _|| _|
| _|| _|| _|| _|| _|| _|| _|| _tjj| j| j| j|| j| d|||	|d	d
id| _d S )Nz,Need to install torchaudio to instantiate a r   r   z;Log zero guard value must either be a float or a member of )r   r   r   r   onesNzGot window value 'z' but expected a member of r   r   F)rZ   r   r   r   r   	mel_scaler   r   f_maxf_minwkwargs) r   r]   HAVE_TORCHAUDIOr   r   r|   r   r   r   r   r   r   r   r  r   keysr   r   _sample_rate_normalize_strategy_use_log_preemphasis_valuer   r   r   r   r   r   
torchaudio
transformsMelSpectrogram_mel_spec_extractor)r\   rZ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rJ   r   r   r   r   r   r   r    supported_log_zero_guard_stringsr   r:   r;   r]     sT   


zFilterbankFeaturesTA.__init__c                 C   s
   | j jjS )zMatches the analogous class)r  r  r   r   r:   r:   r;   r   S  s   
z!FilterbankFeaturesTA.filter_banksr   r?   c                 C   s$   t | jtr	| jS tt|| jS rX   )r   r   rt   getattrr   r   )r\   r   r:   r:   r;   _resolve_log_zero_guard_valueX  s   z2FilterbankFeaturesTA._resolve_log_zero_guard_valuesignalsc                 C   s,   | j r| jdkrt|| j }|| }|S )Nr   )r   r   r   r   )r\   r  noiser:   r:   r;   _apply_dithering]  s   z%FilterbankFeaturesTA._apply_ditheringc                 C   s<   | j d urtjj|d}|| j |d d d df   }|S )N)r   r   r   )r  r   r   r   r   )r\   r  paddedr:   r:   r;   _apply_preemphasisc  s   
z'FilterbankFeaturesTA._apply_preemphasisinput_lengthsc                 C   s   |j | jddd }|S )Nfloor)rounding_moder   )divr   r   r   )r\   r"  out_lengthsr:   r:   r;   _compute_output_lengthsi  s   z,FilterbankFeaturesTA._compute_output_lengthsfeaturesc                 C   sX   | j r| jdks|jd | j dkr|S | j|jd | j  }tjjj|d|f| jdS )Nr   r   )r   r   )r   r   r   r   r   r   r   r   )r\   r(  
pad_lengthr:   r:   r;   _apply_pad_tom  s   $z"FilterbankFeaturesTA._apply_pad_toc                 C   s\   | j r,| |j}| jdkr|| }n| jdkr|j|d}n	td| j d| }|S )Nr   r   r   z"Unsupported log zero guard type: '')r  r  r   r   r   r   r   )r\   r(  
zero_guardr:   r:   r;   
_apply_logt  s   


zFilterbankFeaturesTA._apply_logc                 C   sB   t jjddd | j|d}W d    |S 1 sw   Y  |S )Nr   Fr   )waveform)r   r   r   r  )r\   r  r(  r:   r:   r;   _extract_spectrograms  s   
z*FilterbankFeaturesTA._extract_spectrogramsrN   r   c           	      C   s   t ||ddd}||d}| jd u r|S | |j}| jdks%| jdkrid}| jdkr0ddg}|j|d	d
|ddd}|||d	dj|d	d
|dddd j
|d }|| ||  }ntd| j ||d}|S )Nr   F)rN   rO   rP   rQ   r   r	   r   r
   r   T)rG   keepdimr   r   zUnsupported norm type: ')rU   r%   r  r  r   r#   r%  r,   subr   r   r$   r   )	r\   r(  rN   r   rD   guard_value
reduce_dimmeansstdsr:   r:   r;   _apply_normalization  s0   

	z)FilterbankFeaturesTA._apply_normalizationinput_signalr_   c                 C   s^   | j |d}| j|d}| j|d}| j|d}| j|d}| j||d}| j|d}||fS )N)r"  )r  )r(  )r(  rN   )r'  r  r!  r/  r-  r6  r*  )r\   r7  r_   feature_lengthsr  r(  r:   r:   r;   r     s   zFilterbankFeaturesTA.forward)rW   r   r   r	   r   Nr   r   NTr   r   r   r   r   r   r   Fr   r   Fr   r   r   NFF)r   )r|   r}   r~   r  r   r   r   rt   r   r   r   r   r]   r  r   r   r   r  Tensorr  r!  r'  r*  r-  r/  r6  r   r   r	  r:   r:   r   r;   r
    s    		

R
". r
  )r   )r   T)%r   r   typingr   r   r   r   numpyr   r   torch.nnr   0nemo.collections.asr.parts.preprocessing.perturbr   0nemo.collections.asr.parts.preprocessing.segmentr   
nemo.utilsr   r  r  ModuleNotFoundErrorr'   r<   r9  rE   rM   jitscript_if_tracingr   r   rU   objectrV   r   Moduler   r
  r:   r:   r:   r;   <module>   sN   "8 9	  