o
    }oip                     @   s>  d dl Z d dlZd dlmZmZmZ d dlZd dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ zd dlZdZW n eyI   dZY nw dZd	d
 Zd"de	jde	jde	jfddZdd Ze	jj	d#de	jde	jdedede	jf
ddZG dd deZ G dd deZ!G dd dej"Z#G d d! d!ej"Z$dS )$    N)OptionalTupleUnion)AudioAugmentor)AudioSegment)loggingTFh㈵>c              
   C   sh  d }d }|dkr| j d }| j d }tj r)tj s)t|dk r)tdtj|| j	d
d||}||
dk }t|
d| djdd}	|jdd}
|	|

d }ttjt|
d| |
d dd dd|

dd	  }|t7 }| |
d |
d ||fS |d
krtj|j | j| j	d}tj|j | j| j	d}t| j d D ](}| |d d d ||  f  ||< | |d d d ||  f  ||< q|t7 }| |ddd |ddd ||fS d|v r/d|v r/tj|d | j	d}tj|d | j	d}| || j d | j d 
d || j d | j d 
d ||fS | ||fS )Nper_featurer         znormalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result in torch.std() returning nan. Make sure your audio length has enough samples for a single feature (ex. at least `hop_length` for Mel Spectrograms).device        )axis      ?all_featuresdtyper   
fixed_mean	fixed_std)shapetorchcudais_availableis_current_stream_capturinganyitem
ValueErroraranger   	unsqueezeexpandwheresumsqrtCONSTANTzerosr   rangemeanstdviewtensor)xseq_lennormalize_typex_meanx_std
batch_sizemax_time
time_steps
valid_maskx_mean_numeratorx_mean_denominatori r8   e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/preprocessing/features.pynormalize_batch;   sT   

*&(&>
r:   r   spectrogramspectrogram_lenreturnc                 C   sX   | j }| j\}}}tj||ddddf |dddf k}|d| }| ||S )a  
    Fill spectrogram values outside the length with `fill_value`

    Args:
        spectrogram: Tensor with shape [B, C, L] containing batched spectrograms
        spectrogram_len: Tensor with shape [B] containing the sequence length of each batch element
        fill_value: value to fill with, 0.0 by default

    Returns:
        cleaned spectrogram, tensor with shape equal to `spectrogram`
    r   Nr   )r   r   r   r   r    	expand_asmasked_fill)r;   r<   
fill_valuer   r1   _max_lenmaskr8   r8   r9   clean_spectrogram_batchr   s
   *rD   c              
   C   sh   | g}t d|D ]$}|tj| ddddd|f | dddd|df gdd qtj|ddS )zStacks frames together across feature dim

    input is batch_size, feature_dim, num_frames
    output is batch_size, feature_dim*frame_splicing, num_frames

    r   Nr
   dim)r'   appendr   cat)r,   frame_splicingseqnr8   r8   r9   splice_frames   s   FrL   r   lengthsliketime_dim
valid_onesc                 C   s   t j|j| |jd| jd d| dd}t| |  D ]}|	d}q#|dkr=|| d kr=|
d|}|sB| }|S )am  

    Args:
        lengths: Tensor with shape [B] containing the sequence length of each batch element
        like: The mask will contain the same number of dimensions as this Tensor, and will have the same max
            length in the time dimension of this Tensor.
        time_dim: Time dimension of the `shape_tensor` and the resulting mask. Zero-based.
        valid_ones: If True, valid tokens will contain value `1` and padding will be `0`. Else, invert.

    Returns:
        A :class:`torch.Tensor` containing 1's and 0's for valid and invalid tokens, respectively, if `valid_ones`, else
        vice-versa. Mask will have the same number of dimensions as `like`. Batch and time dimensions will match
        the `like`. All other dimensions will be singletons. E.g., if `like.shape == [3, 4, 5]` and
        `time_dim == -1', mask will have shape `[3, 1, 5]`.
    r   r   r   r   )r   r   r   r   repeatltr*   r'   rF   r    	transpose)rM   rN   rO   rP   rC   rA   r8   r8   r9   make_seq_mask_like   s   2rT   c                
   @   sT   e Zd ZdddZdd Zdddejd	d
ddddf
ddZdd Ze	dddZ
dS )WaveformFeaturizer>  FNc                 C   s$   |d ur|nt  | _|| _|| _d S N)r   	augmentorsample_rate
int_values)selfrY   rZ   rX   r8   r8   r9   __init__   s   
zWaveformFeaturizer.__init__c                 C   s   | j |S rW   )rX   max_augmentation_length)r[   lengthr8   r8   r9   r]      s   z*WaveformFeaturizer.max_augmentation_lengthr   <   i   i   c                 C   s2   t j|| j| j||||||||	|
|d}| |S )N)	target_srrZ   offsetdurationtrimtrim_reftrim_top_dbtrim_frame_lengthtrim_hop_lengthorig_srchannel_selectornormalize_db)r   	from_filerY   rZ   process_segment)r[   	file_pathra   rb   rc   rd   re   rf   rg   rh   ri   rj   audior8   r8   r9   process   s    
zWaveformFeaturizer.processc                 C   s   | j | tj|jtjdS )Nr   )rX   perturbr   r+   samplesfloat)r[   audio_segmentr8   r8   r9   rl      s   z"WaveformFeaturizer.process_segmentc                 C   s>   |d ur
t |}nd }|dd}|dd}| |||dS )NrY   rV   rZ   F)rY   rZ   rX   )r   from_configget)clsinput_configperturbation_configsaarY   rZ   r8   r8   r9   ru      s   zWaveformFeaturizer.from_config)rV   FNrW   )__name__
__module____qualname__r\   r]   npmaxro   rl   classmethodru   r8   r8   r8   r9   rU      s"    

rU   c                   @   s"   e Zd Zdd ZedddZdS )FeaturizerFactoryc                 C   s   d S rW   r8   r[   r8   r8   r9   r\      s   zFeaturizerFactory.__init__Nc                 C   s   t j||dS )N)ry   )rU   ru   )rw   	input_cfgry   r8   r8   r9   ru      s   zFeaturizerFactory.from_configrW   )r{   r|   r}   r\   r   ru   r8   r8   r8   r9   r      s    r   c                       s   e Zd ZdZdddddddd	d
ddddeddddd
ddddddddf fdd	Zdd Zdd Zdd Ze	dd Z
d"d d!Z  ZS )#FilterbankFeatureszkFeaturizer that converts wavs to Mel Spectrograms.
    See AudioToMelSpectrogramPreprocessor for args.
    rV   @     hannr	   N
ףp=
?@   r   Tadd      p>   333330@r   F       @r     slaneyc           "   	      s  t    |s	|rtd |r|d dkrt|  d|| _|d u s:|d u s:t|tr:t|tr:|dks:|dkrAt|  dt	d|  || _
|| _|p[dtt| j
 | _|rg| j| j d nd | _|| _|rtt	d tjtjtjtjd d	}||d }|r|| j
d
dnd }| d| || _|| _|| _|| _|| _|| _|| _|
p|d }
tj t!j"j#|| j||	|
|dtj$d%d}| d| | &tj || tj$d} |dkr|| |  nd}!| |! | _'|| _(|| _)|dvrt|  d| d|| _*|st+ | j,| _,|d u rt-. n|| _/|| _0| j0dkr8||d kr/d| _0n	t|| | | _1|| _2t3d|  t3d| j  t3d| j
  t3d| j  t3d|  t3d|	  t3d|
  t3d|  t3d|  d S )NzUsing torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.r
   r   z received exact_pad == True, but hop_size was odd. If audio_length % hop_size == 0. Then the returned spectrogram would not be of length audio_length // hop_size. Please use an even hop_size.r   z^ got an invalid value for either n_window_size or n_window_stride. Both must be positive ints.z	PADDING: zSTFT using exact pad)r   hammingblackmanbartlettnoneF)periodicwindow)srn_fftn_melsfminfmaxnormrp   fb)r   clamp
 received zK for the log_zero_guard_type parameter. It must be either 'add' or 'clamp'.r   zsr: zn_fft: zwin_length: zhop_length: zn_mels: zfmin: zfmax: zusing grads: znb_augmentation_prob: )4superr\   r   warningNotImplementedErrorlog_zero_guard_value
isinstanceintr   info
win_length
hop_lengthmathceillog2r   stft_pad_amount	exact_padr   hann_windowhamming_windowblackman_windowbartlett_windowrv   register_buffer	normalizelogditherrI   nfiltpreemphpad_tor+   librosafiltersmelrs   r    get_seq_len
max_length	pad_value	mag_power	use_gradsno_gradforwardrandomRandom_rngnb_augmentation_prob_nb_max_fft_binlog_zero_guard_typedebug)"r[   rY   n_window_sizen_window_strider   r   r   r   r   lowfreqhighfreqr   r   r   r   r   max_durationrI   r   r   r   r   rngr   nb_max_freqmel_normstft_exact_pad	stft_convtorch_windows	window_fnwindow_tensorfilterbanksr   max_pad	__class__r8   r9   r\      s   



zFilterbankFeatures.__init__c              
   C   s8   t j|| j| j| j| jrdnd| jjt j|j	dddS )NFTr   )r   r   r   centerr   return_complex)
r   stftr   r   r   r   r   tors   r   r[   r,   r8   r8   r9   r     s   zFilterbankFeatures.stftc                 C   sX   t | jtr)| jdkrt|jjS | jdkrt|jjS t|  d| j d| jS )Ntinyepsr   zT for the log_zero_guard_type parameter. It must be either a number, 'tiny', or 'eps')	r   r   strr   finfor   r   r   r   r   r8   r8   r9   log_zero_guard_value_fn  s   

z*FilterbankFeatures.log_zero_guard_value_fnc                 C   sL   | j d ur
| j d n| jd d }t|| | j | jd }|jtjdS )Nr
   r   rp   )r   r   r   floor_divider   r   long)r[   r-   
pad_amountr8   r8   r9   r     s   "zFilterbankFeatures.get_seq_lenc                 C   s   | j S rW   )r   r   r8   r8   r9   filter_banks  s   zFilterbankFeatures.filter_banksc                 C   sf  |  |}t|dkt||}| jd ur)tjj|d| j| jfd	d}| j
r;| jdkr;|| jt| 7 }| jd urgtj|d d df d|d d dd f | j|d d d df   fdd}tjj|jjdd | |}W d    n1 sw   Y  | jsdnt}t|}t|dd| }| j
r| jd	krt|jd D ]}| j | jk rd	||| jd d d f< q| j d
kr|| j }|r||fS tjj|jjdd t!| j"#|j$|}W d    n1 sw   Y  | j%r(| j&dkrt%|| '| }n| j&dkr$t%tj(|| '|d}nt)d| j*dkr4t+|| j*}| j,rCt-||| j,d\}}}|.d}tj/||jd}	|	0|.dd|dk}	|1|	dtj2j#|jd| j3}~	| j4}
|
dkrtjj|d| j5|.d f| j3d}||fS |
dkr|.d|
 }|dkrtjj|d|
| f| j3d}||fS )Nr   r   reflectr   rE   Fenabledr
   r   r   r   r   minz&log_zero_guard_type was not understood)r.   r   r   )value)6r   r   r"   
zeros_liker   nn
functionalpadr    squeezetrainingr   
randn_liker   rH   ampautocastr   typer   r   r%   view_as_realr$   powr#   r   r'   r   r   r   r   r   matmulr   r   r   r   r   r   r   r   rI   rL   r   r:   sizer   rQ   r?   boolr   r   r   )r[   r,   r-   linear_specseq_len_unfixedguardidxrA   rB   rC   r   pad_amtr8   r8   r9   r     sl   


N


&
$

zFilterbankFeatures.forward)F)r{   r|   r}   __doc__r%   r\   r   r   r   propertyr   r   __classcell__r8   r8   r   r9   r      sJ     
r   c                5       s  e Zd ZdZ											
																		dOdedededee dedee dededee ded ed!e	eef d"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ee
j d.ed/ef4 fd0d1Zed2d3 Zd4ejd5efd6d7Zd8ejd5ejfd9d:Zd8ejd5ejfd;d<Zd=ejd5ejfd>d?Zd@ejd5ejfdAdBZd@ejd5ejfdCdDZd8ejd5ejfdEdFZdPd@ejdGejdHed5ejfdIdJZdKejdLejd5eejejf fdMdNZ  ZS )QFilterbankFeaturesTAz
    Exportable, `torchaudio`-based implementation of Mel Spectrogram extraction.

    See `AudioToMelSpectrogramPreprocessor` for args.

    rV   r   r   r	   r   Nr   r   Tr   r   r   r   r   r   Fr   r   r   r   rY   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rI   r   r   r   r   r   r   r   c                    s  t    tstd| jj ddh}t|tr$||vr$td| tj	tj
tjtjtjtjd| _|| jvrFtd| d| j  || _|| _|| _|| _|
| _|| _|| _|| _|| _|| _|| _|| _tjj| j| j| j|| j| d|||	|d	d
id| _d S )Nz,Need to install torchaudio to instantiate a r   r   z;Log zero guard value must either be a float or a member of )r   r   r   r   onesNzGot window value 'z' but expected a member of r   r   F)rY   r   r   r   r   	mel_scaler   r   f_maxf_minwkwargs) r   r\   HAVE_TORCHAUDIOr   r   r{   r   r   r   r   r   r   r   r  r   keysr   r   _sample_rate_normalize_strategy_use_log_preemphasis_valuer   r   r   r   r   r   
torchaudio
transformsMelSpectrogram_mel_spec_extractor)r[   rY   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rI   r   r   r   r   r   r   r    supported_log_zero_guard_stringsr   r8   r9   r\     sT   


zFilterbankFeaturesTA.__init__c                 C   s
   | j jjS )zMatches the analogous class)r  r  r   r   r8   r8   r9   r   L  s   
z!FilterbankFeaturesTA.filter_banksr   r=   c                 C   s$   t | jtr	| jS tt|| jS rW   )r   r   rs   getattrr   r   )r[   r   r8   r8   r9   _resolve_log_zero_guard_valueQ  s   z2FilterbankFeaturesTA._resolve_log_zero_guard_valuesignalsc                 C   s,   | j r| jdkrt|| j }|| }|S )Nr   )r   r   r   r   )r[   r  noiser8   r8   r9   _apply_ditheringV  s   z%FilterbankFeaturesTA._apply_ditheringc                 C   s<   | j d urtjj|d}|| j |d d d df   }|S )N)r   r   r   )r  r   r   r   r   )r[   r  paddedr8   r8   r9   _apply_preemphasis\  s   
z'FilterbankFeaturesTA._apply_preemphasisinput_lengthsc                 C   s   |j | jddd }|S )Nfloor)rounding_moder   )divr   r   r   )r[   r  out_lengthsr8   r8   r9   _compute_output_lengthsb  s   z,FilterbankFeaturesTA._compute_output_lengthsfeaturesc                 C   sX   | j r| jdks|jd | j dkr|S | j|jd | j  }tjjj|d|f| jdS )Nr   r   )r   r   )r   r   r   r   r   r   r   r   )r[   r$  
pad_lengthr8   r8   r9   _apply_pad_tof  s   $z"FilterbankFeaturesTA._apply_pad_toc                 C   s\   | j r,| |j}| jdkr|| }n| jdkr|j|d}n	td| j d| }|S )Nr   r   r   z"Unsupported log zero guard type: '')r  r  r   r   r   r   r   )r[   r$  
zero_guardr8   r8   r9   
_apply_logm  s   


zFilterbankFeaturesTA._apply_logc                 C   sB   t jjddd | j|d}W d    |S 1 sw   Y  |S )Nr   Fr   )waveform)r   r   r   r  )r[   r  r$  r8   r8   r9   _extract_spectrogramsy  s   
z*FilterbankFeaturesTA._extract_spectrogramsrM   r   c           	      C   s   t ||ddd}||d}| jd u r|S | |j}| jdks%| jdkrid}| jdkr0ddg}|j|d	d
|ddd}|||d	dj|d	d
|dddd j
|d }|| ||  }ntd| j ||d}|S )Nr   F)rM   rN   rO   rP   r   r	   r   r
   r   T)rF   keepdimr   r   zUnsupported norm type: ')rT   r?   r  r  r   r#   r!  r*   subr   r   r$   r   )	r[   r$  rM   r   rC   guard_value
reduce_dimmeansstdsr8   r8   r9   _apply_normalization  s0   

	z)FilterbankFeaturesTA._apply_normalizationinput_signalr^   c                 C   s^   | j |d}| j|d}| j|d}| j|d}| j|d}| j||d}| j|d}||fS )N)r  )r  )r$  )r$  rM   )r#  r  r  r+  r)  r2  r&  )r[   r3  r^   feature_lengthsr  r$  r8   r8   r9   r     s   zFilterbankFeaturesTA.forward)rV   r   r   r	   r   Nr   r   NTr   r   r   r   r   r   r   Fr   r   Fr   r   r   NFF)r   )r{   r|   r}   r  r   r   r   rs   r   r   r   r   r\   r  r   r   r   r  Tensorr  r  r#  r&  r)  r+  r2  r   r   r  r8   r8   r   r9   r    s    		

R
". r  )r   )r   T)%r   r   typingr   r   r   r   numpyr~   r   torch.nnr   0nemo.collections.asr.parts.preprocessing.perturbr   0nemo.collections.asr.parts.preprocessing.segmentr   
nemo.utilsr   r  r  ModuleNotFoundErrorr%   r:   r5  rD   rL   jitscript_if_tracingr   r   rT   objectrU   r   Moduler   r  r8   r8   r8   r9   <module>   sL   "7 9	 ~