o
    wiW*                     @   s   d Z ddlmZmZ ddlZddlmZ ddlZddl	m
Z
 ddlm
  mZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZmZ d
ZdZeed  ZG dd dejZ G dd deZ!dS )a  
Implementation of Maxine BNR2 denoising network

Maxine Background Noise Removal (BNR) 2.0 is an audio background noise removal
model from NVIDIA. This is the second generation of BNR from
Maxine Audio Effects SDK. BNR 2.0 removes unwanted noises from audio improving
speech intelligibility and also improving the speech recognition accuracy of
various ASR systems under noisy environments.

BNR 2.0 uses the SEASR architecture described in https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10837982
    )DictOptionalN)Trainer)
DictConfig)AudioToAudioModel)apply_weight_norm_lstmremove_weight_norm_lstm)PretrainedModelInfo	typecheck)AudioSignal
NeuralTypei>  
   i  c                       s<   e Zd ZdZ	d fd	d
	Zdd Zdd Zdd Z  ZS )_Seasrz*Internal implementation of the model class   F@                 ?c	           
         s  |t krtdt   || _|| _|d | _| j| _|r dnd}	tj	d| j||dd| _
tj| jdd	| _tj| j| jdd
d| _tj	| j| jd|	d| _tj| jdd	| _tj| j| jdd
d| _tj	| j| jd|	d| _tj| jdd	| _tj| j| jdd
d| _tj	| j| jd|	d| _tj| jdd	| _tjd| j | j | jd
|d| _tj| j| jdd
|d| _tj| j| jdd
|d| _tj| j| jd
d| _t| j| j| _t | _tj| jd||d| _ t! | _"d S )NzCurrently only 16k is supported   r   same   F)kernel_sizestridebiasgMbP?)epsT)
num_layersbatch_first   )r   padding)r   dropout)r   r   r    )r   )r   r   )#SUPPORTED_SAMPLE_RATEAssertionErrorsuper__init__f1f2f3	gru_nodesnnConv1dconv1dBatchNorm1dbn0GRUfeature_gru0conv1d_out1bn1feature_gru1conv1d_out2bn2feature_gru2conv1d_out3bn3denoise_grudenoise_gru_1denoise_gru_2denoise_gru_3Lineardenoise_maskSigmoidmask_actConvTranspose1dinv_convTanhinv_conv_activation)
selfsample_ratehidden_nodes	streamingr   r%   r&   r   r    r   	__class__ e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/audio/models/maxine/bnr.pyr$   2   s6   

"
z_Seasr.__init__c                 K   sn  | d}t| |}| |}t| |}| |}| |	ddd\}}t| 
|}| |}| |	ddd\}}t| |}	| |	}	|		ddd}	|	ddd}| |\}
}t|
|||	fd}| |\}}|| }| |\}}|| }| |\}}|| }| |\}}| | |}|	ddd}|| }| |}| |}|S )Nx0r   r   r   )getFrelur+   r-   
leaky_relur0   r1   r2   permuter3   r4   r5   r6   r7   r/   torchcatr8   r9   r:   r;   r?   r=   rA   rC   )rD   kwargsrL   xc0xc1fg1_xc2fg2xc3fg0xixi1xi2maskrJ   rJ   rK   forwardZ   s8   






z_Seasr.forwardc                 C      dd }|  | dS )z2Apply weight normalization module from all layers.c                 S   sN   t | tjjtjjfrtjj|  d S t | tjjtjjfr%t	|  d S d S N)

isinstancerR   r)   r*   r<   utilsweight_normLSTMr.   r   mrJ   rJ   rK   _apply_weight_norm   s
   z4_Seasr.apply_weight_norm.<locals>._apply_weight_normNapply)rD   rj   rJ   rJ   rK   apply_weight_norm   s   z_Seasr.apply_weight_normc                 C   rb   )z3Remove weight normalization module from all layers.c                 S   sj   z*t | tjjtjjfrtjj|  W d S t | tjjtjjfr(t	|  W d S W d S  t
y4   Y d S w rc   )rd   rR   r)   r*   r<   re   remove_weight_normrg   r.   r   
ValueErrorrh   rJ   rJ   rK   _remove_weight_norm   s   z6_Seasr.remove_weight_norm.<locals>._remove_weight_normNrk   )rD   rp   rJ   rJ   rK   rn      s   	z_Seasr.remove_weight_norm)r   Fr   r   r   r   r   )	__name__
__module____qualname____doc__r$   ra   rm   rn   __classcell__rJ   rJ   rH   rK   r   /   s    ((r   c                       s   e Zd ZdZddedef fddZedee	e
f fdd	Zedee	e
f fd
dZe dd Zdd Zddede	fddZedee fddZ  ZS )BNR2z!Implementation of the BNR 2 modelNcfgtrainerc                    s   d| _ |d ur|j | _ t j||d | jj| _|   t| j| _t| jdr;t| jj	dr=| jj	j
r?| j  d S d S d S d S )Nr   )rw   rx   trainenable_weight_norm)
world_sizer#   r$   _cfgrE   setup_optimization_flagsr   seasrhasattrry   rz   rm   )rD   rw   rx   rH   rJ   rK   r$      s   

zBNR2.__init__returnc                 C      dt dt| jdiS )Ninput_signalBCTfreqr   r   rE   rD   rJ   rJ   rK   input_types      zBNR2.input_typesc                 C   r   )Noutput_signalr   r   r   r   rJ   rJ   rK   output_types   r   zBNR2.output_typesc                 C   sj   |j dkr|jd dkrtdn|j dkrtd|j|jd t dkr.tdt| jj|d	S )
a  
        Forward pass of the model.

        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.

        Returns:
            Output signal `output` in the time domain and the length of the output signal `output_length`.
        r   r   zBThis network currently only supports single channel audio signals.r   zKInvalid shape for input signal (received {}, supported [B, 1, T] or [B, T])r   z&Input samples must be a multiple of {})rL   )ndimshapero   formatSUPPORTED_INPUT_ALIGN_SAMPLESr~   ra   )rD   r   rJ   rJ   rK   ra      s   

zBNR2.forwardc           	      C   s   t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| j|d}| j|||d}| d| | d	| jj	d
 d  | dt
j| jjt
jd |S )Nr   input_lengthtarget_signalr   B T -> B 1 Tr   targetestimater   
train_losslearning_rater   lrglobal_stepdtype)rd   dictr   einops	rearrangera   losslog
_optimizerparam_groupsrR   tensorrx   r   float32)	rD   batch	batch_idxr   r   r   rX   predicted_audior   rJ   rJ   rK   training_step   s   



zBNR2.training_stepr   valdataloader_idxtagc                 C   s   t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| |d}	| j||	|d}
t| dr[|| jv r[| j| |  D ]\}}|j	|	||d	 qN| 
d
tj| jjtjd | d|
iS )Nr   r   r   r   r   r   r   metrics)predsr   r   r   r   _loss)rd   r   r   r   r   r   r   r   itemsupdater   rR   r   rx   r   r   )rD   r   r   r   r   r   r   r   rX   processed_signalr   namemetricrJ   rJ   rK   evaluation_step   s    




zBNR2.evaluation_stepc                 C   s   dS )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        NrJ   )clsrJ   rJ   rK   list_available_models  s   	zBNR2.list_available_modelsrc   )r   r   )rq   rr   rs   rt   r   r   r$   propertyr   strr   r   r   r
   ra   r   intr   classmethodr   r	   r   ru   rJ   rJ   rH   rK   rv      s    
rv   )"rt   typingr   r   r   lightning.pytorchpytorchpltrR   torch.nnr)   torch.nn.functional
functionalrN   r   	omegaconfr   ,nemo.collections.audio.models.audio_to_audior   )nemo.collections.audio.parts.utils.maxiner   r   nemo.core.classes.commonr	   r
   nemo.core.neural_typesr   r   r!   SUPPORTED_INPUT_ALIGN_MSr   LightningModuler   rv   rJ   rJ   rJ   rK   <module>   s$   m