o
    }oiƭ                     @   s  d dl mZmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZ d d	lmZmZmZmZ d d
lmZ G dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$dS )    )DictListOptionalTupleNConformerEncoder)make_seq_mask_like)!SpectrogramToMultichannelFeatures)ChannelAttentionPoolChannelAveragePool"ParametricMultichannelWienerFilterTransformAttendConcatenateTransformAverageConcatenate	WPEFilter)db2mag)NeuralModule	typecheck)	FloatTypeLengthsType
NeuralTypeSpectrogramType)loggingc                       s   e Zd ZdZ									dd	ed
edededee dee dedededef fddZ	e
deeef fddZe
deeef fddZe dejdejdeejejf fddZ  ZS )MaskEstimatorRNNa  Estimate `num_outputs` masks from the input spectrogram
    using stacked RNNs and projections.

    The module is structured as follows:
        input --> spatial features --> input projection -->
            --> stacked RNNs --> output projection for each output --> sigmoid

    Reference:
        Multi-microphone neural speech separation for far-field multi-talker
        speech recognition (https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8462081)

    Args:
        num_outputs: Number of output masks to estimate
        num_subbands: Number of subbands of the input spectrogram
        num_features: Number of features after the input projections
        num_layers: Number of RNN layers
        num_hidden_features: Number of hidden features in RNN layers
        num_input_channels: Number of input channels
        dropout: If non-zero, introduces dropout on the outputs of each RNN layer except the last layer, with dropout
                 probability equal to `dropout`. Default: 0
        bidirectional: If `True`, use bidirectional RNN.
        rnn_type: Type of RNN, either `lstm` or `gru`. Default: `lstm`
        mag_reduction: Channel-wise reduction for magnitude features
        use_ipd: Use inter-channel phase difference (IPD) features
          Nr   Tlstmrmsnum_outputsnum_subbandsnum_features
num_layersnum_hidden_featuresnum_input_channelsdropoutrnn_typemag_reductionuse_ipdc                    s   t    |d u r }t||
|d| _tjj| jj| jj  d| _	|	dkr5tjj
 ||d||d| _n|	dkrGtjj ||d||d| _ntd|	 tjj|rWd  n  d| _tj | _tj fd	d
t|D | _tj | _d S )N)r   r"   r%   r&   in_featuresout_featuresr   T)
input_sizehidden_sizer    batch_firstr#   bidirectionalgruzUnknown rnn_type:    c                    s   g | ]
}t jj d qS )r'   )torchnnLinear.0_r   r    Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/audio/modules/masking.py
<listcomp>x   s    z-MaskEstimatorRNN.__init__.<locals>.<listcomp>)super__init__r	   featuresr0   r1   r2   r   num_channelsinput_projectionLSTMrnnGRU
ValueErrorfc	LayerNormnorm
ModuleListrangeoutput_projectionsSigmoidoutput_nonlinearity)selfr   r   r   r    r!   r"   r#   r-   r$   r%   r&   	__class__r6   r8   r;   ?   sN   


	zMaskEstimatorRNN.__init__returnc                 C      t dt t dt dS +Returns definitions of module output ports.BCDTrS   inputinput_lengthr   r   r   rK   r7   r7   r8   input_types|      

zMaskEstimatorRNN.input_typesc                 C   rO   rQ   rR   rW   outputoutput_lengthr   r   r   r\   r7   r7   r8   output_types   r^   zMaskEstimatorRNN.output_typesrY   rZ   c                 C   s&  | j ||d\}}|j\}}}}|dddd}|||d}| |}tjjjj	||
 ddd	|j}| j  | |\}}tjjjj|dd
\}	}
|
|j}
| | |	| }	g }| jD ]}||	}| |}|dd}|| qctj|dd}t|
|ddd}||d}||
fS )aq  Estimate `num_outputs` masks from the input spectrogram.

        Args:
            input: C-channel input, shape (B, C, F, N)
            input_length: Length of valid entries along the time dimension, shape (B,)

        Returns:
            Returns `num_outputs` masks in a tensor, shape (B, num_outputs, F, N),
            and output length with shape (B,)
        rX   r   r      r/   TF)r,   enforce_sorted)r,   axislengthsliketime_dim
valid_ones        )r<   shapepermuteviewr>   r0   r1   utilsr@   pack_padded_sequencecputodeviceflatten_parameterspad_packed_sequencerE   rC   rH   rJ   	transposeappendstackr   masked_fill)rK   rY   rZ   r5   rS   num_feature_channelsr   Ninput_packedra   rb   masksoutput_projectionmasklength_maskr7   r7   r8   forward   s6   




zMaskEstimatorRNN.forward)	r   r   NNr   Tr   r   N)__name__
__module____qualname____doc__intr   floatstrboolr;   propertyr   r   r]   rd   r   r0   Tensorr   r   __classcell__r7   r7   rL   r8   r   $   sL    
=0r   c                &       s  e Zd ZdZ										
	
		
		
	
d*dedededededededededededeee  dee dedee de	dee dee f$ fdd Z
ed!eeef fd"d#Zed!eeef fd$d%Ze d&ejd'ejd!eejejf fd(d)Z  ZS )+MaskEstimatorFlexChannelsu  Estimate `num_outputs` masks from the input spectrogram
    using stacked channel-wise and temporal layers.

    This model is using interlaved channel blocks and temporal blocks, and
    it can process arbitrary number of input channels.
    Default channel block is the transform-average-concatenate layer.
    Default temporal block is the Conformer encoder.
    Reduction from multichannel signal to single-channel signal is performed
    after `channel_reduction_position` blocks. Only temporal blocks are used afterwards.
    After the sequence of blocks, the output mask is computed using an additional
    output temporal layer and a nonlinearity.

    References:
        - Yoshioka et al, VarArray: Array-Geometry-Agnostic Continuous Speech Separation, 2022
        - Jukić et al, Flexible multichannel speech enhancement for noise-robust frontend, 2023

    Args:
        num_outputs: Number of output masks.
        num_subbands: Number of subbands on the input spectrogram.
        num_blocks: Number of blocks in the model.
        channel_reduction_position: After this block, the signal will be reduced across channels.
        channel_reduction_type: Reduction across channels: 'average' or 'attention'
        channel_block_type: Block for channel processing: 'transform_average_concatenate' or 'transform_attend_concatenate'
        temporal_block_type: Block for temporal processing: 'conformer_encoder'
        temporal_block_num_layers: Number of layers for the temporal block
        temporal_block_num_heads: Number of heads for the temporal block
        temporal_block_dimension: The hidden size of the model
        temporal_block_self_attention_model: Self attention model for the temporal block
        temporal_block_att_context_size: Attention context size for the temporal block
        mag_reduction: Channel-wise reduction for magnitude features
        mag_power: Power to apply on magnitude features
        use_ipd: Use inter-channel phase difference (IPD) features
        mag_normalization: Normalize using mean ('mean') or mean and variance ('mean_var')
        ipd_normalization: Normalize using mean ('mean') or mean and variance ('mean_var')
    rf   	attentiontransform_attend_concatenateconformer_encoder         rel_posNabs_meanTr   r   
num_blockschannel_reduction_positionchannel_reduction_typechannel_block_typetemporal_block_typetemporal_block_num_layerstemporal_block_num_headstemporal_block_dimension#temporal_block_self_attention_modeltemporal_block_att_context_sizer"   r%   	mag_powerr&   mag_normalizationipd_normalizationc              
      s   t    t ||||||d| _|| _td| j |dkr!|}||kr/td| d| || _td| j t	j
 | _t	j
 | _t|D ]x}td| ||k r|dkr]| jjn}td	|| |d
krrt|d}n|dkr}t|d}ntd| | j| || j  krdkrn n| jjn}td| |dkrt||dd}ntd| d| j| qItd| |dkrt | _n|dkr| jdkr| jjn}t|d| _ntd| td| t	j
 fddt|D | _t	j
 | _d S )N)r   r"   r%   r   r&   r   r   zTotal number of blocks: %drf   zChannel reduction position z exceeds the number of blocks z1Channel reduction will be applied before block %dzPrepare block %dr   zDSetup channel block %s with %d input features and %d output featurestransform_average_concatenater'   r   zUnknown channel layer type: zSetup temporal block %sr   re   )feat_inn_layersd_modelsubsampling_factorself_attention_modelatt_context_sizen_headszUnknown temporal block .zSetup channel reduction %saverager   )r(   z Unknown channel reduction type: zSetup %d output layersc                    s$   g | ]}t d  d dqS )re   )r   r   r   feat_outr   r   r   r   r   r3   r   r   r   r   r   r7   r8   r9   a  s    z6MaskEstimatorFlexChannels.__init__.<locals>.<listcomp>)r:   r;   r	   r<   r   r   debugrB   r   r0   r1   rF   channel_blockstemporal_blocksrG   r   r   r   r{   r   r   channel_reductionr
   output_layersrI   rJ   )rK   r   r   r   r   r   r   r   r   r   r   r   r   r"   r%   r   r&   r   r   nchannel_in_featureschannel_blocktemporal_in_featurestemporal_blockchannel_reduction_in_featuresrL   r   r8   r;      s   
	$

z"MaskEstimatorFlexChannels.__init__rN   c                 C   rO   rP   r[   r\   r7   r7   r8   r]   s  r^   z%MaskEstimatorFlexChannels.input_typesc                 C   rO   r_   rc   r\   r7   r7   r8   rd   {  r^   z&MaskEstimatorFlexChannels.output_typesrY   rZ   c              	   C   s  | j ||d\}}|d|d}}t| jD ]r}|| jk rD| j| |d}|d|d}}	|d||	}|dkrC||}n|| jkrO| j|d}t	
  | j| ||d\}}W d   n1 siw   Y  || jk r|d}	|||d|	}|dkr|dd| }q| j| jkr| j|d}g }
| jD ](}t	
  |||d\}}W d   n1 sw   Y  | |}|
| qtj|
dd	}
|
|fS )
z8Estimate `num_outputs` masks from the input spectrogram.rX   r   re   )rY   rf   )audio_signallengthNdim)r<   sizerG   r   r   r   reshaperepeat_interleaver   r   disable_checksr   r   rJ   r{   r0   r|   )rK   rY   rZ   ra   rb   rS   Mr   FrV   r   output_layerr   mask_lengthr7   r7   r8   r     s@   








z!MaskEstimatorFlexChannels.forward)rf   r   r   r   r   r   r   r   NNr   NTNN)r   r   r   r   r   r   r   r   r   r   r;   r   r   r   r]   rd   r   r0   r   r   r   r   r7   r7   rL   r8   r      sz    )	

 0r   c                       s  e Zd ZdZddejfdededejf fddZ	d)d
ej
dedej
fddZeedededddediddej
dej
dej
dej
fddZededidediddej
dej
fddZeedededdededdddej
dej
dej
deej
ej
f fd d!Zedeeef fd"d#Zedeeef fd$d%Ze d&ej
dej
dej
fd'd(Z  ZS )*MaskEstimatorGSSaw  Estimate masks using guided source separation with a complex
    angular Central Gaussian Mixture Model (cACGMM) [1].

    This module corresponds to `GSS` in Fig. 2 in [2].

    Notation is approximately following [1], where `gamma` denotes
    the time-frequency mask, `alpha` denotes the mixture weights,
    and `BM` denotes the shape matrix. Additionally, the provided
    source activity is denoted as `activity`.

    Args:
        num_iterations: Number of iterations for the EM algorithm
        eps: Small value for regularization
        dtype: Data type for internal computations (default `torch.cdouble`)

    References:
        [1] Ito et al., Complex Angular Central Gaussian Mixture Model for Directional Statistics in Mask-Based Microphone Array Signal Processing, 2016
        [2] Boeddeker et al., Front-End Processing for the CHiME-5 Dinner Party Scenario, 2018
    r   :0yE>num_iterationsepsdtypec                    s   t    |dkrtd| || _|dkrtd| || _|tjtjfvr1td| d|| _t	
d| jj t	
d| j t	
d| j t	
d	| j d S )
Nr   z+Number of iterations must be positive, got zeps must be positive, got Unsupported dtype z, expecting cfloat or cdoubleInitialized %s	num_iterations: %sz	eps:            %g	dtype:          %s)r:   r;   rB   r   r   r0   cfloatcdoubler   r   r   rM   r   )rK   r   r   r   rL   r7   r8   r;     s   
zMaskEstimatorGSS.__init__re   xr   rN   c                 C   s&   t jj|d|dd}||| j  }|S )a\  Normalize input to have a unit L2-norm across `dim`.
        By default, normalizes across the input channels.

        Args:
            x: C-channel input signal, shape (B, C, F, T)
            dim: Dimension for normalization, defaults to -3 to normalize over channels

        Returns:
            Normalized signal, shape (B, C, F, T)
        r/   T)ordr   keepdim)r0   linalgvector_normr   )rK   r   r   norm_xr7   r7   r8   	normalize  s   zMaskEstimatorGSS.normalize)rS   rT   rU   rS   rT   rV   rR   alphaactivitylog_pdfgamma)r]   rd   r   r   r   c                 C   s^   |t j|dddd  }t |}|d | |ddddf  }|t j|ddd| j  }|S )	at  Update masks for the cACGMM.

        Args:
            alpha: component weights, shape (B, num_outputs, F)
            activity: temporal activity for the components, shape (B, num_outputs, T)
            log_pdf: logarithm of the PDF, shape (B, num_outputs, F, T)

        Returns:
            Masks for the components of the model, shape (B, num_outputs, F, T)
        Tri   r   r   .N.Nr   r   )r0   maxexpsumr   )rK   r   r   r   	log_gammar   r7   r7   r8   update_masks   s
   
zMaskEstimatorGSS.update_masksc                 C   s   t j|dd}|S )zUpdate weights for the individual components
        in the mixture model.

        Args:
            gamma: masks, shape (B, num_outputs, F, T)

        Returns:
            Component weights, shape (B, num_outputs, F)
        rf   r   )r0   mean)rK   r   r   r7   r7   r8   update_weights#  s   zMaskEstimatorGSS.update_weightszr   
zH_invBM_z)r   r   r   r   c                 C   s0  | d}||| j  }|td||j||  }tj|dd}||d | j  }|| dd d }tj	
|\}}	tj|j| jd}|tj|dd	d
d | j  }|| j }tjt|dd}
tdd|  |	j|	 |}| dd}|| j }| t| |
d  }||fS )a  Update PDF of the cACGMM.

        Args:
            z: directional statistics, shape (B, num_inputs, F, T)
            gamma: masks, shape (B, num_outputs, F, T)
            zH_invBM_z: energy weighted by shape matrices, shape (B, num_outputs, F, T)

        Returns:
            Logarithm of the PDF, shape (B, num_outputs, F, T), the energy term, shape (B, num_outputs, F, T)
        r   zbmft,bift,bjft->bmfijrf   r   ).NNr   r/   minTr   r   zbmfj,bmfkj,bkft->bmftjre   r   )r   r   r0   einsumrv   r   conjr   rz   r   eighclamprealr   logsqrtabspow)rK   r   r   r   
num_inputsscaleBMdenomLQ	log_detBMr   r7   r7   r8   
update_pdf8  s    
 
$
zMaskEstimatorGSS.update_pdfc                 C   s   t dt t ddS )rQ   rR   r   )rY   r   )r   r   r\   r7   r7   r8   r]     s   
zMaskEstimatorGSS.input_typesc                 C   s   dt diS )rQ   r   rR   )r   r\   r7   r7   r8   rd     s   zMaskEstimatorGSS.output_typesrY   c              	   C   s  |j \}}}}|d}|jj}|d|kr#td|j  d|j  |d|kr6td|j  d|j  |dkrAtd| tjj|dd	l |j| j	d
}|
 s]J d|j	 | j|dd}	tj|| jd}
|
tj|
ddd }
|
ddd|d}
tj|||||j	|jd}t| jD ]}| j|
d}| j|	|
|d\}}| j|||d}
qW d   n1 sw   Y  tt|
rtd|
 |
S )as  Apply GSS to estimate the time-frequency masks for each output source.

        Args:
            input: batched C-channel input signal, shape (B, num_inputs, F, T)
            activity: batched frame-wise activity for each output source, shape (B, num_outputs, T)

        Returns:
            Masks for the components of the model, shape (B, num_outputs, F, T)
        re   r   z#Batch dimension mismatch: activity z
 vs input rf   z"Time dimension mismatch: activity z Expecting multiple outputs, got Fenabledr   Expecting complex input, got r   r   r   r   Tr   r/   )r   rw   )r   r   r   Nzgamma contains NaNs: )rp   r   rw   typerB   r0   ampautocastrv   r   
is_complexr   r   r   r   	unsqueezeexpandonesrG   r   r   r  r   anyisnanRuntimeError)rK   rY   r   rS   r   r   rV   r   rw   r   r   r   itr   r   r7   r7   r8   r     s4   
zMaskEstimatorGSS.forward)re   )r   r   r   r   r0   r   r   r   r   r;   r   r   r   r   r   r   r   r  r   r   r   r]   rd   r   r   r7   r7   rL   r8   r     sX    &$
@&r   c                       s   e Zd ZdZddededef fddZed	ee	e
f fd
dZed	ee	e
f fddZe dejdejdejd	eejejf fddZ  ZS )MaskReferenceChannelaP  A simple mask processor which applies mask
    on ref_channel of the input signal.

    Args:
        ref_channel: Index of the reference channel.
        mask_min_db: Threshold mask to a minimal value before applying it, defaults to -200dB
        mask_max_db: Threshold mask to a maximal value before applying it, defaults to 0dB
    r   8ref_channelmask_min_dbmask_max_dbc                    sb   t    || _t|| _t|| _td| jj	 td| j td| j td| j d S )NzInitialized %s withz	ref_channel: %dz	mask_min:    %fz	mask_max:    %f)
r:   r;   r  r   mask_minmask_maxr   r   rM   r   )rK   r  r  r  rL   r7   r8   r;     s   


zMaskReferenceChannel.__init__rN   c                 C   s$   t dt t dt t dt dS )rQ   rR   rW   rY   rZ   r   r   r   r   r   r\   r7   r7   r8   r]     s   


z MaskReferenceChannel.input_typesc                 C   rO   r_   r[   r\   r7   r7   r8   rd     r^   z!MaskReferenceChannel.output_typesrY   rZ   r   c                 C   s>   t j|| j| jd}||dd| j| jd df  }||fS )a  Apply mask on `ref_channel` of the input signal.
        This can be used to generate multi-channel output.
        If `mask` has `M` channels, the output will have `M` channels as well.

        Args:
            input: Input signal complex-valued spectrogram, shape (B, C, F, N)
            input_length: Length of valid entries along the time dimension, shape (B,)
            mask: Mask for M outputs, shape (B, M, F, N)

        Returns:
            M-channel output complex-valed spectrogram with shape (B, M, F, N)
        r   r   Nre   .)r0   r   r  r  r  )rK   rY   rZ   r   ra   r7   r7   r8   r     s   "zMaskReferenceChannel.forward)r   r  r   )r   r   r   r   r   r   r;   r   r   r   r   r]   rd   r   r0   r   r   r   r   r7   r7   rL   r8   r    s"    	r  c                       s   e Zd ZdZ															
	d(dedededee dee dedededee dededededee def fddZ	e
deeef fddZe
deeef fd d!Ze 		d)d"ejd#ejd$eej d%eej dejf
d&d'Z  ZS )*MaskBasedBeamformera  Multi-channel processor using masks to estimate signal statistics.

    Args:
        filter_type: string denoting the type of the filter. Defaults to `mvdr`
        filter_beta: Parameter of the parameteric multichannel Wiener filter
        filter_rank: Parameter of the parametric multichannel Wiener filter
        filter_postfilter: Optional, postprocessing of the filter
        ref_channel: Optional, reference channel. If None, it will be estimated automatically
        ref_hard: If true, hard (one-hot) reference. If false, a soft reference
        ref_hard_use_grad: If true, use straight-through gradient when using the hard reference
        ref_subband_weighting: If true, use subband weighting when estimating reference channel
        num_subbands: Optional, used to determine the parameter size for reference estimation
        mask_min_db: Threshold mask to a minimal value before applying it, defaults to -200dB
        mask_max_db: Threshold mask to a maximal value before applying it, defaults to 0dB
        diag_reg: Optional, diagonal regularization for the multichannel filter
        eps: Small regularization constant to avoid division by zero
    mvdr_soudenro   oneNr   TFr  ư>r   filter_typefilter_betafilter_rankfilter_postfilterr  ref_hardref_hard_use_gradref_subband_weightingr   r  r  postmask_min_dbpostmask_max_dbdiag_regr   c                    s.  t    |dvrtd| || _| jdkr)|dkr)td| j|| d}d}t||||||||	||d
| _|
|krGtd	|
 d
| dt|
| _	t|| _
||kr`td| d| dt|| _t|| _td| jj td| j td| j	 td| j
 td| j td| j d S )N)pmwfr  zUnknown filter type r  r   zqUsing filter type %s: beta will be automatically set to zero (current beta %f) and rank to one (current rank %s).ro   r  )
betarank
postfilterr  r%  r&  r'  r   r*  r   zLower bound for the mask z(dB must be smaller than the upper bound dBzLower bound for the postmask z/dB must be smaller or equal to the upper bound r   z	filter_type:  %sz	mask_min:     %ez	mask_max:     %ez	postmask_min: %ez	postmask_max: %e)r:   r;   rB   r!  r   warningr   filterr   r  r  postmask_minpostmask_maxr   rM   r   )rK   r!  r"  r#  r$  r  r%  r&  r'  r   r  r  r(  r)  r*  r   rL   r7   r8   r;     sV   




zMaskBasedBeamformer.__init__rN   c                 C   s6   t dt t dt t dt ddt dt dddS )rQ   rR   ToptionalrW   )rY   r   mask_undesiredrZ   )r   r   r   r   r\   r7   r7   r8   r]   `  s
   

zMaskBasedBeamformer.input_typesc                 C      t dt t dt dddS rQ   rR   rW   Tr4  r`   r[   r\   r7   r7   r8   rd   j     
z MaskBasedBeamformer.output_typesrY   r   r6  rZ   c                 C   sl  |durt ||ddddf ddd}g |d}}t|D ]z}|dd|df }	|dur8|dd|df }
n|dkrAd|	 }
n	tj|dd|	 }
tj|	| j| jd	}	tj|
| j| jd	}
|durn|	|d
}	|
|d
}
| j	||	|
d}| j
| jk rtj|dd|df | j
| jd	}||d }|| qtj|dd}|dur||ddddf d
}||fS )a  Apply a mask-based beamformer to the input spectrogram.
        This can be used to generate multi-channel output.
        If `mask` has multiple channels, a multichannel filter is created for each mask,
        and the output is concatenation of individual outputs along the channel dimension.
        The total number of outputs is `num_masks * M`, where `M` is the number of channels
        at the filter output.

        Args:
            input: Input signal complex-valued spectrogram, shape (B, C, F, N)
            mask: Mask for M output signals, shape (B, num_masks, F, N)
            input_length: Length of valid entries along the time dimension, shape (B,)

        Returns:
            Multichannel output signal complex-valued spectrogram, shape (B, num_masks * M, F, N)
        Nr   .rf   Frj   re   r   r  ro   )rY   mask_smask_nrh   )r   r   rG   r0   r   r   r  r  r}   r1  r2  r3  r  r{   concatenate)rK   rY   r   r6  rZ   r   ra   	num_masksmmask_dmask_uoutput_m
postmask_mr7   r7   r8   r   r  s4   
"zMaskBasedBeamformer.forward)r  ro   r  Nr   TFFNr  r   r   r   r   r   NN)r   r   r   r   r   r   r   r   r   r;   r   r   r   r]   rd   r   r0   r   r   r   r7   r7   rL   r8   r  
  s    	
C	r  c                       s   e Zd ZdZdddddejfdeded	ed
ededee dedej	f fddZ
edeeef fddZedeeef fddZe 	ddejdeej deej dejfddZ  ZS )MaskBasedDereverbWPEax  Multi-channel linear prediction-based dereverberation using
    weighted prediction error for filter estimation.

    An optional mask to estimate the signal power can be provided.
    If a time-frequency mask is not provided, the algorithm corresponds
    to the conventional WPE algorithm.

    Args:
        filter_length: Length of the convolutional filter for each channel in frames.
        prediction_delay: Delay of the input signal for multi-channel linear prediction in frames.
        num_iterations: Number of iterations for reweighting
        mask_min_db: Threshold mask to a minimal value before applying it, defaults to -200dB
        mask_max_db: Threshold mask to a minimal value before applying it, defaults to 0dB
        diag_reg: Diagonal regularization for WPE
        eps: Small regularization constant
        dtype: Data type for internal computations

    References:
        - Kinoshita et al, Neural network-based spectrum estimation for online WPE dereverberation, 2017
        - Yoshioka and Nakatani, Generalization of Multi-Channel Linear Prediction Methods for Blind MIMO Impulse Response Shortening, 2012
    re   r  r   r   r   filter_lengthprediction_delayr   r  r  r*  r   r   c	           	         s   t    t||||d| _|| _t|| _t|| _|tj	tj
fvr+td| d|| _td| jj td| j td| j td| j td| j d S )	N)rE  rF  r*  r   r   z), expecting torch.cfloat or torch.cdoubler   r   z	mask_min:       %gz	mask_max:       %gr   )r:   r;   r   r1  r   r   r  r  r0   r   r   rB   r   r   r   rM   r   )	rK   rE  rF  r   r  r  r*  r   r   rL   r7   r8   r;     s   


zMaskBasedDereverbWPE.__init__rN   c                 C   s,   t dt t dt ddt dt dddS )rQ   rR   rW   Tr4  r  r  r\   r7   r7   r8   r]     s   
z MaskBasedDereverbWPE.input_typesc                 C   r7  r8  r[   r\   r7   r7   r8   rd     r9  z!MaskBasedDereverbWPE.output_typesNrY   rZ   r   c                 C   s   |j }|jj}tjj|ddL |j| j d}| s#td|j  t	| j
D ]+}t|}|dkrE|durEtj|| j| jd}|| }|d }	| j|||	d	\}}
q(W d   n1 s^w   Y  |||
fS )
a  Given an input signal `input`, apply the WPE dereverberation algoritm.

        Args:
            input: C-channel complex-valued spectrogram, shape (B, C, F, T)
            input_length: Optional length for each signal in the batch, shape (B,)
            mask: Optional mask, shape (B, 1, F, N) or (B, C, F, T)

        Returns:
            Processed tensor with the same number of channels as the input,
            shape (B, C, F, T).
        Fr  r  r  r   Nr  r/   )rY   rZ   power)r   rw   r  r0   r	  r
  rv   r  r  rG   r   r   r   r  r  r1  )rK   rY   rZ   r   io_dtyperw   ra   i	magnituderG  rb   r7   r7   r8   r     s    
zMaskBasedDereverbWPE.forwardrC  )r   r   r   r   r0   r   r   r   r   r   r;   r   r   r   r   r]   rd   r   r   r   r   r7   r7   rL   r8   rD    sP    	rD  )%typingr   r   r   r   r0   .nemo.collections.asr.modules.conformer_encoderr   1nemo.collections.asr.parts.preprocessing.featuresr   'nemo.collections.audio.modules.featuresr	   4nemo.collections.audio.parts.submodules.multichannelr
   r   r   r   r   r   (nemo.collections.audio.parts.utils.audior   nemo.core.classesr   r   nemo.core.neural_typesr   r   r   r   
nemo.utilsr   r   r   r   r  r  rD  r7   r7   r7   r8   <module>   s*     ' z  B 3