o
    wiD                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZ G d	d
 d
eZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZ G dd deZ!dS )    N)CallableDictOptionalTuple)make_seq_mask_like)MultiHeadAttention)covariance_matrix)NeuralModule	typecheck)AudioSignal	FloatTypeLengthsType
NeuralTypeSpectrogramType)loggingc                       s   e Zd ZdZ					ddededee dee d	ee f
 fd
dZe	dd Z
e	dd Ze e dejdejfddZ  ZS )ChannelAugmental  Randomly permute and selects a subset of channels.

    Args:
        permute_channels (bool): Apply a random permutation of channels.
        num_channels_min (int): Minimum number of channels to select.
        num_channels_max (int): Max number of channels to select.
        rng: Optional, random generator.
        seed: Optional, seed for the generator.
    T   Npermute_channelsnum_channels_minnum_channels_maxrngseedc                    s   t    |d u rt|n|| _|| _|| _|| _|d ur,||kr,td| d| t	
d| jj t	
d| j t	
d| j t	
d| j d S )NzMin number of channels z/ cannot be greater than max number of channels Initialized %s withz	permute_channels: %sz	num_channels_min: %sz	num_channels_max: %s)super__init__randomRandom_rngr   r   r   
ValueErrorr   debug	__class____name__)selfr   r   r   r   r   r     q/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/audio/parts/submodules/multichannel.pyr   (   s   
zChannelAugment.__init__c                 C      dt dt iS ))Returns definitions of module input typesinputBCTr   r   r"   r$   r$   r%   input_typesA      zChannelAugment.input_typesc                 C   r&   )*Returns definitions of module output typesoutputr)   r-   r.   r$   r$   r%   output_typesH   r0   zChannelAugment.output_typesr(   returnc                 C   s   |j dks	J d|d}|| jk rtd| d| j d| jd u r&|n| j}| j| j|}tt|}| j	r@| j
| |d | }|d d |d d f S )N   z$Expecting input with shape (B, C, T)r   zNumber of input channels (z5) is smaller than the min number of output channels ())ndimsizer   RuntimeErrorr   r   randintlistranger   shuffle)r"   r(   num_channels_inr   num_channels_outchannelsr$   r$   r%   forwardO   s   

zChannelAugment.forward)Tr   NNN)r!   
__module____qualname____doc__boolintr   r   r   propertyr/   r3   r
   torchno_gradTensorrA   __classcell__r$   r$   r#   r%   r      s2    

"r   c                       sd   e Zd ZdZddedee f fddZedd Zed	d
 Z	e
 dejdejfddZ  ZS )TransformAverageConcatenatea  Apply transform-average-concatenate across channels.
    We're using a version from [2].

    Args:
        in_features: Number of input features
        out_features: Number of output features

    References:
        [1] Luo et al, End-to-end Microphone Permutation and Number Invariant Multi-channel Speech Separation, 2019
        [2] Yoshioka et al, VarArray: Array-Geometry-Agnostic Continuous Speech Separation, 2022
    Nin_featuresout_featuresc                    s   t    |d u r|}|d dkrtd| tjtjj||d ddtj | _tjtjj||d ddtj | _	t
d| jj t
d| t
d| d S )	N   r   GNumber of output features should be divisible by two, currently set to Fbiasr   	in_features:  %d	out_features: %d)r   r   r   rH   nn
SequentialLinearReLUtransform_channeltransform_averager   r   r    r!   )r"   rM   rN   r#   r$   r%   r   u   s   
z$TransformAverageConcatenate.__init__c                 C   r&   r'   r(   r*   r+   Dr,   r   r   r.   r$   r$   r%   r/      r0   z'TransformAverageConcatenate.input_typesc                 C   r&   r1   r2   r\   r^   r.   r$   r$   r%   r3      r0   z(TransformAverageConcatenate.output_typesr(   r4   c           	      C   sx   |j \}}}}|dddd}| |}tj|ddd}|dd|d}| |}tj||gdd	}|dddd}|S )

        Args:
            input: shape (B, M, in_features, T)

        Returns:
            Output tensor with shape shape (B, M, out_features, T)
        r   r5   r   rO   Tdimkeepdimrc   )shapepermuterZ   rH   meanexpandrY   cat)	r"   r(   r*   MFr,   average	transformr2   r$   r$   r%   rA      s   	

z#TransformAverageConcatenate.forwardN)r!   rB   rC   rD   rF   r   r   rG   r/   r3   r
   rH   rJ   rA   rK   r$   r$   r#   r%   rL   h   s    

 rL   c                	       sl   e Zd ZdZddedee dedef fd	d
Zedd Z	edd Z
e dejdejfddZ  ZS )TransformAttendConcatenateu  Apply transform-attend-concatenate across channels.
    The output is a concatenation of transformed channel and MHA
    over channels.

    Args:
        in_features: Number of input features
        out_features: Number of output features
        n_head: Number of heads for the MHA module
        dropout_rate: Dropout rate for the MHA module

    References:
        - Jukić et al, Flexible multichannel speech enhancement for noise-robust frontend, 2023
    N   r   rM   rN   n_headdropout_ratec                    s   t    |d u r|}|d dkrtd| tjtjj||d ddtj | _tjtjj||d ddtj | _	t
||d |d| _td| jj td| td	| td
| td| d S )NrO   r   rP   FrQ   rs   n_featrt   r   rS   rT   z	n_head:       %dz	dropout_rate: %f)r   r   r   rH   rU   rV   rW   rX   rY   transform_attendr   	attentionr   r   r    r!   )r"   rM   rN   rs   rt   r#   r$   r%   r      s"   
z#TransformAttendConcatenate.__init__c                 C   r&   r[   r^   r.   r$   r$   r%   r/      r0   z&TransformAttendConcatenate.input_typesc                 C   r&   r_   r^   r.   r$   r$   r%   r3      r0   z'TransformAttendConcatenate.output_typesr(   r4   c           	      C   s   |j \}}}}|dddd}||| ||}| |}| |}| j|||dd}tj||gdd}||||d}|dddd}|S )	r`   r   r5   r   rO   Nquerykeyvaluemaskre   rf   )	rg   rh   reshaperY   rw   rx   rH   rk   view)	r"   r(   r*   rl   rm   r,   ro   attendr2   r$   r$   r%   rA      s   	

z"TransformAttendConcatenate.forward)Nrr   r   )r!   rB   rC   rD   rF   r   floatr   rG   r/   r3   r
   rH   rJ   rA   rK   r$   r$   r#   r%   rq      s    $

 rq   c                       sT   e Zd ZdZ fddZedd Zedd Ze de	j
d	e	j
fd
dZ  ZS )ChannelAveragePoolz&Apply average pooling across channels.c                    s   t    td| jj d S )NInitialized %s)r   r   r   r   r    r!   r.   r#   r$   r%   r     s   
zChannelAveragePool.__init__c                 C   r&   r[   r^   r.   r$   r$   r%   r/     r0   zChannelAveragePool.input_typesc                 C   r&   r1   r2   r*   r]   r,   r^   r.   r$   r$   r%   r3     r0   zChannelAveragePool.output_typesr(   r4   c                 C   s   t j|ddS )
        Args:
            input: shape (B, M, F, T)

        Returns:
            Output tensor with shape shape (B, F, T)
        rf   )rH   ri   )r"   r(   r$   r$   r%   rA   &  s   	zChannelAveragePool.forward)r!   rB   rC   rD   r   rG   r/   r3   r
   rH   rJ   rA   rK   r$   r$   r#   r%   r     s    

 r   c                       sd   e Zd ZdZddededef fddZed	d
 Zedd Z	e
 dejdejfddZ  ZS )ChannelAttentionPoolu#  Use attention pooling to aggregate information across channels.
    First apply MHA across channels and then apply averaging.

    Args:
        in_features: Number of input features
        out_features: Number of output features
        n_head: Number of heads for the MHA module
        dropout_rate: Dropout rate for the MHA module

    References:
        - Wang et al, Neural speech separation using sparially distributed microphones, 2020
        - Jukić et al, Flexible multichannel speech enhancement for noise-robust frontend, 2023
    r   r   rM   rs   rt   c                    sX   t    || _t|||d| _td| jj td| td| td| d S )Nru   r   rS   z	num_heads:    %dz	dropout_rate: %d)	r   r   rM   r   rx   r   r   r    r!   )r"   rM   rs   rt   r#   r$   r%   r   A  s   
zChannelAttentionPool.__init__c                 C   r&   r[   r^   r.   r$   r$   r%   r/   K  r0   z ChannelAttentionPool.input_typesc                 C   r&   r   r^   r.   r$   r$   r%   r3   R  r0   z!ChannelAttentionPool.output_typesr(   r4   c                 C   st   |j \}}}}|dddd}||| ||}| j|||dd}||||d}|dddd}tj|dd	}|S )
r   r   r5   r   rO   Nry   re   r   )axis)rg   rh   r~   rx   r   rH   ri   )r"   r(   r*   rl   rm   r,   r2   r$   r$   r%   rA   Y  s   	zChannelAttentionPool.forward)r   r   )r!   rB   rC   rD   rF   r   r   rG   r/   r3   r
   rH   rJ   rA   rK   r$   r$   r#   r%   r   2  s    


 r   c                       s$  e Zd ZdZ										d,d	ed
edee dee dedededee dee def fddZ	e
d-dejdedejfddZdejdejfddZdejdejdejfdd Zdejdejd!ejdejfd"d#Zed$d% Zed&d' Ze dejd(ejd)ejdejfd*d+Z  ZS )."ParametricMultichannelWienerFiltera  Parametric multichannel Wiener filter, with an adjustable
    tradeoff between noise reduction and speech distortion.
    It supports automatic reference channel selection based
    on the estimated output SNR.

    Args:
        beta: Parameter of the parameteric filter, tradeoff between noise reduction
              and speech distortion (0: MVDR, 1: MWF).
        rank: Rank assumption for the speech covariance matrix.
        postfilter: Optional postfilter. If None, no postfilter is applied.
        ref_channel: Optional, reference channel. If None, it will be estimated automatically.
        ref_hard: If true, estimate a hard (one-hot) reference. If false, a soft reference.
        ref_hard_use_grad: If true, use straight-through gradient when using the hard reference
        ref_subband_weighting: If true, use subband weighting when estimating reference channel
        num_subbands: Optional, used to determine the parameter size for reference estimation
        diag_reg: Optional, diagonal regularization for the multichannel filter
        eps: Small regularization constant to avoid division by zero

    References:
        - Souden et al, On Optimal Frequency-Domain Multichannel Linear Filtering for Noise Reduction, 2010
          ?oneNTFư>:0yE>betarank
postfilterref_channelref_hardref_hard_use_gradref_subband_weightingnum_subbandsdiag_regepsc                    sT  t    || _|| _| jdkr"| jdkr"td| j d| j d|dvr.td| d|| _|	d urA|	dk rAtd	|	 d
|	| _|
dkrPtd|
 d
|
| _|| _| jdkrft	|||||
d| _
nd | _
| jd u | _td| jj td| j td| j td| j td| j td| j td| j td| j d S )Nfullr   zRank z is not compatible with beta .)NbanzPostfilter z is not supported.zDiagonal regularization z must be positive.zEpsilon max_snr)hardhard_use_gradsubband_weightingr   r   r   z	beta:        %fz	rank:        %sz	postfilter:  %sz	diag_reg:    %gz	eps:         %gz	ref_channel: %sz	is_mimo:     %s)r   r   r   r   r   r   r   r   r   ReferenceChannelEstimatorSNRref_estimatoris_mimor   r   r    r!   )r"   r   r   r   r   r   r   r   r   r   r   r#   r$   r%   r     sB   


z+ParametricMultichannelWienerFilter.__init__xrd   r4   c                 C   s.   t j| dddd}|r|dd}|S )zCalculate trace of matrix slices over the last
        two dimensions in the input tensor.

        Args:
            x: tensor, shape (..., C, C)

        Returns:
            Trace for each (C, C) matrix. shape (...)
        ra   re   dim1dim2)rH   diagonalsum	unsqueeze)r   rd   tracer$   r$   r%   r     s   z(ParametricMultichannelWienerFilter.tracepsdc                 C   sF   | j | |j | j }|t|dtj|jd |j	d  }|S )zApply diagonal regularization on psd.

        Args:
            psd: tensor, shape (..., C, C)

        Returns:
            Tensor, same shape as input.
        re   device)
r   r   realr   rH   
diag_embedr   onesrg   r   )r"   r   r   r$   r$   r%   apply_diag_reg  s   
*z1ParametricMultichannelWienerFilter.apply_diag_regr(   filterc                 C   s   |  std|j |  std|j |jdks1|d|dks1|d|dkr=td|j d|j td|	 |}|S )	a  Apply the MIMO filter on the input.

        Args:
            input: batch with C input channels, shape (B, C, F, T)
            filter: batch of C-input, M-output filters, shape (B, F, C, M)

        Returns:
            M-channel filter output, shape (B, M, F, T)
        z'Expecting complex-valued filter, found z&Expecting complex-valued input, found rr   ra   r   zFilter shape z", not compatible with input shape zbfcm,bcft->bmft)

is_complex	TypeErrordtyper7   r8   r   rg   rH   einsumconj)r"   r(   r   r2   r$   r$   r%   apply_filter  s   
2z/ParametricMultichannelWienerFilter.apply_filterpsd_nc                 C   sl   | d}td| |||}t| | }td| ||}| }||| j  }|d | }|S )a@  Apply blind analytic normalization postfilter. Note that this normalization has been
        derived for the GEV beamformer in [1]. More specifically, the BAN postfilter aims to scale GEV
        to satisfy the distortionless constraint and the final analytical expression is derived using
        an assumption on the norm of the transfer function.
        However, this may still be useful in some instances.

        Args:
            input: batch with M output channels (B, M, F, T)
            filter: batch of C-input, M-output filters, shape (B, F, C, M)
            psd_n: batch of noise PSDs, shape (B, F, C, C)

        Returns:
            Filtere input, shape (B, M, F, T)

        References:
            - Warsitz and Haeb-Umbach, Blind Acoustic Beamforming Based on Generalized Eigenvalue Decomposition, 2007
        ra   zbfcm,bfci,bfij,bfjm->bmfzbfcm,bfci,bfim->bmf).N)r8   rH   r   r   sqrtabsr   )r"   r(   r   r   
num_inputs	numeratordenominatorr   r$   r$   r%   	apply_ban  s   
z,ParametricMultichannelWienerFilter.apply_banc                 C   s$   t dt t dt t dt dS )r'   r\   r   )r(   mask_smask_n)r   r   r   r.   r$   r$   r%   r/   (     


z.ParametricMultichannelWienerFilter.input_typesc                 C   r&   r_   r^   r.   r$   r$   r%   r3   1  r0   z/ParametricMultichannelWienerFilter.output_typesr   r   c              	   C   s  |j }tjj|jjdd | }| }| }t||d}t||d}| j	dkrM| j
r3| |}tj||}| j|ddj}|| j| | j  }n$| j	dkri|| j|  }	| j
ra| |	}	tj|	|}ntd| j	 tj| jtr|d	| jf d
}n%| jdur| j|||d|j }
tj||
ddddddf  d
dd}| j||d}| jdkr| j|||d}W d   n1 sw   Y  ||S )a  Return processed signal.
        The output has either one channel (M=1) if a ref_channel is selected,
        or the same number of channels as the input (M=C) if ref_channel is None.

        Args:
            input: Input signal, complex tensor with shape (B, C, F, T)
            mask_s: Mask for the desired signal, shape (B, F, T)
            mask_n: Mask for the undesired noise, shape (B, F, T)

        Returns:
            Processed signal, shape (B, M, F, T)
        F)enabled)r   r}   r   T)rd   r   zUnexpected rank .re   NWpsd_sr   rb   )r(   r   r   )r(   r   r   )r   rH   ampautocastr   typecdoubledoubler   r   r   r   linalgsolver   r   r   r   r9   jit
isinstancer   rF   r   r   tor   r   r   r   )r"   r(   r   r   iodtyper   r   r   lampsd_snref_channel_tensorr2   r$   r$   r%   rA   8  s<   




(

4z*ParametricMultichannelWienerFilter.forward)
r   r   NNTTFNr   r   )F)r!   rB   rC   rD   r   strr   rF   rE   r   staticmethodrH   rJ   r   r   r   r   rG   r/   r3   r
   rA   rK   r$   r$   r#   r%   r   w  sX    	
@""

,r   c                       s   e Zd ZdZ					ddededed	ee d
ef
 fddZe	dd Z
e	dd Ze dejdejdejdejfddZ  ZS )r   a  Estimate a reference channel by selecting the reference
    that maximizes the output SNR. It returns one-hot encoded
    vector or a soft reference.

    A straight-through estimator is used for gradient when using
    hard reference.

    Args:
        hard: If true, use hard estimate of ref channel.
            If false, use a soft estimate across channels.
        hard_use_grad: Use straight-through estimator for
            the gradient.
        subband_weighting: If true, use subband weighting when
            adding across subband SNRs. If false, use average
            across subbands.

    References:
        Boeddeker et al, Front-End Processing for the CHiME-5 Dinner Party Scenario, 2018
    TFNr   r   r   r   r   r   c                    s   t    || _|| _|| _|| _|r|d u rtd| d|r*tj	t
|nd | _|r8tj	t
|nd | _td| jj td| j td| j td| j td| td| j d S )	NzANumber of subbands must be provided when using subband_weighting=r   r   z	hard:              %dz	hard_use_grad:     %dz	subband_weighting: %dz	num_subbands:      %sz	eps:               %e)r   r   r   r   r   r   r   rH   rU   	Parameterr   weight_sweight_nr   r   r    r!   )r"   r   r   r   r   r   r#   r$   r%   r     s   
z%ReferenceChannelEstimatorSNR.__init__c                 C   s$   t dt t dt t dt dS )r'   )r*   r]   r+   r+   r   r^   r.   r$   r$   r%   r/     r   z(ReferenceChannelEstimatorSNR.input_typesc                 C   r&   )r1   r2   )r*   r+   )r   r   r.   r$   r$   r%   r3     r0   z)ReferenceChannelEstimatorSNR.output_typesr   r   r   r4   c                 C   s0  | j r>td| || }td| || }tj|| jjddd dd}tj|| j	jddd dd}ntd| || }td| || }||| j
  }dt|| j
  }|jdd}| jr|jdd	d
\}}	t|d|	d}
| jr|
|  | }|S |
}|S |}|S )a  
        Args:
            W: Multichannel input multichannel output filter, shape (B, F, C, M), where
               C is the number of input channels and M is the number of output channels
            psd_s: Covariance for the signal, shape (B, F, C, C)
            psd_n: Covariance for the noise, shape (B, F, C, C)

        Returns:
            One-hot or soft reference channel, shape (B, M)
        z...jm,...jk,...km->...mr   rf   r   ra   z...fjm,...fjk,...fkm->...m
   re   Trb   r   )r   rH   r   r   r   r   r   softmaxr   r   r   log10r   max
zeros_likescatterr   detach)r"   r   r   r   pow_spow_nsnrref_soft_idxr   refr$   r$   r%   rA     s(   "$z$ReferenceChannelEstimatorSNR.forward)TTFNr   )r!   rB   rC   rD   rE   r   rF   r   r   rG   r/   r3   r
   rH   rJ   rA   rK   r$   r$   r#   r%   r     s0    

,r   c                       sb  e Zd ZdZd(dededee def fdd	Zed
e	e
ef fddZed
e	e
ef fddZe 	d)dejdejdeej d
ejfddZe	d*dejdededee d
ejf
ddZedejd
ejfddZ	d)dejdejdejdeej d
eej f
dd Zd!ejd"ejd
ejfd#d$Z	d+d%ejdeej deej d
ejfd&d'Z  ZS ),	WPEFilteruW  A weighted prediction error filter.
    Given input signal, and expected power of the desired signal, this
    class estimates a multiple-input multiple-output prediction filter
    and returns the filtered signal. Currently, estimation of statistics
    and processing is performed in batch mode.

    Args:
        filter_length: Length of the prediction filter in frames, per channel
        prediction_delay: Prediction delay in frames
        diag_reg: Diagonal regularization for the correlation matrix Q, applied as diag_reg * trace(Q) + eps
        eps: Small positive constant for regularization

    References:
        - Yoshioka and Nakatani, Generalization of Multi-Channel Linear Prediction
            Methods for Blind MIMO Impulse Response Shortening, 2012
        - Jukić et al, Group sparsity for MIMO speech dereverberation, 2015
    r   r   filter_lengthprediction_delayr   r   c                    sn   t    || _|| _|| _|| _td| jj	 td| j td| j td| j td| j d S )Nr   z	filter_length:    %dz	prediction_delay: %dz	diag_reg:         %gz	eps:              %g)
r   r   r   r   r   r   r   r   r    r!   )r"   r   r   r   r   r#   r$   r%   r     s   
zWPEFilter.__init__r4   c                 C   s(   t dt t dt t dt dddS )+Returns definitions of module output ports.r\   r*   Toptional)r(   powerinput_lengthr   r   r   r.   r$   r$   r%   r/     s   

zWPEFilter.input_typesc                 C   s   t dt t dt dddS )r   r\   r   Tr   )r2   output_lengthr   r.   r$   r$   r%   r3     s   
zWPEFilter.output_typesNr(   r   r   c                 C   s   t j|dd}d|| j  }| j|| j| jd}| j||||d\}}| j||d}| j||d}	||	 }
|durGt	||
dd	d
}|

|d}
|
|fS )aN  Given input and the predicted power for the desired signal, estimate
        the WPE filter and return the processed signal.

        Args:
            input: Input signal, shape (B, C, F, N)
            power: Predicted power of the desired signal, shape (B, C, F, N)
            input_length: Optional, length of valid frames in `input`. Defaults to `None`

        Returns:
            Tuple of (processed_signal, output_length). Processed signal has the same
            shape as the input signal (B, C, F, N), and the output length is the same
            as the input length.
        r   rf   r   delay)r(   weighttilde_inputr   )QR)r   r   Nre   Flengthsliketime_dim
valid_ones        )rH   ri   r   
convtensorr   r   estimate_correlationsestimate_filterr   r   masked_fill)r"   r(   r   r   r   r   r   r   Gundesired_signaldesired_signallength_maskr$   r$   r%   rA   #  s   
zWPEFilter.forwardr   r   r   n_stepsc           
      C   s   |j dkrtd|j |j\}}}}|du r|}tjj||d | df}|d|d}	|	ddddddd|ddf }	|	S )a  Create a tensor equivalent of convmtx_mc for each example in the batch.
        The input signal tensor `x` has shape (B, C, F, N).
        Convtensor returns a view of the input signal `x`.

        Note: We avoid reshaping the output to collapse channels and filter taps into
        a single dimension, e.g., (B, F, N, -1). In this way, the output is a view of the input,
        while an additional reshape would result in a contiguous array and more memory use.

        Args:
            x: input tensor, shape (B, C, F, N)
            filter_length: length of the filter, determines the shape of the convolution tensor
            delay: delay to add to the input signal `x` before constructing the convolution tensor
            n_steps: Optional, number of time steps to keep in the out. Defaults to the number of
                    time steps in the input tensor.

        Returns:
            Return a convolutional tensor with shape (B, C, F, n_steps, filter_length)
        rr   z1Expecting a 4-D input. Received input with shape Nr   r   re   )r7   r9   rg   rH   rU   
functionalpadunfold)
clsr   r   r   r  r*   r+   rm   Ntilde_Xr$   r$   r%   r   S  s   
&zWPEFilter.convtensorc           	      C   s   |j \}}}}}|ddddd}|||||| }g }t|D ]}|| tt| ||| |d | < q!|d|f S )a  Reshape and permute columns to convert the result of
        convtensor to be equal to convmtx_mc. This is used for verification
        purposes and it is not required to use the filter.

        Args:
            x: output of self.convtensor, shape (B, C, F, N, filter_length)

        Returns:
            Output has shape (B, F, N, C*filter_length) that corresponds to
            the layout of convmtx_mc.
        r   rO   r5   r   rr   .)rg   rh   r~   r<   npfliparange)	r  r   r*   r+   rm   r  r   rh   mr$   r$   r%   permute_convtensor}  s   
zWPEFilter.permute_convtensorr   r   c              
   C   s   |durt ||ddd}||d}td| |ddddddddf | }td| |dddddddf | }||fS )a[  
        Args:
            input: Input signal, shape (B, C, F, N)
            weight: Time-frequency weight, shape (B, F, N)
            tilde_input: Multi-channel convolution tensor, shape (B, C, F, N, filter_length)
            input_length: Length of each input example, shape (B)

        Returns:
            Returns a tuple of correlation matrices for each batch.

            Let `X` denote the input signal in a single subband,
            `tilde{X}` the corresponding multi-channel correlation matrix,
            and `w` the vector of weights.

            The first output is Q = tilde{X}^H * diag(w) * tilde{X}, for each (b, f).
            The matrix Q has shape (C * filter_length, C * filter_length)
            The output is returned in a tensor with shape (B, F, C, filter_length, C, filter_length).

            The second output is R = tilde{X}^H * diag(w) * X, for each (b, f).
            The matrix R has shape (C * filter_length, C)
            The output is returned in a tensor with shape (B, F, C, filter_length, C). The last
            dimension corresponds to output channels.
        Nre   Fr   r   zbjfik,bmfin->bfjkmnzbjfik,bmfi->bfjkm)r   r  rH   r   r   )r"   r(   r   r   r   r  r   r   r$   r$   r%   r     s   0.zWPEFilter.estimate_correlationsr   r   c           
      C   s   |j \}}}}}}|| jksJ d|j  d| j ||||| j || }||||| j |}| jr\| jtj|ddddj | j }|t	|
dtj|j d |jd  }tj||}	|	|||||}	|	ddd	d
d}	|	S )ah  Estimate the MIMO prediction filter as G(b,f) = Q(b,f) \ R(b,f)
        for each subband in each example in the batch (b, f).

        Args:
            Q: shape (B, F, C, filter_length, C, filter_length)
            R: shape (B, F, C, filter_length, C)

        Returns:
            Complex-valued prediction filter, shape (B, C, F, C, filter_length)
        zShape of Q z is not matching filter length ra   re   r   r   r   rr   r   rO   r5   )rg   r   r~   r   rH   r   r   r   r   r   r   r   r   r   r   rh   )
r"   r   r   r*   rm   r+   r   r   r   r  r$   r$   r%   r    s   $*zWPEFilter.estimate_filterr   c                 C   s^   |du r|du rt d|dur|durt d|du r&| j|| j| jd}td||}|S )aX  Apply a prediction filter `filter` on the input `input` as

            output(b,f) = tilde{input(b,f)} * filter(b,f)

        If available, directly use the convolution matrix `tilde_input`.

        Args:
            input: Input signal, shape (B, C, F, N)
            tilde_input: Convolution matrix for the input signal, shape (B, C, F, N, filter_length)
            filter: Prediction filter, shape (B, C, F, C, filter_length)

        Returns:
            Multi-channel signal obtained by applying the prediction filter on
            the input signal, same shape as input (B, C, F, N)
        Nz*Both inputs cannot be None simultaneously.z.Both inputs cannot be provided simultaneously.r   zbjfik,bmfjk->bmfi)r9   r   r   r   rH   r   )r"   r   r(   r   r2   r$   r$   r%   r     s   zWPEFilter.apply_filter)r   r   rp   )r   N)NN)r!   rB   rC   rD   rF   r   r   r   rG   r   r   r   r/   r3   r
   rH   rJ   rA   classmethodr   r  r   r   r  r   rK   r$   r$   r#   r%   r     sp    $/) 
/&r   )"r   typingr   r   r   r   numpyr  rH   1nemo.collections.asr.parts.preprocessing.featuresr   :nemo.collections.asr.parts.submodules.multi_head_attentionr   (nemo.collections.audio.parts.utils.audior   nemo.core.classesr	   r
   nemo.core.neural_typesr   r   r   r   r   
nemo.utilsr   r   rL   rq   r   r   r   r   r   r$   r$   r$   r%   <module>   s(   KQX!E  
s