o
    sif                     @   s   d dl Z d dl mZ d dlmZmZ d dlZddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ G dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd deZdS )    N)nn)foldunfold   )
complex_nn)has_arg   )activationsnorms)DCCRN_ARCHITECTURES)BaseDCUMaskNet)CumLNGlobLNc                       8   e Zd ZdZ	d fdd	Zedd Zd	d
 Z  ZS )	SingleRNNa  Module for a RNN block.

    Inspired from https://github.com/yluo42/TAC/blob/master/utility/models.py
    Licensed under CC BY-NC-SA 3.0 US.

    Args:
        rnn_type (str): Select from ``'RNN'``, ``'LSTM'``, ``'GRU'``. Can
            also be passed in lowercase letters.
        input_size (int): Dimension of the input feature. The input should have
            shape [batch, seq_len, input_size].
        hidden_size (int): Dimension of the hidden state.
        n_layers (int, optional): Number of layers used in RNN. Default is 1.
        dropout (float, optional): Dropout ratio. Default is 0.
        bidirectional (bool, optional): Whether the RNN layers are
            bidirectional. Default is ``False``.
    r   r   Fc                    sn   t t|   | dv sJ | }|| _|| _|| _|| _|| _|| _	t
t|||||dt|d| _d S N)RNNLSTMGRUT)
num_layersdropoutbatch_firstbidirectional)superr   __init__upperrnn_type
input_sizehidden_sizen_layersr   r   getattrr   boolrnnselfr   r   r   r   r   r   	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/asteroid/masknn/recurrent.pyr       s"   zSingleRNN.__init__c                 C   s   | j | jrd S d S Nr   r   )r   r   r$   r'   r'   r(   output_size5   s   zSingleRNN.output_sizec                 C   s    | j   |}|  |\}}|S )Input shape [batch, seq, feats])r"   flatten_parameters)r$   inpoutput
rnn_output_r'   r'   r(   forward9   s   
zSingleRNN.forwardr   r   F	__name__
__module____qualname____doc__r   propertyr+   r2   __classcell__r'   r'   r%   r(   r      s    
r   c                       r   )	MulCatRNNa0  MulCat RNN block from [1].

    Composed of two RNNs, returns ``cat([RNN_1(x) * RNN_2(x), x])``.

    Args:
        rnn_type (str): Select from ``'RNN'``, ``'LSTM'``, ``'GRU'``. Can
            also be passed in lowercase letters.
        input_size (int): Dimension of the input feature. The input should have
            shape [batch, seq_len, input_size].
        hidden_size (int): Dimension of the hidden state.
        n_layers (int, optional): Number of layers used in RNN. Default is 1.
        dropout (float, optional): Dropout ratio. Default is 0.
        bidirectional (bool, optional): Whether the RNN layers are
            bidirectional. Default is ``False``.
    References
        [1] Eliya Nachmani, Yossi Adi, & Lior Wolf. (2020). Voice Separation with an Unknown Number of Multiple Speakers.
    r   r   Fc                    s   t t|   | dv sJ | }|| _|| _|| _|| _|| _|| _	t
t|||||dt|d| _t
t|||||dt|d| _d S r   )r   r;   r   r   r   r   r   r   r   r   r    r   r!   rnn1rnn2r#   r%   r'   r(   r   T   s2   zMulCatRNN.__init__c                 C   s   | j | jrdnd | j S r)   )r   r   r   r*   r'   r'   r(   r+   q   s   zMulCatRNN.output_sizec                 C   sD   | j   | j  |  |\}}| |\}}t|| |fdS )r,   r   )r<   r-   r=   torchcat)r$   r.   rnn_output1r1   rnn_output2r'   r'   r(   r2   u   s
   

zMulCatRNN.forwardr3   r4   r'   r'   r%   r(   r;   A   s    
r;   c                       *   e Zd ZdZd	 fdd	Zdd Z  ZS )
StackedResidualRNNal  Stacked RNN with builtin residual connection.
    Only supports forward RNNs.
    See StackedResidualBiRNN for bidirectional ones.

    Args:
        rnn_type (str): Select from ``'RNN'``, ``'LSTM'``, ``'GRU'``. Can
            also be passed in lowercase letters.
        n_units (int): Number of units in recurrent layers. This will also be
            the expected input size.
        n_layers (int): Number of recurrent layers.
        dropout (float): Dropout value, between 0. and 1. (Default: 0.)
        bidirectional (bool): If True, use bidirectional RNN, else
            unidirectional. (Default: False)
               Fc              	      s~   t t|   || _|| _|| _|| _|du sJ d|| _t	 | _
t|D ]}| j
t||||d q't| j| _d S )NFzBidirectional not supported yetr   r   r   )r   rC   r   r   n_unitsr   r   r   r   
ModuleListlayersrangeappendr   Dropoutdropout_layer)r$   r   rG   r   r   r   r1   r%   r'   r(   r      s   
zStackedResidualRNN.__init__c                 C   s*   | j D ]}||}| |}|| }q|S )|Builtin residual connections + dropout applied before residual.
        Input shape : [batch, time_axis, feat_axis]
        )rI   rM   )r$   xr"   rnn_outdropped_outr'   r'   r(   r2      s
   


zStackedResidualRNN.forward)rD   rE   Fr5   r6   r7   r8   r   r2   r:   r'   r'   r%   r(   rC   ~   s    rC   c                       rB   )
StackedResidualBiRNNa  Stacked Bidirectional RNN with builtin residual connection.
    Residual connections are applied on both RNN directions.
    Only supports bidiriectional RNNs.
    See StackedResidualRNN for unidirectional ones.

    Args:
        rnn_type (str): Select from ``'RNN'``, ``'LSTM'``, ``'GRU'``. Can
            also be passed in lowercase letters.
        n_units (int): Number of units in recurrent layers. This will also be
            the expected input size.
        n_layers (int): Number of recurrent layers.
        dropout (float): Dropout value, between 0. and 1. (Default: 0.)
        bidirectional (bool): If True, use bidirectional RNN, else
            unidirectional. (Default: False)
    rD   rE   Tc              	      s   t    || _|| _|| _|| _|du sJ d|| _t||||d| _t	
 | _t|d D ]}d| }| jt||||d q0t	| j| _d S )NTz$Only bidirectional not supported yetrF   r   r   )r   r   r   rG   r   r   r   r   first_layerr   rH   rI   rJ   rK   rL   rM   )r$   r   rG   r   r   r   ir   r%   r'   r(   r      s,   

zStackedResidualBiRNN.__init__c                 C   sT   |  |}| |}tj||gdd| }| jD ]}||}| |}|| }q|S )rN   dim)rT   rM   r>   r?   rI   )r$   rO   rP   rQ   r"   r'   r'   r(   r2      s   




zStackedResidualBiRNN.forward)rD   rE   TrR   r'   r'   r%   r(   rS      s    rS   c                       s6   e Zd ZdZ						d fdd		Zd
d Z  ZS )
DPRNNBlockat  Dual-Path RNN Block as proposed in [1].

    Args:
        in_chan (int): Number of input channels.
        hid_size (int): Number of hidden neurons in the RNNs.
        norm_type (str, optional): Type of normalization to use. To choose from
            - ``'gLN'``: global Layernorm
            - ``'cLN'``: channelwise Layernorm
        bidirectional (bool, optional): True for bidirectional Inter-Chunk RNN.
        rnn_type (str, optional): Type of RNN used. Choose from ``'RNN'``,
            ``'LSTM'`` and ``'GRU'``.
        num_layers (int, optional): Number of layers used in each RNN.
        dropout (float, optional): Dropout ratio. Must be in [0, 1].

    References
        [1] "Dual-path RNN: efficient long sequence modeling for
        time-domain single-channel speech separation", Yi Luo, Zhuo Chen
        and Takuya Yoshioka. https://arxiv.org/abs/1910.06379
    gLNTr   Fr   r   c	           	         s   t t|   |r t|||||dd| _t||||||d| _nt|||||dd| _t||||||d| _t| jj	|| _
t||| _t| jj	|| _t||| _d S )NT)r   r   )r   rY   r   r;   	intra_RNN	inter_RNNr   r   Linearr+   intra_linearr
   get
intra_norminter_linear
inter_norm)	r$   in_chanhid_size	norm_typer   r   
use_mulcatr   r   r%   r'   r(   r      sL   	
	zDPRNNBlock.__init__c                 C   s   |  \}}}}|}|dd|| ||}| |}| |}|||||dd}| |}|| }|dddd|| ||}| |}| |}|||||dddd }| 	|}|| S )z4Input shape : [batch, feats, chunk_size, num_chunks]r   rV   r   )
size	transposereshaper[   r^   r`   r\   ra   
contiguousrb   )r$   rO   BNKLr/   r'   r'   r(   r2   3  s   


"

$
zDPRNNBlock.forward)rZ   Tr   Fr   r   rR   r'   r'   r%   r(   rY      s    6rY   c                       sL   e Zd ZdZ												
		d fdd	Zdd Zdd Z  ZS )DPRNNa`  Dual-path RNN Network for Single-Channel Source Separation
        introduced in [1].

    Args:
        in_chan (int): Number of input filters.
        n_src (int): Number of masks to estimate.
        out_chan  (int or None): Number of bins in the estimated masks.
            Defaults to `in_chan`.
        bn_chan (int): Number of channels after the bottleneck.
            Defaults to 128.
        hid_size (int): Number of neurons in the RNNs cell state.
            Defaults to 128.
        chunk_size (int): window size of overlap and add processing.
            Defaults to 100.
        hop_size (int or None): hop size (stride) of overlap and add processing.
            Default to `chunk_size // 2` (50% overlap).
        n_repeats (int): Number of repeats. Defaults to 6.
        norm_type (str, optional): Type of normalization to use. To choose from

            - ``'gLN'``: global Layernorm
            - ``'cLN'``: channelwise Layernorm
        mask_act (str, optional): Which non-linear function to generate mask.
        bidirectional (bool, optional): True for bidirectional Inter-Chunk RNN
            (Intra-Chunk is always bidirectional).
        rnn_type (str, optional): Type of RNN used. Choose between ``'RNN'``,
            ``'LSTM'`` and ``'GRU'``.
        num_layers (int, optional): Number of layers in each RNN.
        dropout (float, optional): Dropout ratio, must be in [0,1].

    References
        [1] "Dual-path RNN: efficient long sequence modeling for
        time-domain single-channel speech separation", Yi Luo, Zhuo Chen
        and Takuya Yoshioka. https://arxiv.org/abs/1910.06379
    N   d      rZ   reluTr   Fr   r   c                    s  t t|   || _|d ur|n|}|| _|| _|| _|| _|d ur$|n|d }|| _|| _	|| _
|	| _|
| _|| _|| _|| _|| _|| _t|	|}t||d}t||| _g }t| j	D ]}|t|||	|||||dg7 }qbtj| | _t||| d}tt || _tt||dt | _tt||dt  | _!tj||ddd| _"t#|
}t$|dr|dd| _%d S | | _%d S )Nr   r   )re   r   r   rf   r   r   F)biasrX   rW   )&r   ro   r   rc   out_chanbn_chanrd   
chunk_sizehop_size	n_repeatsn_srcre   mask_actr   r   r   r   rf   r
   r_   r   Conv1d
Sequential
bottleneckrJ   rY   netConv2dPReLU	first_outTanhnet_outSigmoidnet_gatemask_netr	   r   
output_act)r$   rc   rz   ru   rv   rd   rw   rx   ry   re   r{   r   r   rf   r   r   
layer_normbottleneck_convr   rO   net_out_convmask_nl_classr%   r'   r(   r   k  sZ   

zDPRNN.__init__c           
      C   s&  |  \}}}| |}t|d| jdf| jdf| jdfd}|jd }||| j| j|}| 	|}| 
|}||| j | j| j|}| j| j }t||| j |||df| jdf| jdf| jdfd}||| j | jd}| || | }| |}| |}	|	|| j| j|}	|	S )zForward.

        Args:
            mixture_w (:class:`torch.Tensor`): Tensor of shape $(batch, nfilters, nframes)$

        Returns:
            :class:`torch.Tensor`: estimated mask of shape $(batch, nsrc, nfilters, nframes)$
        rV   r   r   )kernel_sizepaddingstride)rg   r~   r   	unsqueezerw   rx   shaperi   rv   r   r   rz   r   r   r   r   r   viewru   )
r$   	mixture_wbatch	n_filtersn_framesr/   n_chunks	to_unfoldscoreest_maskr'   r'   r(   r2     s6   	





zDPRNN.forwardc                 C   sF   | j | j| j| j| j| j| j| j| j| j	| j
| j| j| j| jd}|S )Nrc   ru   rv   rd   rw   rx   ry   rz   re   r{   r   r   r   r   rf   r   r$   configr'   r'   r(   
get_config  s"   zDPRNN.get_config)Nrp   rp   rq   Nrr   rZ   rs   Tr   Fr   r   r5   r6   r7   r8   r   r2   r   r:   r'   r'   r%   r(   ro   G  s$    'I+ro   c                       s@   e Zd ZdZ							d fd	d
	Zdd Zdd Z  ZS )
LSTMMaskera  LSTM mask network introduced in [1], without skip connections.

    Args:
        in_chan (int): Number of input filters.
        n_src (int): Number of masks to estimate.
        out_chan  (int or None): Number of bins in the estimated masks.
            Defaults to `in_chan`.
        rnn_type (str, optional): Type of RNN used. Choose between ``'RNN'``,
            ``'LSTM'`` and ``'GRU'``.
        n_layers (int, optional): Number of layers in each RNN.
        hid_size (int): Number of neurons in the RNNs cell state.
        mask_act (str, optional): Which non-linear function to generate mask.
        bidirectional (bool, optional): Whether to use BiLSTM
        dropout (float, optional): Dropout ratio, must be in [0,1].

    References
        [1]: Yi Luo et al. "Real-time Single-channel Dereverberation and Separation
        with Time-domain Audio Separation Network", Interspeech 2018
    NlstmrD      333333?sigmoidTc
              
      s   t    || _|| _|d ur|n|}|| _|| _|| _|| _|| _|| _	|	| _
t|}
t|
dr9|
dd| _n|
 | _|t|	d  }|	rMt|| _nt|| _ttd||||	|dt|| j| | j| _d S )NrX   r   rW   r   )r   r   r   r   )r   r   rc   rz   ru   r   r   rd   r   r{   r   r	   r_   r   r   intr   bn_layerr   r   r}   r   r]   masker)r$   rc   rz   ru   r   r   rd   r   r{   r   r   out_sizer%   r'   r(   r   	  s>   




zLSTMMasker.__init__c                 C   sF   |j d }| |}| |dddd}||| j| jd}|S )Nr   rV   )r   r   r   rh   r   rz   ru   )r$   rO   
batch_sizeto_sep	est_masksr'   r'   r(   r2   <  s
   

zLSTMMasker.forwardc              
   C   s.   | j | j| j| j| j| j| j| j| jd	}|S )N	rc   rz   ru   r   r   rd   r   r{   r   r   r   r'   r'   r(   r   C  s   zLSTMMasker.get_config)Nr   rD   r   r   r   Tr   r'   r'   r%   r(   r     s    3r   c                       s4   e Zd ZdZ	d fdd	Zdejfd	d
Z  ZS )DCCRMaskNetRNNa  RNN (LSTM) layer between encoders and decoders introduced in [1].

    Args:
        in_size (int): Number of inputs to the RNN. Must be the product of non-batch,
            non-time dimensions of output shape of last encoder, i.e. if the last
            encoder output shape is $(batch, nchans, nfreqs, time)$, `in_size` must be
            $nchans * nfreqs$.
        hid_size (int, optional): Number of units in RNN.
        rnn_type (str, optional): Type of RNN to use. See ``SingleRNN`` for valid values.
        n_layers (int, optional): Number of layers used in RNN.
        norm_type (Optional[str], optional): Norm to use after linear.
            See ``asteroid.masknn.norms`` for valid values. (Not used in [1]).
        rnn_kwargs (optional): Passed to :func:`~.recurrent.SingleRNN`.

    References
        [1] : "DCCRN: Deep Complex Convolution Recurrent Network for Phase-Aware Speech Enhancement",
        Yanxin Hu et al. https://arxiv.org/abs/2008.00264
    rp   r   r   Nc                    sL   t    tj|||fd|i|| _ttj| jj|| _	t
|| _d S )Nr   )r   r   r   ComplexSingleRNNr"   ComplexMultiplicationWrapperr   r]   r+   linearr
   get_complexnorm)r$   in_sizerd   r   r   re   
rnn_kwargsr%   r'   r(   r   f  s   
zDCCRMaskNetRNN.__init__rO   c              	   C   s   |j d|jd gtd|jd R  }| | |jg |jdd dR  j|j }|j dgtd|jdR  }| jdurE| |}|S )zInput shape: [batch, ..., time]r   r   Nr   rV   )permutendimrJ   r   r"   ri   r   r   )r$   rO   r'   r'   r(   r2   s  s   &2

zDCCRMaskNetRNN.forward)rp   r   r   N)	r5   r6   r7   r8   r   r   ComplexTensorr2   r:   r'   r'   r%   r(   r   R  s
    r   c                       s,   e Zd ZdZeZ fddZdd Z  ZS )DCCRMaskNeta  Masking part of DCCRNet, as proposed in [1].

    Valid `architecture` values for the ``default_architecture`` classmethod are:
    "DCCRN" and "mini".

    Args:
        encoders (list of length `N` of tuples of (in_chan, out_chan, kernel_size, stride, padding)):
            Arguments of encoders of the u-net
        decoders (list of length `N` of tuples of (in_chan, out_chan, kernel_size, stride, padding))
            Arguments of decoders of the u-net
        n_freqs (int): Number of frequencies (dim 1) of input to ``.forward()``.
            Must be divisible by $f_0 * f_1 * ... * f_N$ where $f_k$ are the frequency
            strides of the encoders.

    Input shape is expected to be $(batch, nfreqs, time)$, with $nfreqs$ divisible
    by $f_0 * f_1 * ... * f_N$ where $f_k$ are the frequency strides of the encoders.

    References
        [1] : "DCCRN: Deep Complex Convolution Recurrent Network for Phase-Aware Speech Enhancement",
        Yanxin Hu et al. https://arxiv.org/abs/2008.00264
    c                    s   t jdd |D dd| _| j\}}|d d tt || f}ddlm m t j	dg fdd	|D t
t |tj g fd
d	|d d D tj|d  d| d S )Nc                 S   s   g | ]	\}}}}}|qS r'   r'   ).0r1   
enc_strider'   r'   r(   
<listcomp>  s    z(DCCRMaskNet.__init__.<locals>.<listcomp>r   )axisrV   r   DCUNetComplexDecoderBlockDCUNetComplexEncoderBlockc                 3       | ]
} |d diV  qdS 
activationpreluNr'   r   args)r   r'   r(   	<genexpr>      z'DCCRMaskNet.__init__.<locals>.<genexpr>c                 3   r   r   r'   r   )r   r'   r(   r     r   )encodersdecodersoutput_layerr'   )npprodencoders_stride_productr   ceilconvolutionalr   r   r   r   r   r>   r   Identityr   ComplexConvTranspose2d)r$   r   r   n_freqskwargs	freq_prodr1   last_encoder_out_shaper%   r   r(   r     s(   


zDCCRMaskNet.__init__c                 C   s4   | j \}}|jd | rtd| d|j d|S )Nr   z?Input shape must be [batch, freq, time] with freq divisible by z, got z instead)r   r   	TypeError)r$   rO   r   r1   r'   r'   r(   fix_input_dims  s   
zDCCRMaskNet.fix_input_dims)	r5   r6   r7   r8   r   _architecturesr   r   r:   r'   r'   r%   r(   r     s
    r   )r>   r   torch.nn.functionalr   r   numpyr    r   utilsr   r	   r
   _dccrn_architecturesr   baser   r   r   Moduler   r;   rC   rS   rY   ro   r   r   r   r'   r'   r'   r(   <module>   s(    3=-=_ .^.