o
    %ݫi8                     @   s  d Z ddlZddlZddlZddlmZ ddlm  mZ ddl	m
  mZ ddlmZ ddlmZmZ ddlmZ ddlmZ dZG dd	 d	ejZG d
d dejZd*ddZG dd dejZG dd dejZG dd dZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%d d! Z&G d"d# d#ejZ'G d$d% d%ejZ(G d&d' d'ejZ)G d(d) d)ejZ*dS )+zLibrary to support dual-path speech separation.

Authors
 * Cem Subakan 2020
 * Mirco Ravanelli 2020
 * Samuele Cornell 2020
 * Mirko Bronzi 2020
 * Jianyuan Zhong 2020
    N)ConformerEncoder)PositionalEncodingTransformerEncoder)SwishLinear:0yE>c                       *   e Zd ZdZd fdd	Zdd Z  ZS )	GlobalLayerNormar  Calculate Global Layer Normalization.

    Arguments
    ---------
    dim : (int or list or torch.Size)
        Input shape from an expected input of size.
    shape : tuple
        Expected shape of the input.
    eps : float
        A value added to the denominator for numerical stability.
    elementwise_affine : bool
        A boolean value that when set to True,
        this module has learnable per-element affine parameters
        initialized to ones (for weights) and zeros (for biases).

    Example
    -------
    >>> x = torch.randn(5, 10, 20)
    >>> GLN = GlobalLayerNorm(10, 3)
    >>> x_norm = GLN(x)
    r   Tc                    s   t    || _|| _|| _| jrK|dkr+tt| jd| _	tt
| jd| _|dkrItt| jdd| _	tt
| jdd| _d S d S | dd  | dd  d S )N         weightbias)super__init__dimepselementwise_affinenn	Parametertorchonesr   zerosr   register_parameter)selfr   shaper   r   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/dual_path.pyr   5   s   
zGlobalLayerNorm.__init__c                 C   s   |  dkr<tj|ddd}tj|| d ddd}| jr0| j||  t|| j  | j }n|| t|| j  }|  dkrytj|ddd}tj|| d ddd}| jrm| j||  t|| j  | j }|S || t|| j  }|S )zReturns the normalized tensor.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of size [N, C, K, S] or [N, C, L].

        Returns
        -------
        out : torch.Tensor
            The normalized outputs.
        r   )r      T)keepdimr!   r   )r   r!   r   )r   r   meanr   r   sqrtr   r   )r   xr#   varr   r   r    forwardF   s(   zGlobalLayerNorm.forward)r   T__name__
__module____qualname____doc__r   r'   __classcell__r   r   r   r    r
      s    r
   c                       s.   e Zd ZdZd fdd	Z fddZ  ZS )	CumulativeLayerNorma  Calculate Cumulative Layer Normalization.

    Arguments
    ---------
    dim : int
        Dimension that you want to normalize.
    elementwise_affine : bool
        Learnable per-element affine parameters.
    eps : float
        A small value to prevent overflow.

    Example
    -------
    >>> x = torch.randn(5, 10, 20)
    >>> CLN = CumulativeLayerNorm(10)
    >>> x_norm = CLN(x)
    Tr   c                    s   t  j|||d d S )Nr   r   r   r   )r   r   r   r   r   r   r    r         zCumulativeLayerNorm.__init__c                    sx   |  dkr |dddd }t |}|dddd }|  dkr:t|dd}t |}t|dd}|S )zReturns the normalized tensor.

        Arguments
        ---------
        x : torch.Tensor
            torch.Tensor size [N, C, K, S] or [N, C, L]

        Returns
        -------
        out : torch.Tensor
            The normalized outputs.
        r   r   r!   r   r   )r   permute
contiguousr   r'   r   	transposer   r%   r   r   r    r'      s   zCumulativeLayerNorm.forward)Tr   r(   r   r   r   r    r.   o   s    r.   c                 C   sP   | dkrt ||d|dS | dkrt|d|dS | dkr#tjd||dS t|S )z0Just a wrapper to select the normalization type.glnTr/   clnlnr   )r   )r
   r.   r   	GroupNormBatchNorm1d)normr   r   r   r   r   r    select_norm   s   
r<   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
Encodera  Convolutional Encoder Layer.

    Arguments
    ---------
    kernel_size : int
        Length of filters.
    out_channels : int
        Number of output channels.
    in_channels : int
        Number of  input channels.

    Example
    -------
    >>> x = torch.randn(2, 1000)
    >>> encoder = Encoder(kernel_size=4, out_channels=64)
    >>> h = encoder(x)
    >>> h.shape
    torch.Size([2, 64, 499])
    r!   @   r   c                    s0   t    tj||||d ddd| _|| _d S )Nr!   r   F)in_channelsout_channelskernel_sizestridegroupsr   )r   r   r   Conv1dconv1dr?   )r   rA   r@   r?   r   r   r    r      s   

zEncoder.__init__c                 C   s0   | j dkrtj|dd}| |}t|}|S )a  Return the encoded output.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor with dimensionality [B, L].

        Returns
        -------
        x : torch.Tensor
            Encoded tensor with dimensionality [B, N, T_out].
            where B = Batchsize
                  L = Number of timepoints
                  N = Number of filters
                  T_out = Number of timepoints at the output of the encoder
        r   r   )r?   r   	unsqueezerE   Frelur5   r   r   r    r'      s
   


zEncoder.forward)r!   r>   r   r(   r   r   r   r    r=      s    r=   c                       s,   e Zd ZdZ fddZ fddZ  ZS )Decoderaz  A decoder layer that consists of ConvTranspose1d.

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments passed through to nn.ConvTranspose1d

    Example
    -------
    >>> x = torch.randn(2, 100, 1000)
    >>> decoder = Decoder(kernel_size=4, in_channels=100, out_channels=1)
    >>> h = decoder(x)
    >>> h.shape
    torch.Size([2, 1003])
    c                    s   t  j|i | d S Nr0   )r   argskwargsr   r   r    r      r1   zDecoder.__init__c                    sr   |  dvrtd| jt |  dkr|nt|d}t|  dkr2tj|dd}|S t|}|S )ap  Return the decoded output.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor with dimensionality [B, N, L].
                where, B = Batchsize,
                       N = number of filters
                       L = time points

        Returns
        -------
        out : torch.Tensor
            The decoded outputs.
        )r!   r   z{} accept 3/4D tensor as inputr   r   rF   )	r   RuntimeErrorformatr)   r   r'   r   rG   squeezer5   r   r   r    r'     s   
$
zDecoder.forwardr(   r   r   r   r    rJ      s    rJ   c                   @   s    e Zd ZdZdd Zdd ZdS )IdentityBlocka  This block is used when we want to have identity transformation within the Dual_path block.

    Arguments
    ---------
    **kwargs : dict
        Arguments are ignored.

    Example
    -------
    >>> x = torch.randn(10, 100)
    >>> IB = IdentityBlock()
    >>> xhat = IB(x)
    c                 K   s   d S rK   r   )r   rM   r   r   r    _init__.     zIdentityBlock._init__c                 C   s   |S rK   r   r5   r   r   r    __call__1  rS   zIdentityBlock.__call__N)r)   r*   r+   r,   rR   rT   r   r   r   r    rQ     s    rQ   c                       6   e Zd ZdZ						d fdd		Zd
d Z  ZS )FastTransformerBlocka  This block is used to implement fast transformer models with efficient attention.

    The implementations are taken from https://fast-transformers.github.io/

    Arguments
    ---------
    attention_type : str
        Specifies the type of attention.
        Check https://fast-transformers.github.io/  for details.
    out_channels : int
        Dimensionality of the representation.
    num_layers : int
        Number of layers.
    nhead : int
        Number of attention heads.
    d_ffn : int
        Dimensionality of positional feed-forward.
    dropout : float
        Dropout drop rate.
    activation : str
        Activation function.
    reformer_bucket_size : int
        bucket size for reformer.

    Example
    -------
    # >>> x = torch.randn(10, 100, 64)
    # >>> block = FastTransformerBlock('linear', 64)
    # >>> x = block(x)
    # >>> x.shape
    # torch.Size([10, 100, 64])
             r   rI       c	                    sT   t    ddlm}	 |	j|||||| || |||d	}
|
 | _|| _|| _d S )Nr   )TransformerEncoderBuilder)	attention_typen_layersn_headsfeed_forward_dimensionsquery_dimensionsvalue_dimensionsdropoutattention_dropout
chunk_size)	r   r   fast_transformers.buildersr[   from_kwargsgetmdlr\   reformer_bucket_size)r   r\   r@   
num_layersnheadd_ffnrb   
activationri   r[   builderr   r   r    r   W  s    


zFastTransformerBlock.__init__c              	   C   s   | j dkrA| jd |jd | jd   }|j}tj|t|d||d|gdd}| 	|}|ddd| ddf S | 	|S )aY  Returns the transformed input.

        Arguments
        ---------
        x : torch.Tensor
            Tensor shaper [B, L, N].
            where, B = Batchsize,
                   N = number of filters
                   L = time points

        Returns
        -------
        out : torch.Tensor
            The transformed outputs.
        reformerr!   r   r   rF   N)
r\   ri   r   devicer   catr   sizetorh   )r   r%   pad_sizerq   x_paddedr   r   r    r'   u  s   
"

zFastTransformerBlock.forward)rW   rX   rY   r   rI   rZ   r(   r   r   r   r    rV   5  s    %rV   c                       r	   )	PyTorchPositionalEncodingae  Positional encoder for the pytorch transformer.

    Arguments
    ---------
    d_model : int
        Representation dimensionality.
    dropout : float
        Dropout drop prob.
    max_len : int
        Max sequence length.

    Example
    -------
    >>> x = torch.randn(10, 100, 64)
    >>> enc = PyTorchPositionalEncoding(64)
    >>> x = enc(x)
    皙?  c                    s   t    tj|d| _t||}tjd|tjd	d}t
td|d td |  }t|| |d d dd df< t|| |d d dd df< |	ddd}| d| d S )N)pr   )dtyper   r!   g     @pe)r   r   r   Dropoutrb   r   r   arangefloatrG   expmathlogsincosr4   register_buffer)r   d_modelrb   max_lenr|   positiondiv_termr   r   r    r     s   
   z"PyTorchPositionalEncoding.__init__c                 C   s*   || j d|dddf  }| |S )aP  Returns the encoded output.

        Arguments
        ---------
        x : torch.Tensor
            Tensor shape [B, L, N],
            where, B = Batchsize,
                   N = number of filters
                   L = time points

        Returns
        -------
        out : torch.Tensor
            The encoded output.
        Nr   )r|   rs   rb   r5   r   r   r    r'     s    
z!PyTorchPositionalEncoding.forward)rx   ry   r(   r   r   r   r    rw     s    rw   c                       rU   )PytorchTransformerBlocka  A wrapper that uses the pytorch transformer block.

    Arguments
    ---------
    out_channels : int
        Dimensionality of the representation.
    num_layers : int
        Number of layers.
    nhead : int
        Number of attention heads.
    d_ffn : int
        Dimensionality of positional feed forward.
    dropout : float
        Dropout drop rate.
    activation : str
        Activation function.
    use_positional_encoding : bool
        If true we use a positional encoding.

    Example
    -------
    >>> x = torch.randn(10, 100, 64)
    >>> block = PytorchTransformerBlock(64)
    >>> x = block(x)
    >>> x.shape
    torch.Size([10, 100, 64])
    rW   rX      rx   rI   Tc           	         sJ   t    tj|||||d}tj||d| _|r t|| _d S d | _d S )N)r   rk   dim_feedforwardrb   rm   )rj   )r   r   r   TransformerEncoderLayerr   rh   rw   pos_encoder)	r   r@   rj   rk   rl   rb   rm   use_positional_encodingencoder_layerr   r   r    r     s   


z PytorchTransformerBlock.__init__c                 C   s   | j dur
|  |}| |S )aW  Returns the transformed output.

        Arguments
        ---------
        x : torch.Tensor
            Tensor shape [B, L, N]
            where, B = Batchsize,
                   N = number of filters
                   L = time points

        Returns
        -------
        out : torch.Tensor
            The transformed output.
        N)r   rh   r5   r   r   r    r'     s   


zPytorchTransformerBlock.forward)rW   rX   r   rx   rI   Tr(   r   r   r   r    r     s    r   c                       s<   e Zd ZdZ									d fdd		Zd
d Z  ZS )SBTransformerBlocka  A wrapper for the SpeechBrain implementation of the transformer encoder.

    Arguments
    ---------
    num_layers : int
        Number of layers.
    d_model : int
        Dimensionality of the representation.
    nhead : int
        Number of attention heads.
    d_ffn : int
        Dimensionality of positional feed forward.
    input_shape : tuple
        Shape of input.
    kdim : int
        Dimension of the key (Optional).
    vdim : int
        Dimension of the value (Optional).
    dropout : float
        Dropout rate.
    activation : str
        Activation function.
    use_positional_encoding : bool
        If true we use a positional encoding.
    norm_before : bool
        Use normalization before transformations.
    attention_type : str
        Type of attention to use, default "regularMHA"

    Example
    -------
    >>> x = torch.randn(10, 100, 64)
    >>> block = SBTransformerBlock(1, 64, 8)
    >>> x = block(x)
    >>> x.shape
    torch.Size([10, 100, 64])
    r   Nrx   rI   F
regularMHAc                    sp   t    |
| _|	dkrtj}	n|	dkrtj}	ntdt|||||||||	||d| _|
r6t	|d| _
d S d S )NrI   geluunknown activation)rj   rk   rl   input_shaper   kdimvdimrb   rm   normalize_beforer\   
input_size)r   r   r   r   ReLUGELU
ValueErrorr   rh   r   pos_enc)r   rj   r   rk   rl   r   r   r   rb   rm   r   norm_beforer\   r   r   r    r   B  s.   
zSBTransformerBlock.__init__c                 C   s0   | j r| |}| || d S | |d S )aX  Returns the transformed output.

        Arguments
        ---------
        x : torch.Tensor
            Tensor shape [B, L, N],
            where, B = Batchsize,
                   L = time points
                   N = number of filters

        Returns
        -------
        out : torch.Tensor
            The transformed output.
        r   )r   r   rh   )r   r%   r   r   r   r    r'   l  s   
zSBTransformerBlock.forward)	r   NNNrx   rI   FFr   r(   r   r   r   r    r     s    +*r   c                       s0   e Zd ZdZ			d	 fdd	Zdd Z  ZS )

SBRNNBlockaa  RNNBlock for the dual path pipeline.

    Arguments
    ---------
    input_size : int
        Dimensionality of the input features.
    hidden_channels : int
        Dimensionality of the latent layer of the rnn.
    num_layers : int
        Number of the rnn layers.
    rnn_type : str
        Type of the the rnn cell.
    dropout : float
        Dropout rate
    bidirectional : bool
        If True, bidirectional.

    Example
    -------
    >>> x = torch.randn(10, 100, 64)
    >>> rnn = SBRNNBlock(64, 100, 1, bidirectional=True)
    >>> x = rnn(x)
    >>> x.shape
    torch.Size([10, 100, 200])
    LSTMr   Tc                    s(   t    tt||||||d| _d S )N)r   rj   rb   bidirectional)r   r   getattrSBRNNrh   )r   r   hidden_channelsrj   rnn_typerb   r   r   r   r    r     s   
	zSBRNNBlock.__init__c                 C   s   |  |d S )aJ  Returns the transformed output.

        Arguments
        ---------
        x : torch.Tensor
            [B, L, N]
            where, B = Batchsize,
                   N = number of filters
                   L = time points

        Returns
        -------
        out : torch.Tensor
            The transformed output.
        r   )rh   r5   r   r   r    r'     s   zSBRNNBlock.forward)r   r   Tr(   r   r   r   r    r     s    r   c                       s8   e Zd ZdZ	d fdd	Z fddZd	d
 Z  ZS )DPTNetBlocka  The DPT Net block.

    Arguments
    ---------
    d_model : int
        Number of expected features in the input (required).
    nhead : int
        Number of heads in the multiheadattention models (required).
    dim_feedforward : int
        Dimension of the feedforward network model (default=2048).
    dropout : float
        Dropout value (default=0.1).
    activation : str
        Activation function of intermediate layer, relu or gelu (default=relu).

    Examples
    --------
    >>> encoder_layer = DPTNetBlock(d_model=512, nhead=8)
    >>> src = torch.rand(10, 100, 512)
    >>> out = encoder_layer(src)
    >>> out.shape
    torch.Size([10, 100, 512])
       r   rI   c                    s   ddl m} ddlm} ddlm} ddlm}	 ddlm	}
 t
   ||||d| _|
||d d	d
d| _||| _||d d || _|	|| _|	|| _||| _||| _t|| _d S )Nr   )MultiheadAttention)r}   r   )	LayerNorm)r   )rb   r!   r   T)r   )torch.nn.modules.activationr   torch.nn.modules.dropoutr}   torch.nn.modules.linearr   torch.nn.modules.normalizationr   torch.nn.modules.rnnr   r   r   	self_attnrnnrb   linear2norm1norm2dropout1dropout2_get_activation_fnrm   )r   r   rk   r   rb   rm   r   r}   r   r   r   r   r   r    r     s   





zDPTNetBlock.__init__c                    s"   d|vr	t j|d< t | d S )Nrm   )rH   rI   r   __setstate__)r   stater   r   r    r     s   
zDPTNetBlock.__setstate__c                 C   sx   | j |||dddd }|| | }| |}| |d }| |}| |}| |}|| | }| |}|S )a=  Pass the input through the encoder layer.

        Arguments
        ---------
        src : torch.Tensor
            Tensor shape [B, L, N]
            where, B = Batchsize,
                   N = number of filters
                   L = time points

        Returns
        -------
        Encoded outputs.
        N)	attn_maskkey_padding_maskr   )	r   r   r   r   rm   rb   r   r   r   )r   srcsrc2r   r   r    r'     s   





zDPTNetBlock.forward)r   r   rI   )r)   r*   r+   r,   r   r   r'   r-   r   r   r   r    r     s    r   c                 C   s    | dkrt jS | dkrt jS dS )z/Just a wrapper to get the activation functions.rI   r   N)rH   rI   r   )rm   r   r   r    r     s
   r   c                       s0   e Zd ZdZ			d fdd	Zdd Z  ZS )	Dual_Computation_BlockaE  Computation block for dual-path processing.

    Arguments
    ---------
    intra_mdl : torch.nn.module
        Model to process within the chunks.
    inter_mdl : torch.nn.module
        Model to process across the chunks.
    out_channels : int
        Dimensionality of inter/intra model.
    norm : str
        Normalization type.
    skip_around_intra : bool
        Skip connection around the intra layer.
    linear_layer_after_inter_intra : bool
        Linear layer or not after inter or intra.

    Example
    -------
    >>> intra_block = SBTransformerBlock(1, 64, 8)
    >>> inter_block = SBTransformerBlock(1, 64, 8)
    >>> dual_comp_block = Dual_Computation_Block(intra_block, inter_block, 64)
    >>> x = torch.randn(10, 64, 100, 10)
    >>> x = dual_comp_block(x)
    >>> x.shape
    torch.Size([10, 64, 100, 10])
    r8   Tc                    s   t    || _|| _|| _|| _|| _|d ur&t||d| _t||d| _	|r]t
|tr:t|d|jjj d| _nt||d| _t
|trTt|d|jjj d| _d S t||d| _d S d S )Nr   r!   r   )r   r   	intra_mdl	inter_mdlskip_around_intralinear_layer_after_inter_intrar;   r<   
intra_norm
inter_norm
isinstancer   r   rh   r   hidden_sizeintra_linearinter_linear)r   r   r   r@   r;   r   r   r   r   r    r   @  s2   
	


zDual_Computation_Block.__init__c           	      C   s  |j \}}}}|dddd || ||}| |}| jr%| |}|||||}|dddd }| jdurA| |}| j	rH|| }|dddd || ||}| 
|}| jrf| |}|||||}|dddd }| jdur| |}|| }|S )a  Returns the output tensor.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor of dimension [B, N, K, S].

        Returns
        -------
        out: torch.Tensor
            Output tensor of dimension [B, N, K, S].
            where, B = Batchsize,
               N = number of filters
               K = time points in each chunk
               S = the number of chunks
        r   r   r!   r   N)r   r2   r3   viewr   r   r   r;   r   r   r   r   r   )	r   r%   BNKSintrainteroutr   r   r    r'   j  s*   "



"



zDual_Computation_Block.forward)r8   TTr(   r   r   r   r    r   #  s    !*r   c                       sR   e Zd ZdZ								d fd	d
	Zdd Zdd Zdd Zdd Z  Z	S )Dual_Path_Modela  The dual path model which is the basis for dualpathrnn, sepformer, dptnet.

    Arguments
    ---------
    in_channels : int
        Number of channels at the output of the encoder.
    out_channels : int
        Number of channels that would be inputted to the intra and inter blocks.
    intra_model : torch.nn.module
        Model to process within the chunks.
    inter_model : torch.nn.module
        model to process across the chunks,
    num_layers : int
        Number of layers of Dual Computation Block.
    norm : str
        Normalization type.
    K : int
        Chunk length.
    num_spks : int
        Number of sources (speakers).
    skip_around_intra : bool
        Skip connection around intra.
    linear_layer_after_inter_intra : bool
        Linear layer after inter and intra.
    use_global_pos_enc : bool
        Global positional encodings.
    max_length : int
        Maximum sequence length.

    Example
    -------
    >>> intra_block = SBTransformerBlock(1, 64, 8)
    >>> inter_block = SBTransformerBlock(1, 64, 8)
    >>> dual_path_model = Dual_Path_Model(64, 64, intra_block, inter_block, num_spks=2)
    >>> x = torch.randn(10, 64, 2000)
    >>> x = dual_path_model(x)
    >>> x.shape
    torch.Size([2, 10, 64, 2000])
    r   r8      r!   TF N  c                    s
  t    || _|| _|| _t||d| _tj||ddd| _	|| _
| j
r*t|| _tg | _t|D ]}| jtt|||||	|
d q4tj||| dd| _tj||ddd| _t | _t | _tt||dt | _tt||dt | _d S )Nr   r   F)r   )r   r   )rA   ) r   r   r   num_spksrj   r<   r;   r   rD   rE   use_global_pos_encr   r   
ModuleListdual_mdlrangeappendcopydeepcopyr   Conv2dconv2dend_conv1x1PReLUprelur   rm   
SequentialTanhoutputSigmoidoutput_gate)r   r?   r@   intra_modelinter_modelrj   r;   r   r   r   r   r   
max_lengthir   r   r    r     sF   





zDual_Path_Model.__init__c           
      C   s  |  |}| |}| jr#| |dddd||dd   }| || j\}}t| j	D ]	}| j
| |}q1| |}| |}|j\}}}}||| j d||}| ||}| || | }| |}|j\}}}	||| j||	}| |}|dd}|S )a  Returns the output tensor.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor of dimension [B, N, L].

        Returns
        -------
        out : torch.Tensor
            Output tensor of dimension [spks, B, N, L]
            where, spks = Number of speakers
               B = Batchsize,
               N = number of filters
               L = the number of time points
        r   rp   g      ?r   )r;   rE   r   r   r4   rs   _Segmentationr   r   rj   r   r   r   r   r   r   	_over_addr   r   r   rm   )
r   r%   gapr   r   _r   r   r   Lr   r   r    r'     s*   





zDual_Path_Model.forwardc           
      C   s   |j \}}}|d }||||  |  }|dkr3tt||||j|j}tj||gdd}tt||||j|j}	tj|	||	gdd}||fS )a  Padding the audio times.

        Arguments
        ---------
        input : torch.Tensor
            Tensor of size [B, N, L].
            where, B = Batchsize,
                   N = number of filters
                   L = time points
        K : int
            Chunks of length.

        Returns
        -------
        output : torch.Tensor
            Padded inputs
        gap : int
            Size of padding
        r!   r   rF   )	r   r   Tensorr   typer{   rt   rq   rr   )
r   inputr   r   r   r   Pr   pad_padr   r   r    _paddingF  s   zDual_Path_Model._paddingc           
      C   s   |j \}}}|d }| ||\}}|ddddd| f  ||d|}|dddd|df  ||d|}	tj||	gdd||d|dd}| |fS )a  The segmentation stage splits

        Arguments
        ---------
        input : torch.Tensor
            Tensor with dim [B, N, L].
        K : int
            Length of the chunks.

        Return
        ------
        output : torch.Tensor
            Tensor with dim [B, N, K, S].
            where, B = Batchsize,
               N = number of filters
               K = time points in each chunk
               S = the number of chunks
               L = the number of time points
        gap : int
            Size of padding
        r!   Nrp   r   rF   )r   r   r3   r   r   rr   r4   )
r   r   r   r   r   r   r   r   input1input2r   r   r    r   n  s   ,*$zDual_Path_Model._Segmentationc           
      C   s   |j \}}}}|d }|dd ||d|d }|ddddddd|f  ||ddddd|df }|dddddd|df  ||dddddd| f }	||	 }|dkrv|ddddd| f }|S )a  Merge the sequence with the overlap-and-add method.

        Arguments
        ---------
        input : torch.Tensor
            Tensor with dim [B, N, K, S].
        gap : int
            Padding length.

        Return
        ------
        output : torch.Tensor
            Tensor with dim [B, N, L].
            where, B = Batchsize,
               N = number of filters
               K = time points in each chunk
               S = the number of chunks
               L = the number of time points
        r!   r   rp   Nr   )r   r4   r3   r   )
r   r   r   r   r   r   r   r   r   r   r   r   r    r     s    DFzDual_Path_Model._over_add)r   r8   r   r!   TTFr   )
r)   r*   r+   r,   r   r'   r   r   r   r-   r   r   r   r    r     s    .7>("r   c                       sX   e Zd ZdZ											
	
	
	
										d fdd	Zdd Zdd Z  ZS )SepformerWrappera	  The wrapper for the sepformer model which combines the Encoder, Masknet and the decoder
    https://arxiv.org/abs/2010.13154

    Arguments
    ---------
    encoder_kernel_size: int
        The kernel size used in the encoder
    encoder_in_nchannels: int
        The number of channels of the input audio
    encoder_out_nchannels: int
        The number of filters used in the encoder.
        Also, number of channels that would be inputted to the intra and inter blocks.
    masknet_chunksize: int
        The chunk length that is to be processed by the intra blocks
    masknet_numlayers: int
        The number of layers of combination of inter and intra blocks
    masknet_norm: str,
        The normalization type to be used in the masknet
        Should be one of 'ln' -- layernorm, 'gln' -- globallayernorm
                         'cln' -- cumulative layernorm, 'bn' -- batchnorm
                         -- see the select_norm function above for more details
    masknet_useextralinearlayer: bool
        Whether or not to use a linear layer at the output of intra and inter blocks
    masknet_extraskipconnection: bool
        This introduces extra skip connections around the intra block
    masknet_numspks: int
        This determines the number of speakers to estimate
    intra_numlayers: int
        This determines the number of layers in the intra block
    inter_numlayers: int
        This determines the number of layers in the inter block
    intra_nhead: int
        This determines the number of parallel attention heads in the intra block
    inter_nhead: int
        This determines the number of parallel attention heads in the inter block
    intra_dffn: int
        The number of dimensions in the positional feedforward model in the inter block
    inter_dffn: int
        The number of dimensions in the positional feedforward model in the intra block
    intra_use_positional: bool
        Whether or not to use positional encodings in the intra block
    inter_use_positional: bool
        Whether or not to use positional encodings in the inter block
    intra_norm_before: bool
        Whether or not we use normalization before the transformations in the intra block
    inter_norm_before: bool
        Whether or not we use normalization before the transformations in the inter block

    Example
    -------
    >>> model = SepformerWrapper()
    >>> inp = torch.rand(1, 160)
    >>> result = model.forward(inp)
    >>> result.shape
    torch.Size([1, 160, 2])
       r   r      r!   r8   FTrX   rY   c                    s   t    t|||d| _t|
|||||d}t||||||d}t||||||||	||d
| _t||||d dd| _|	| _	| j| j| jfD ]}| 
| qGd S )N)rA   r@   r?   )rj   r   rk   rl   r   r   )
r?   r@   r   r   rj   r;   r   r   r   r   r!   F)r?   r@   rA   rB   r   )r   r   r=   encoderr   r   masknetrJ   decoderr   reset_layer_recursively)r   encoder_kernel_sizeencoder_in_nchannelsencoder_out_nchannelsmasknet_chunksizemasknet_numlayersmasknet_normmasknet_useextralinearlayermasknet_extraskipconnectionmasknet_numspksintra_numlayersinter_numlayersintra_nheadinter_nhead
intra_dffn
inter_dffnintra_use_positionalinter_use_positionalintra_norm_beforeinter_norm_beforer   r   moduler   r   r    r     sZ   
		zSepformerWrapper.__init__c                 C   s6   t |dr	|  | D ]}||kr| | qdS )z+Reinitializes the parameters of the networkreset_parametersN)hasattrr  modulesr   )r   layerchild_layerr   r   r    r   4  s   

z(SepformerWrapper.reset_layer_recursivelyc                    s     |} |}t|g j }|| tj fddt jD dd}|d}|d}||krEt	|ddd|| f}|S |ddd|ddf }|S )z:Processes the input tensor x and returns an output tensor.c                    s    g | ]}  | d qS )rp   )r   rG   ).0r   r   sep_hr   r    
<listcomp>E  s    z,SepformerWrapper.forward.<locals>.<listcomp>rp   rF   r   r   N)
r   r   r   stackr   rr   r   rs   rH   r   )r   mixmix_west_mask
est_sourceT_originT_estr   r  r    r'   <  s"   


	
zSepformerWrapper.forward)r   r   r   r   r!   r8   FTr!   rX   rX   rX   rX   rY   rY   TTTT)r)   r*   r+   r,   r   r   r'   r-   r   r   r   r    r     s0    ;Gr   c                       s>   e Zd ZdZ										d fd	d
	Zdd Z  ZS )SBConformerEncoderBlocka  A wrapper for the SpeechBrain implementation of the ConformerEncoder.

    Arguments
    ---------
    num_layers : int
        Number of layers.
    d_model : int
        Dimensionality of the representation.
    nhead : int
        Number of attention heads.
    d_ffn : int
        Dimensionality of positional feed forward.
    input_shape : tuple
        Shape of input.
    kdim : int
        Dimension of the key (Optional).
    vdim : int
        Dimension of the value (Optional).
    dropout : float
        Dropout rate.
    activation : str
        Activation function.
    kernel_size: int
        Kernel size in the conformer encoder
    bias: bool
        Use bias or not in the convolution part of conformer encoder
    use_positional_encoding : bool
        If true we use a positional encoding.
    attention_type : str
        The type of attention to use, default "RelPosMHAXL"

    Example
    -------
    >>> x = torch.randn(10, 100, 64)
    >>> block = SBConformerEncoderBlock(1, 64, 8)
    >>> from speechbrain.lobes.models.transformer.Transformer import PositionalEncoding
    >>> pos_enc = PositionalEncoding(64)
    >>> pos_embs = pos_enc(torch.ones(1, 199, 64))
    >>> x = block(x)
    >>> x.shape
    torch.Size([10, 100, 64])
    r   Nrx   swish   TRelPosMHAXLc                    s   t    || _|| _|	dkrtj}	n|	dkrtj}	n|	dkr"t}	ntdt	||||||||	|
||d| _
| jdkrCt|d| _d S | jdkrU| jrSt|d| _d S d S td	)
NrI   r   r$  r   )rj   rk   rl   r   r   r   rb   rm   rA   r   r\   r&  r   r   Unsupported attention type)r   r   r   r\   r   r   r   r   r   r   rh   r   r   )r   rj   r   rk   rl   r   r   r   rb   rm   rA   r   r   r\   r   r   r    r     s<   


z SBConformerEncoderBlock.__init__c                 C   s   | j dkr(| tj|jd |jd d d |jd |jd}| j||dd S | j dkrE| jr>| |}| || d S | |d S td)	a4  Returns the transformed output.

        Arguments
        ---------
        x : torch.Tensor
            Tensor shape [B, L, N],
            where, B = Batchsize,
                   L = time points
                   N = number of filters

        Returns
        -------
        Transformed output
        r&  r   r   r!   )rq   )pos_embsr   r'  )	r\   r   r   r   r   rq   rh   r   r   )r   r%   r   r(  r   r   r    r'     s   
$

zSBConformerEncoderBlock.forward)
r   NNNrx   r$  r%  TTr&  r(   r   r   r   r    r#  W  s    04r#  )r   )+r,   r   r   r   torch.nnr   torch.nn.functional
functionalrH   speechbrain.nnet.RNNnnetRNNr   .speechbrain.lobes.models.transformer.Conformerr   0speechbrain.lobes.models.transformer.Transformerr   r   speechbrain.nnet.activationsr   speechbrain.nnet.linearr   EPSModuler
   r   r.   r<   r=   ConvTranspose1drJ   rQ   rV   rw   r   r   r   r   r   r   r   r   r#  r   r   r   r    <module>   sB    
Q
4<3d5MhBU	    %