"""Library implementing recurrent neural networks.

Authors
 * Adel Moumen 2023
 * Mirco Ravanelli 2020
 * Ju-Chieh Chou 2020
 * Jianyuan Zhong 2020
 * Loren Lugosch 2020
    )OptionalN)ContentBasedAttentionKeyValueAttentionLocationAwareAttention)
get_loggerc                 C   s*   ||  d  }tjjjj| |dddS )zReturns packed speechbrain-formatted tensors.

    Arguments
    ---------
    inputs : torch.Tensor
        The sequences to pack.
    lengths : torch.Tensor
        The length of each sequence.

    Returns
    -------
    The packed sequences.
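
    Example
    -------
    A minimal sketch of the relative-lengths convention used here:

    >>> inputs = torch.rand([2, 5, 3])
    >>> lengths = torch.tensor([1.0, 0.6])
    >>> packed = pack_padded_sequence(inputs, lengths)
    >>> isinstance(packed, torch.nn.utils.rnn.PackedSequence)
    True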
       TF)batch_firstenforce_sorted)sizecputorchnnutilsrnnpack_padded_sequence)inputslengths r   H/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/nnet/RNN.pyr      s   
r   c                 C   s   t jjjj| dd\}}|S )a  Returns speechbrain-formatted tensor from packed sequences.

    Arguments
    ---------
    inputs : torch.nn.utils.rnn.PackedSequence
        An input set of sequences to convert to a tensor.

    Returns
    -------
    outputs : torch.Tensor
        The padded sequences.
    """
    outputs, lengths = torch.nn.utils.rnn.pad_packed_sequence(
        inputs, batch_first=True
    )
    return outputs


class RNN(torch.nn.Module):
    """This function implements a vanilla RNN.

    It accepts input tensors formatted as (batch, time, fea).
    In the case of 4d inputs like (batch, time, fea, channel) the tensor is
    flattened as (batch, time, fea*channel).

    Arguments
    ---------
    hidden_size : int
        Number of output neurons (i.e., the dimensionality of the output).
    input_shape : tuple
        The shape of an example input. Alternatively, use ``input_size``.
    input_size : int
        The size of the input. Alternatively, use ``input_shape``.
    nonlinearity : str
        Type of nonlinearity (tanh, relu).
    num_layers : int
        Number of layers to employ in the RNN architecture.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.
    bidirectional : bool
        If True, a bidirectional model that scans the sequence both
        right-to-left and left-to-right is used.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 10, 20])
    >>> net = RNN(hidden_size=5, input_shape=inp_tensor.shape)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 10, 5])
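
    A bidirectional model doubles the feature dimension of the output
    (a sketch with the same input):

    >>> net = RNN(hidden_size=5, input_shape=inp_tensor.shape, bidirectional=True)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 10, 10])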
    Nrelur   T        Fc
           
   
      s   t    d| _|d u r|d u rtd|d u r-t|dkr!d| _tt|dd  }tjj	|||||	|d|d| _
|rEt| j
 d S d S )NF*Expected one of input_shape or input_size.   T   )
input_sizehidden_size
num_layersdropoutbidirectionalbiasr   nonlinearity)super__init__reshape
ValueErrorlenr   prodtensorr   r   r   rnn_init)
selfr   input_shaper   r#   r   r"   r    re_initr!   	__class__r   r   r%   i   s*   
zRNN.__init__c                 C      | j r|jdkr| |jd |jd |jd |jd  }| j  |dur+t||}|dur9| j||d\}}n| |\}}|durHt|}||fS )a  Returns the output of the vanilla RNN.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor.
        hx : torch.Tensor
            Starting hidden state.
        lengths : torch.Tensor
            Relative lengths of the input signals.

        Returns
        -------
        output : torch.Tensor
            The output of the vanilla RNN
        hn : torch.Tensor
            The hidden states.
        """
        # Reshaping input tensors for 4d inputs
        if self.reshape:
            if x.ndim == 4:
                x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])

        # Flatten params for data parallel
        self.rnn.flatten_parameters()

        # Pack sequence for proper RNN handling of padding
        if lengths is not None:
            x = pack_padded_sequence(x, lengths)

        # Support custom initial state
        if hx is not None:
            output, hn = self.rnn(x, hx=hx)
        else:
            output, hn = self.rnn(x)

        # Unpack the packed sequence
        if lengths is not None:
            output = pad_packed_sequence(output)

        return output, hn


class LSTM(torch.nn.Module):
    """This function implements a basic LSTM.
Z  ZS )LSTMaH  This function implements a basic LSTM.

    It accepts input tensors formatted as (batch, time, fea).
    In the case of 4d inputs like (batch, time, fea, channel) the tensor is
    flattened as (batch, time, fea*channel).

    Arguments
    ---------
    hidden_size : int
        Number of output neurons (i.e., the dimensionality of the output).
    input_shape : tuple
        The shape of an example input. Alternatively, use ``input_size``.
    input_size : int
        The size of the input. Alternatively, use ``input_shape``.
    num_layers : int
        Number of layers to employ in the RNN architecture.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.
    bidirectional : bool
        If True, a bidirectional model that scans the sequence both
        right-to-left and left-to-right is used.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 10, 20])
    >>> net = LSTM(hidden_size=5, input_shape=inp_tensor.shape)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 10, 5])
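
    Relative ``lengths`` may be passed so padding is handled via packing
    (a sketch; shapes are unchanged):

    >>> lengths = torch.ones([4])
    >>> out_tensor, _ = net(inp_tensor, lengths=lengths)
    >>> out_tensor.shape
    torch.Size([4, 10, 5])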
    """

    def __init__(
        self,
        hidden_size,
        input_shape=None,
        input_size=None,
        num_layers=1,
        bias=True,
        dropout=0.0,
        re_init=True,
        bidirectional=False,
    ):
        super().__init__()
        self.reshape = False

        if input_shape is None and input_size is None:
            raise ValueError("Expected one of input_shape or input_size.")

        # Computing the feature dimensionality
        if input_size is None:
            if len(input_shape) > 3:
                self.reshape = True
            input_size = torch.prod(torch.tensor(input_shape[2:])).item()

        self.rnn = torch.nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            bias=bias,
            batch_first=True,
        )

        if re_init:
            rnn_init(self.rnn)

    def forward(self, x, hx=None, lengths=None):
        """Returns the output of the LSTM.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor.
        hx : torch.Tensor
            Starting hidden state.
        lengths : torch.Tensor
            Relative length of the input signals.

        Returns
        -------
        output : torch.Tensor
            The output of the LSTM.
        hn : torch.Tensor
            The hidden states.
        """
        # Reshaping input tensors for 4d inputs
        if self.reshape:
            if x.ndim == 4:
                x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])

        # Flatten params for data parallel
        self.rnn.flatten_parameters()

        # Pack sequence for proper RNN handling of padding
        if lengths is not None:
            x = pack_padded_sequence(x, lengths)

        # Support custom initial state
        if hx is not None:
            output, hn = self.rnn(x, hx=hx)
        else:
            output, hn = self.rnn(x)

        # Unpack the packed sequence
        if lengths is not None:
            output = pad_packed_sequence(output)

        return output, hn


class GRU(torch.nn.Module):
    """This function implements a basic GRU.

    It accepts input tensors formatted as (batch, time, fea).
    In the case of 4d inputs like (batch, time, fea, channel) the tensor is
    flattened as (batch, time, fea*channel).

    Arguments
    ---------
    hidden_size : int
        Number of output neurons (i.e., the dimensionality of the output).
    input_shape : tuple
        The shape of an example input. Alternatively, use ``input_size``.
    input_size : int
        The size of the input. Alternatively, use ``input_shape``.
    num_layers : int
        Number of layers to employ in the RNN architecture.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.
    bidirectional : bool
        If True, a bidirectional model that scans the sequence both
        right-to-left and left-to-right is used.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 10, 20])
    >>> net = GRU(hidden_size=5, input_shape=inp_tensor.shape)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 10, 5])
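
    A bidirectional model doubles the feature dimension of the output
    (a sketch):

    >>> net = GRU(hidden_size=5, input_shape=inp_tensor.shape, bidirectional=True)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 10, 10])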
    """

    def __init__(
        self,
        hidden_size,
        input_shape=None,
        input_size=None,
        num_layers=1,
        bias=True,
        dropout=0.0,
        re_init=True,
        bidirectional=False,
    ):
        super().__init__()
        self.reshape = False

        if input_shape is None and input_size is None:
            raise ValueError("Expected one of input_shape or input_size.")

        # Computing the feature dimensionality
        if input_size is None:
            if len(input_shape) > 3:
                self.reshape = True
            input_size = torch.prod(torch.tensor(input_shape[2:])).item()

        self.rnn = torch.nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            bias=bias,
            batch_first=True,
        )

        if re_init:
            rnn_init(self.rnn)

    def forward(self, x, hx=None, lengths=None):
        """Returns the output of the GRU.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor.
        hx : torch.Tensor
            Starting hidden state.
        lengths : torch.Tensor
            Relative length of the input signals.

        Returns
        -------
        output : torch.Tensor
            Output of GRU.
        hn : torch.Tensor
            Hidden states.
        """
        # Reshaping input tensors for 4d inputs
        if self.reshape:
            if x.ndim == 4:
                x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])

        # Flatten params for data parallel
        self.rnn.flatten_parameters()

        # Pack sequence for proper RNN handling of padding
        if lengths is not None:
            x = pack_padded_sequence(x, lengths)

        # Support custom initial state
        if hx is not None:
            output, hn = self.rnn(x, hx=hx)
        else:
            output, hn = self.rnn(x)

        # Unpack the packed sequence
        if lengths is not None:
            output = pad_packed_sequence(output)

        return output, hn


class RNNCell(nn.Module):
    """This class implements a basic RNN Cell for a timestep of input,
    while RNN() takes the whole sequence as input.

    It is designed for an autoregressive decoder (ex. attentional decoder),
    which takes one input at a time.
    Using torch.nn.RNNCell() instead of torch.nn.RNN() to reduce VRAM
    consumption.

    It accepts input tensors formatted as (batch, fea).

    Arguments
    ---------
    hidden_size : int
        Number of output neurons (i.e, the dimensionality of the output).
    input_shape : tuple
        The shape of an example input. Alternatively, use ``input_size``.
    input_size : int
        The size of the input. Alternatively, use ``input_shape``.
    num_layers : int
        Number of layers to employ in the RNN architecture.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.
    nonlinearity : str
        Type of nonlinearity (tanh, relu).

    Example
    -------
    >>> inp_tensor = torch.rand([4, 20])
    >>> net = RNNCell(hidden_size=5, input_shape=inp_tensor.shape)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
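
    The cell is unrolled manually over time, carrying the hidden state
    (a sketch):

    >>> hx = None
    >>> for t in range(3):
    ...     out_tensor, hx = net(torch.rand([4, 20]), hx)
    >>> out_tensor.shape
    torch.Size([4, 5])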
    """

    def __init__(
        self,
        hidden_size,
        input_shape=None,
        input_size=None,
        num_layers=1,
        bias=True,
        dropout=0.0,
        re_init=True,
        nonlinearity="tanh",
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        if input_shape is None and input_size is None:
            raise ValueError("Expected one of input_shape or input_size.")

        # Computing the feature dimensionality
        if input_size is None:
            if len(input_shape) > 3:
                self.reshape = True
            input_size = torch.prod(torch.tensor(input_shape[1:]))

        kwargs = {
            "input_size": input_size,
            "hidden_size": self.hidden_size,
            "bias": bias,
            "nonlinearity": nonlinearity,
        }

        self.rnn_cells = nn.ModuleList([torch.nn.RNNCell(**kwargs)])
        kwargs["input_size"] = self.hidden_size

        for i in range(self.num_layers - 1):
            self.rnn_cells.append(torch.nn.RNNCell(**kwargs))

        self.dropout_layers = nn.ModuleList(
            [torch.nn.Dropout(p=dropout) for _ in range(self.num_layers - 1)]
        )

        if re_init:
            rnn_init(self.rnn_cells)

    def forward(self, x, hx=None):
        """Returns the output of the RNNCell.

        Arguments
        ---------
        x : torch.Tensor
            The input of RNNCell.
        hx : torch.Tensor
            The hidden states of RNNCell.

        Returns
        -------
        h : torch.Tensor
            Outputs of RNNCell.
        hidden : torch.Tensor
            Hidden states.
        """
        if hx is None:
            hx = x.new_zeros(self.num_layers, x.shape[0], self.hidden_size)

        h = self.rnn_cells[0](x, hx[0])
        hidden_lst = [h]

        for i in range(1, self.num_layers):
            drop_h = self.dropout_layers[i - 1](h)
            h = self.rnn_cells[i](drop_h, hx[i])
            hidden_lst.append(h)

        hidden = torch.stack(hidden_lst, dim=0)
        return h, hidden


class GRUCell(nn.Module):
    """This class implements a basic GRU Cell for a timestep of input,
 fdd	Zddd	Z  ZS )GRUCella  This class implements a basic GRU Cell for a timestep of input,
    while GRU() takes the whole sequence as input.

    It is designed for an autoregressive decoder (ex. attentional decoder),
    which takes one input at a time.
    Using torch.nn.GRUCell() instead of torch.nn.GRU() to reduce VRAM
    consumption.
    It accepts input tensors formatted as (batch, fea).

    Arguments
    ---------
    hidden_size : int
        Number of output neurons (i.e, the dimensionality of the output).
    input_shape : tuple
        The shape of an example input. Alternatively, use ``input_size``.
    input_size : int
        The size of the input. Alternatively, use ``input_shape``.
    num_layers : int
        Number of layers to employ in the GRU architecture.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 20])
    >>> net = GRUCell(hidden_size=5, input_shape=inp_tensor.shape)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    """

    def __init__(
        self,
        hidden_size,
        input_shape=None,
        input_size=None,
        num_layers=1,
        bias=True,
        dropout=0.0,
        re_init=True,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        if input_shape is None and input_size is None:
            raise ValueError("Expected one of input_shape or input_size.")

        # Computing the feature dimensionality
        if input_size is None:
            if len(input_shape) > 3:
                self.reshape = True
            input_size = torch.prod(torch.tensor(input_shape[1:]))

        kwargs = {
            "input_size": input_size,
            "hidden_size": self.hidden_size,
            "bias": bias,
        }

        self.rnn_cells = nn.ModuleList([torch.nn.GRUCell(**kwargs)])
        kwargs["input_size"] = self.hidden_size

        for i in range(self.num_layers - 1):
            self.rnn_cells.append(torch.nn.GRUCell(**kwargs))

        self.dropout_layers = nn.ModuleList(
            [torch.nn.Dropout(p=dropout) for _ in range(self.num_layers - 1)]
        )

        if re_init:
            rnn_init(self.rnn_cells)

    def forward(self, x, hx=None):
        """Returns the output of the GRUCell.

        Arguments
        ---------
        x : torch.Tensor
            The input of GRUCell.
        hx : torch.Tensor
            The hidden states of GRUCell.

        Returns
        -------
        h : torch.Tensor
            Outputs of GRUCell
        hidden : torch.Tensor
            Hidden states.
        """
        if hx is None:
            hx = x.new_zeros(self.num_layers, x.shape[0], self.hidden_size)

        h = self.rnn_cells[0](x, hx[0])
        hidden_lst = [h]

        for i in range(1, self.num_layers):
            drop_h = self.dropout_layers[i - 1](h)
            h = self.rnn_cells[i](drop_h, hx[i])
            hidden_lst.append(h)

        hidden = torch.stack(hidden_lst, dim=0)
        return h, hidden


class LSTMCell(nn.Module):
    """This class implements a basic LSTM Cell for a timestep of input,
    while LSTM() takes the whole sequence as input.

    It is designed for an autoregressive decoder (ex. attentional decoder),
    which takes one input at a time.
    Using torch.nn.LSTMCell() instead of torch.nn.LSTM() to reduce VRAM
    consumption.
    It accepts input tensors formatted as (batch, fea).

    Arguments
    ---------
    hidden_size : int
        Number of output neurons (i.e, the dimensionality of the output).
    input_shape : tuple
        The shape of an example input. Alternatively, use ``input_size``.
    input_size : int
        The size of the input. Alternatively, use ``input_shape``.
    num_layers : int
        Number of layers to employ in the LSTM architecture.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 20])
    >>> net = LSTMCell(hidden_size=5, input_shape=inp_tensor.shape)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
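
    The returned state is a ``(hidden, cell)`` tuple that can be fed back
    in on the next step (a sketch):

    >>> out_tensor, hx = net(inp_tensor)
    >>> out_tensor, hx = net(inp_tensor, hx)
    >>> out_tensor.shape
    torch.Size([4, 5])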
    """

    def __init__(
        self,
        hidden_size,
        input_shape=None,
        input_size=None,
        num_layers=1,
        bias=True,
        dropout=0.0,
        re_init=True,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        if input_shape is None and input_size is None:
            raise ValueError("Expected one of input_shape or input_size.")

        # Computing the feature dimensionality
        if input_size is None:
            if len(input_shape) > 3:
                self.reshape = True
            input_size = torch.prod(torch.tensor(input_shape[1:]))

        kwargs = {
            "input_size": input_size,
            "hidden_size": self.hidden_size,
            "bias": bias,
        }

        self.rnn_cells = nn.ModuleList([torch.nn.LSTMCell(**kwargs)])
        kwargs["input_size"] = self.hidden_size

        for i in range(self.num_layers - 1):
            self.rnn_cells.append(torch.nn.LSTMCell(**kwargs))

        self.dropout_layers = nn.ModuleList(
            [torch.nn.Dropout(p=dropout) for _ in range(self.num_layers - 1)]
        )

        if re_init:
            rnn_init(self.rnn_cells)

    def forward(self, x, hx=None):
        """Returns the output of the LSTMCell.
        Arguments
        ---------
        x : torch.Tensor
            The input of LSTMCell.
        hx : torch.Tensor
            The hidden states of LSTMCell.

        Returns
        -------
        h : torch.Tensor
            Outputs
        Tuple of (hidden, cell)
        Nr   r   rf   rh   )r,   r:   r4   rl   crm   cell_lstrd   rn   ro   cellr   r   r   r=     s   ((
zLSTMCell.forwardrw   rp   r@   r   r   r/   r   ry     rx   ry   c                       sB   e Zd ZdZ								d fdd		Zd
d Zdd Z  ZS )AttentionalRNNDecodera	  This function implements RNN decoder model with attention.

    This function implements different RNN models. It accepts in enc_states
    tensors formatted as (batch, time, fea). In the case of 4d inputs
    like (batch, time, fea, channel) the tensor is flattened in this way:
    (batch, time, fea*channel).

    Arguments
    ---------
    rnn_type : str
        Type of recurrent neural network to use (rnn, lstm, gru).
    attn_type : str
        Type of attention to use (location, content, keyvalue).
    hidden_size : int
        Number of the neurons.
    attn_dim : int
        Number of attention module internal and output neurons.
    num_layers : int
        Number of layers to employ in the RNN architecture.
    enc_dim : int
        Size of encoding dimension.
    input_size : int
        Expected size of the relevant input dimension.
    nonlinearity : str
        Type of nonlinearity (tanh, relu). This option is active for
        rnn and ligru models only. For lstm and gru tanh is used.
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.
    normalization : str
        Type of normalization for the ligru model (batchnorm, layernorm).
        Every string different from batchnorm and layernorm will result
        in no normalization.
    scaling : float
        A scaling factor to sharpen or smoothen the attention distribution.
    channels : int
        Number of channels for location-aware attention.
    kernel_size : int
        Size of the kernel for location-aware attention.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).

    Example
    -------
    >>> batch_size = 4
    >>> enc_states = torch.rand([batch_size, 10, 20])
    >>> wav_len = torch.ones([batch_size])
    >>> inp_tensor = torch.rand([batch_size, 5, 6])
    >>> net = AttentionalRNNDecoder(
    ...     rnn_type="lstm",
    ...     attn_type="content",
    ...     hidden_size=7,
    ...     attn_dim=5,
    ...     num_layers=1,
    ...     enc_dim=20,
    ...     input_size=6,
    ... )
    >>> out_tensor, attn = net(inp_tensor, enc_states, wav_len)
    >>> out_tensor.shape
    torch.Size([4, 5, 7])
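
    The returned alignment has shape (batch, dec_len, enc_len) (a sketch):

    >>> attn.shape
    torch.Size([4, 5, 10])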
    """

    def __init__(
        self,
        rnn_type,
        attn_type,
        hidden_size,
        attn_dim,
        num_layers,
        enc_dim,
        input_size,
        nonlinearity="relu",
        re_init=True,
        normalization="batchnorm",
        scaling=1.0,
        channels=None,
        kernel_size=None,
        bias=True,
        dropout=0.0,
    ):
        super().__init__()

        self.rnn_type = rnn_type.lower()
        self.attn_type = attn_type.lower()
        self.hidden_size = hidden_size
        self.attn_dim = attn_dim
        self.num_layers = num_layers
        self.scaling = scaling
        self.bias = bias
        self.dropout = dropout
        self.normalization = normalization
        self.re_init = re_init
        self.nonlinearity = nonlinearity

        # only for location-aware attention
        self.channels = channels
        self.kernel_size = kernel_size

        # Combining the context vector and the output of the rnn
        self.proj = nn.Linear(
            self.hidden_size + self.attn_dim, self.hidden_size
        )

        if self.attn_type == "content":
            self.attn = ContentBasedAttention(
                enc_dim=enc_dim,
                dec_dim=self.hidden_size,
                attn_dim=self.attn_dim,
                output_dim=self.attn_dim,
                scaling=self.scaling,
            )
        elif self.attn_type == "location":
            self.attn = LocationAwareAttention(
                enc_dim=enc_dim,
                dec_dim=self.hidden_size,
                attn_dim=self.attn_dim,
                output_dim=self.attn_dim,
                conv_channels=self.channels,
                kernel_size=self.kernel_size,
                scaling=self.scaling,
            )
        elif self.attn_type == "keyvalue":
            self.attn = KeyValueAttention(
                enc_dim=enc_dim,
                dec_dim=self.hidden_size,
                attn_dim=self.attn_dim,
                output_dim=self.attn_dim,
            )
        else:
            raise ValueError(f"{self.attn_type} is not implemented.")

        self.drop = nn.Dropout(p=self.dropout)

        # set dropout to 0 when only one layer
        dropout = 0.0 if self.num_layers == 1 else self.dropout

        # cell implementations are used to reduce VRAM consumption
        if self.rnn_type == "rnn":
            cell_class = RNNCell
        elif self.rnn_type == "gru":
            cell_class = GRUCell
        elif self.rnn_type == "lstm":
            cell_class = LSTMCell
        else:
            raise ValueError(f"{self.rnn_type} not implemented.")

        kwargs = {
            "input_size": input_size + self.attn_dim,
            "hidden_size": self.hidden_size,
            "num_layers": self.num_layers,
            "bias": self.bias,
            "dropout": dropout,
            "re_init": self.re_init,
        }
        if self.rnn_type == "rnn":
            kwargs["nonlinearity"] = self.nonlinearity

        self.rnn = cell_class(**kwargs)

    def forward_step(self, inp, hs, c, enc_states, enc_len):
        """One step of forward pass process.

        Arguments
        ---------
        inp : torch.Tensor
            The input of current timestep.
        hs : torch.Tensor or tuple of torch.Tensor
            The cell state for RNN.
        c : torch.Tensor
            The context vector of previous timestep.
        enc_states : torch.Tensor
            The tensor generated by encoder, to be attended.
        enc_len : torch.LongTensor
            The actual length of encoder states.

        Returns
        -------
        dec_out : torch.Tensor
            The output tensor.
        hs : torch.Tensor or tuple of torch.Tensor
            The new cell state for RNN.
        c : torch.Tensor
            The context vector of the current timestep.
        w : torch.Tensor
            The weight of attention.
        rf   r   )r   catr   r   r   r   )
r,   inphsrz   
enc_statesenc_lencell_inpcell_outwdec_outr   r   r   forward_step  s   

z"AttentionalRNNDecoder.forward_stepc                 C   s   t |jd |  }| j  t j|jd | j|jd}d}g g }}t	|jd D ]}	| 
|dd|	f ||||\}
}}}||
 || q,t j|dd}
t j|dd}|
|fS )aI  This method implements the forward pass of the attentional RNN decoder.

        Arguments
        ---------
        inp_tensor : torch.Tensor
            The input tensor for each timesteps of RNN decoder.
        enc_states : torch.Tensor
            The tensor to be attended by the decoder.
        wav_len : torch.Tensor
            This variable stores the relative length of the waveform.

        Returns
        -------
        outputs : torch.Tensor
            The output of the RNN decoder.
        attn : torch.Tensor
            The attention weight of each timestep.
        """
        # calculating the actual length of enc_states
        enc_len = torch.round(enc_states.shape[1] * wav_len).long()

        # initialization
        self.attn.reset()
        c = torch.zeros(
            enc_states.shape[0], self.attn_dim, device=enc_states.device
        )
        hs = None

        # store the outputs and attention weights of every timestep
        outputs_lst, attn_lst = [], []
        for t in range(inp_tensor.shape[1]):
            outputs, hs, c, w = self.forward_step(
                inp_tensor[:, t], hs, c, enc_states, enc_len
            )
            outputs_lst.append(outputs)
            attn_lst.append(w)

        # [B, L_d, hidden_size]
        outputs = torch.stack(outputs_lst, dim=1)

        # [B, L_d, L_e]
        attn = torch.stack(attn_lst, dim=1)

        return outputs, attn


class LiGRU(torch.nn.Module):
    """This function implements a Light GRU (Li-GRU).

    Li-GRU is single-gate GRU model based on batch-norm + relu
    activations + recurrent dropout. For more info see:

    "M. Ravanelli, P. Brakel, M. Omologo, Y. Bengio,
    Light Gated Recurrent Units for Speech Recognition,
    in IEEE Transactions on Emerging Topics in Computational Intelligence,
    2018" (https://arxiv.org/abs/1803.10225)

    If you face instabilities during training, instead use the Stabilised Li-GRU (SLi-GRU).
    See:
        - speechbrain.nnet.RNN.SLiGRU

    To improve the speed of the model, it is recommended to use the torch just-in-time compiler (jit)
    right before using it or you can use the custom implementation (CUDA+PyTorch) that is available
    at https://github.com/Adel-Moumen/fast_ligru.

    You can compile it with:
    compiled_model = torch.jit.script(model)

    It accepts input tensors formatted as (batch, time, fea).
    In the case of 4d inputs like (batch, time, fea, channel) the tensor is
    flattened as (batch, time, fea*channel).

    Arguments
    ---------
    hidden_size : int
        Number of output neurons (i.e., the dimensionality of the output).
    input_shape : tuple
        The shape of an example input.
    nonlinearity : str
        Type of nonlinearity (tanh, relu).
    normalization : str
        Type of normalization for the ligru model (batchnorm, layernorm).
        Every string different from batchnorm and layernorm will result
        in no normalization.
    num_layers : int
        Number of layers to employ in the RNN architecture.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.
    bidirectional : bool
        If True, a bidirectional model that scans the sequence both
        right-to-left and left-to-right is used.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 10, 20])
    >>> net = LiGRU(input_shape=inp_tensor.shape, hidden_size=5)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 10, 5])
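
    A bidirectional model doubles the feature dimension of the output
    (a sketch):

    >>> net = LiGRU(input_shape=inp_tensor.shape, hidden_size=5, bidirectional=True)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 10, 10])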
    """

    def __init__(
        self,
        hidden_size,
        input_shape,
        nonlinearity="relu",
        normalization="batchnorm",
        num_layers=1,
        bias=True,
        dropout=0.0,
        re_init=True,
        bidirectional=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.nonlinearity = nonlinearity
        self.num_layers = num_layers
        self.normalization = normalization
        self.bias = bias
        self.dropout = dropout
        self.re_init = re_init
        self.bidirectional = bidirectional
        self.reshape = False

        # Computing the feature dimensionality
        if len(input_shape) > 3:
            self.reshape = True
        self.fea_dim = float(torch.prod(torch.tensor(input_shape[2:])))
        self.batch_size = input_shape[0]
        self.rnn = self._init_layers()

        if self.re_init:
            rnn_init(self.rnn)

    def _init_layers(self):
        """Initializes the layers of the Li-GRU."""
        rnn = torch.nn.ModuleList([])
        current_dim = self.fea_dim

        for i in range(self.num_layers):
            rnn_lay = LiGRU_Layer(
                current_dim,
                self.hidden_size,
                self.num_layers,
                self.batch_size,
                dropout=self.dropout,
                nonlinearity=self.nonlinearity,
                normalization=self.normalization,
                bias=self.bias,
                bidirectional=self.bidirectional,
            )
            rnn.append(rnn_lay)

            if self.bidirectional:
                current_dim = self.hidden_size * 2
            else:
                current_dim = self.hidden_size
        return rnn

    def forward(self, x, hx: Optional[torch.Tensor] = None):
        """Returns the output of the Li-GRU.
        Arguments
        ---------
        x : torch.Tensor
            The input tensor.
        hx : torch.Tensor
            Starting hidden state.

        Returns
        -------
        output : torch.Tensor
            Output of LiGRU
        hh : torch.Tensor
            Hidden states
        """
        # Reshaping input tensors for 4d inputs
        if self.reshape:
            if x.ndim == 4:
                x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])

        # run ligru
        output, hh = self._forward_ligru(x, hx=hx)

        return output, hh

    def _forward_ligru(self, x, hx: Optional[torch.Tensor]):
        """Returns the output of the vanilla Li-GRU.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor.
        hx : torch.Tensor

        Returns
        -------
        x : torch.Tensor
            Output tensor.
        h : torch.Tensor
            The hidden states.
        """
        h = []
        if hx is not None:
            if self.bidirectional:
                hx = hx.reshape(
                    self.num_layers, self.batch_size * 2, self.hidden_size
                )
        # Processing the different layers
        for i, ligru_lay in enumerate(self.rnn):
            if hx is not None:
                x = ligru_lay(x, hx=hx[i])
            else:
                x = ligru_lay(x, hx=None)
            h.append(x[:, -1, :])
        h = torch.stack(h, dim=1)

        if self.bidirectional:
            h = h.reshape(h.shape[1] * 2, h.shape[0], self.hidden_size)
        else:
            h = h.transpose(0, 1)

        return x, h


class LiGRU_Layer(torch.nn.Module):
    """This class implements Light-Gated Recurrent Units (Li-GRU) layer.

    Arguments
    ---------
    input_size : int
        Feature dimensionality of the input tensors.
    hidden_size : int
        Number of output neurons.
    num_layers : int
        The layer number.
    batch_size : int
        Batch size of the input tensors.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    nonlinearity : str
        Type of nonlinearity (tanh, sin, leaky_relu, relu).
    normalization : str
        Type of normalization (batchnorm, layernorm).
        Every string different from batchnorm and layernorm will result
        in layer normalization.
    bias: bool
        If True, the additive bias b is adopted.
    bidirectional : bool
        if True, a bidirectional model that scans the sequence both
        right-to-left and left-to-right is used.
    r   r   r~   TFc
           
         sx  t    t|| _t|| _|| _|	| _|| _|| _t	j
| jd| j dd| _t	j
| jd| j dd| _| jr>| jd | _d| _|dkrTt	jd| j dd| _d| _n|dkrftj	d| j | _d| _ntj	d| j | _d| _| js| jjjd	 d| jj_| d
td| j |   |dkrtj	 | _d S |dkrtj| _d S |dkrtj	 | _d S tj	 | _d S )Nr   Fr"   r~   皙?momentumT	layernormr   h_initr   rQ   sin
leaky_relu)r$   r%   intr   r   r   r!   r    r"   r   r   r   u	normalizeBatchNorm1dnormr   	LayerNormdatafill_requires_gradregister_bufferr   
_init_dropTanhactr   	LeakyReLUReLU)
r,   r   r   r   r   r    r#   r   r"   r!   r/   r   r   r%     s@   



zLiGRU_Layer.__init__Nr4   c           
      C      | j r|d}tj||gdd}| | | |}| jrB| ||j	d |j	d  |j	d }||j	d |j	d |j	d }|durM| 
||}n| j|j	d | jj	d }| 
||}| j r{|jddd\}}	|	d}	tj||	gdd}|S )a  Returns the output of the liGRU layer.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor.
        hx : torch.Tensor
            Hidden state.

        Returns
        -------
        h : torch.Tensor
            The output of the liGRU.
        r   r   rf   r   N)r!   flipr   r   _change_batch_sizer   r   r   r&   r7   _ligru_cellr   broadcast_tochunk
r,   r:   r4   x_flipr   w_bnrl   r   h_fh_br   r   r   r=     "   


( 
zLiGRU_Layer.forwardc                 C   s   g }|  |}t|jd D ]2}|dd|f | | }|dd\}}t|}| || }	|| d| |	  }|| qtj	|dd}
|
S )a2  Returns the hidden states for each time step.

        Arguments
        ---------
        w : torch.Tensor
            Linearly transformed input.
        ht : torch.Tensor
            Hidden state.

        Returns
        -------
        h : torch.Tensor
            Hidden state for each step.
        r   Nr   rf   )
_sample_drop_maskr`   r7   r   r   r   sigmoidr   ra   rj   r,   r   hthiddens	drop_maskkgatesatzthcandrl   r   r   r   r   ;  s   

zLiGRU_Layer._ligru_cellc              	   C   s\   t jj| jdd| _d| _d| _| d| t | j| j	j
 | dt dg  dS )	wInitializes the recurrent dropout operation. To speed it up,
        the dropout masks are sampled in advance.
        FrU   inplace>  r   
drop_masksdrop_mask_ter   Nr   r   rW   r    r   N_drop_masksdrop_mask_cntr   onesr   r   r*   r   r,   r   r   r   r   \  s   zLiGRU_Layer._init_dropc                 C      | j r4| j| j | jkrd| _| tj| j| j|jdj	| _
| j
| j| j| j  }| j| j | _|S | j|j| _| j}|S z,Selects one of the pre-defined dropout masksr   r   trainingr   r   r   r   r   r   r   r   r   r   r   tor,   r   r   r   r   r   r   j  $   zLiGRU_Layer._sample_drop_maskc                 C   N   | j |jd kr#|jd | _ | jr%| tj| j| j|jdj	| _
dS dS dS a  This function changes the batch size when it is different from
        the one detected in the initialization method. This might happen in
        the case of multi-gpu or when we have different batch sizes in train
        and test. We also update the h_init and drop masks.
        """
        if self.batch_size != x.shape[0]:
            self.batch_size = x.shape[0]

            if self.training:
                self.drop_masks = self.drop(
                    torch.ones(
                        self.N_drop_masks, self.hidden_size, device=x.device
                    )
                ).data


class SLiGRU(torch.nn.Module):
    """This class implements a Stabilised Light GRU (SLi-GRU).

    SLi-GRU is single-gate GRU model based on batch-norm + relu
    activations + layer-norm on the recurrent connections + recurrent dropout.

    The SLi-GRU differs from the vanilla Li-GRU on the recurrent weights. Indeed, the Li-GRU
    suffers from an exploding gradient problem on the recurrent weights, and cannot be trained on medium to large ASR dataset.
    To solve this problem, we use a layer-norm on the recurrent weights that stabilises the training of the model and allows one
    to train it on large ASR datasets without any problem.

    This model beat traditional LSTM/GRU models on the CommonVoice/LibriSpeech datasets (WER and efficiency).

    For more info see:
    "Moumen, A., & Parcollet, T. (2023, June). Stabilising and accelerating light gated recurrent units for automatic speech recognition.
    In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 1-5). IEEE."
    (https://arxiv.org/abs/2302.10144)
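
    Compared to the Li-GRU, the recurrent contribution is layer-normalised
    before entering the gates, roughly (a sketch):

        z_t = sigmoid(BN(W_z x_t) + LN(U_z h_{t-1}))
        hcand_t = relu(BN(W_h x_t) + LN(U_h h_{t-1}))
        h_t = z_t * h_{t-1} + (1 - z_t) * hcand_t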

    To improve the speed of the model, it is recommended to use the torch just-in-time compiler (jit)
    right before using it or you can use the custom implementation (CUDA+PyTorch) that is available
    at https://github.com/Adel-Moumen/fast_ligru.

    You can compile it with:
    compiled_model = torch.jit.script(model)

    It accepts input tensors formatted as (batch, time, fea).
    In the case of 4d inputs like (batch, time, fea, channel) the tensor is
    flattened as (batch, time, fea*channel).

    Arguments
    ---------
    hidden_size : int
        Number of output neurons (i.e., the dimensionality of the output).
    input_shape : tuple
        The shape of an example input.
    nonlinearity : str
        Type of nonlinearity (tanh, relu).
    ff_normalization : str
        Type of feedforward normalization for the ligru model (batchnorm, layernorm).
        Every string different from batchnorm and layernorm will result
        in no normalization.
    recurrent_elementwise_affine : bool
        A boolean value that when set to True will enable the learnable affine parameters.
    num_layers : int
        Number of layers to employ in the RNN architecture.
    bias : bool
        If True, the additive bias b is adopted.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    re_init : bool
        If True, orthogonal initialization is used for the recurrent weights.
        Xavier initialization is used for the input connection weights.
    bidirectional : bool
        If True, a bidirectional model that scans the sequence both
        right-to-left and left-to-right is used.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 10, 20])
    >>> net = SLiGRU(input_shape=inp_tensor.shape, hidden_size=5)
    >>> out_tensor, _ = net(inp_tensor)
    >>> out_tensor.shape
    torch.Size([4, 10, 5])
    """

    def __init__(
        self,
        hidden_size,
        input_shape,
        nonlinearity="relu",
        ff_normalization="batchnorm",
        recurrent_elementwise_affine=False,
        num_layers=1,
        bias=True,
        dropout=0.0,
        re_init=True,
        bidirectional=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.nonlinearity = nonlinearity
        self.num_layers = num_layers
        self.ff_normalization = ff_normalization
        self.recurrent_elementwise_affine = recurrent_elementwise_affine
        self.bias = bias
        self.dropout = dropout
        self.re_init = re_init
        self.bidirectional = bidirectional
        self.reshape = False

        # Computing the feature dimensionality
        if len(input_shape) > 3:
            self.reshape = True
        self.fea_dim = float(torch.prod(torch.tensor(input_shape[2:])))
        self.batch_size = input_shape[0]
        self.rnn = self._init_layers()

        if self.re_init:
            rnn_init(self.rnn)

    def _init_layers(self):
        """Initializes the layers of the SLi-GRU."""
        rnn = torch.nn.ModuleList([])
        current_dim = self.fea_dim

        for i in range(self.num_layers):
            rnn_lay = SLiGRU_Layer(
                current_dim,
                self.hidden_size,
                self.num_layers,
                self.batch_size,
                dropout=self.dropout,
                nonlinearity=self.nonlinearity,
                ff_normalization=self.ff_normalization,
                recurrent_elementwise_affine=self.recurrent_elementwise_affine,
                bias=self.bias,
                bidirectional=self.bidirectional,
            )
            rnn.append(rnn_lay)

            if self.bidirectional:
                current_dim = self.hidden_size * 2
            else:
                current_dim = self.hidden_size
        return rnn

    def forward(self, x, hx: Optional[torch.Tensor] = None):
        """Returns the output of the SLi-GRU.

        Arguments
        ---------
        x : torch.Tensor
            The input tensor.
        hx : torch.Tensor
            Starting hidden state.

        Returns
        -------
        output : torch.Tensor
            Output of SLiGRU
        hh : torch.Tensor
            Hidden states
        """
        # Reshaping input tensors for 4d inputs
        if self.reshape:
            if x.ndim == 4:
                x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3])

        # run sligru
        output, hh = self._forward_sligru(x, hx=hx)

        return output, hh

    def _forward_sligru(self, x, hx: Optional[torch.Tensor]):
        """Returns the output of the vanilla SLi-GRU.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor.
        hx : torch.Tensor

        Returns
        -------
        x : torch.Tensor
            Output of SLiGRU
        h : torch.Tensor
            Hidden states
        """
        h = []
        if hx is not None:
            if self.bidirectional:
                hx = hx.reshape(
                    self.num_layers, self.batch_size * 2, self.hidden_size
                )
        # Processing the different layers
        for i, sligru_lay in enumerate(self.rnn):
            if hx is not None:
                x = sligru_lay(x, hx=hx[i])
            else:
                x = sligru_lay(x, hx=None)
            h.append(x[:, -1, :])
        h = torch.stack(h, dim=1)

        if self.bidirectional:
            h = h.reshape(h.shape[1] * 2, h.shape[0], self.hidden_size)
        else:
            h = h.transpose(0, 1)

        return x, h


class SLiGRU_Layer(torch.nn.Module):
    """This class implements a Stabilised Light-Gated Recurrent Units (SLi-GRU) layer.

    Arguments
    ---------
    input_size : int
        Feature dimensionality of the input tensors.
    hidden_size : int
        Number of output neurons.
    num_layers : int
        The layer number.
    batch_size : int
        Batch size of the input tensors.
    dropout : float
        It is the dropout factor (must be between 0 and 1).
    nonlinearity : str
        Type of nonlinearity (tanh, sin, leaky_relu, relu).
    ff_normalization : str
        Type of normalization (batchnorm, layernorm).
        Every string different from batchnorm and layernorm will result
        in layer normalization.
        Note that this only applies to the feedforward affine transform.
        SLi-GRU (unlike Li-GRU) unconditionally applies layer normalization in
        the recurrent layers, which is unaffected by this parameter.
    recurrent_elementwise_affine : bool
        A boolean value that when set to True will enable the learnable affine parameters.
    bias : bool
        If True, the additive bias b is adopted.
    bidirectional : bool
        If True, a bidirectional model that scans the sequence both
        right-to-left and left-to-right is used.
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers,
        batch_size,
        dropout=0.0,
        nonlinearity="relu",
        ff_normalization="batchnorm",
        recurrent_elementwise_affine=False,
        bias=True,
        bidirectional=False,
    ):
        super().__init__()
        self.hidden_size = int(hidden_size)
        self.input_size = int(input_size)
        self.batch_size = batch_size
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.bias = bias

        self.w = nn.Linear(self.input_size, 2 * self.hidden_size, bias=False)
        self.u = nn.Linear(self.hidden_size, 2 * self.hidden_size, bias=False)

        # Layer normalization of the recurrent connections (this is what
        # stabilises the Li-GRU training)
        self.layer_norm = nn.LayerNorm(
            2 * self.hidden_size,
            elementwise_affine=recurrent_elementwise_affine,
        )

        if self.bidirectional:
            self.batch_size = self.batch_size * 2

        # Initializing the feedforward normalization
        self.normalize = False

        if ff_normalization == "batchnorm":
            self.norm = nn.BatchNorm1d(2 * self.hidden_size, momentum=0.05)
            self.normalize = True
        elif ff_normalization == "layernorm":
            self.norm = torch.nn.LayerNorm(2 * self.hidden_size)
            self.normalize = True
        else:
            # Normalization is disabled here. self.norm is only formally
            # initialized to avoid jit issues.
            self.norm = torch.nn.LayerNorm(2 * self.hidden_size)
            self.normalize = False

        # The bias of the normalization layer is frozen when bias=False
        if not self.bias:
            self.norm.bias.data.fill_(0)
            self.norm.bias.requires_grad = False

        # Initial state
        self.register_buffer("h_init", torch.zeros(1, self.hidden_size))

        # Preloading dropout masks (gives some speed improvement)
        self._init_drop()

        # Setting the activation function
        if nonlinearity == "tanh":
            self.act = torch.nn.Tanh()
        elif nonlinearity == "sin":
            self.act = torch.sin
        elif nonlinearity == "leaky_relu":
            self.act = torch.nn.LeakyReLU()
        else:
            self.act = torch.nn.ReLU()

    def forward(self, x, hx: Optional[torch.Tensor] = None):
        """Returns the output of the SLi-GRU layer.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor.
        hx : torch.Tensor
            Hidden state.

        Returns
        -------
        h : torch.Tensor
            The output of the SLi-GRU layer.
        """
        if self.bidirectional:
            x_flip = x.flip(1)
            x = torch.cat([x, x_flip], dim=0)

        # Change batch size if needed
        self._change_batch_size(x)

        # Feed-forward affine transformations (all steps in parallel)
        w = self.w(x)

        # Apply batch normalization
        if self.normalize:
            w_bn = self.norm(w.reshape(w.shape[0] * w.shape[1], w.shape[2]))
            w = w_bn.reshape(w.shape[0], w.shape[1], w.shape[2])

        # Processing time steps
        if hx is not None:
            h = self._sligru_cell(w, hx)
        else:
            # broadcast to include batch size
            h_init = self.h_init.broadcast_to(w.shape[0], self.h_init.shape[1])
            h = self._sligru_cell(w, h_init)

        if self.bidirectional:
            h_f, h_b = h.chunk(2, dim=0)
            h_b = h_b.flip(1)
            h = torch.cat([h_f, h_b], dim=2)

        return h

    def _sligru_cell(self, w, ht):
        """Returns the hidden states for each time step.

        Arguments
        ---------
        w : torch.Tensor
            Linearly transformed input.
        ht : torch.Tensor
            Hidden state.

        Returns
        -------
        h : torch.Tensor
            The hidden states for each step.
        """
        hiddens = []

        # Sampling dropout mask
        drop_mask = self._sample_drop_mask(w)

        # Loop over time axis
        for k in range(w.shape[1]):
            gates = w[:, k] + self.layer_norm(self.u(ht))
            at, zt = gates.chunk(2, dim=1)
            zt = torch.sigmoid(zt)
            hcand = self.act(at) * drop_mask
            ht = zt * ht + (1 - zt) * hcand
            hiddens.append(ht)

        # Stacking hidden states
        h = torch.stack(hiddens, dim=1)
        return h

    def _init_drop(self):
        """Initializes the recurrent dropout operation. To speed it up,
        the dropout masks are sampled in advance.
        """
        self.drop = torch.nn.Dropout(p=self.dropout, inplace=False)

        self.N_drop_masks = 16000
        self.drop_mask_cnt = 0

        self.register_buffer(
            "drop_masks",
            self.drop(torch.ones(self.N_drop_masks, self.hidden_size)).data,
            persistent=False,
        )
        self.register_buffer("drop_mask_te", torch.tensor(1.0).float())

    def _sample_drop_mask(self, w):
        """Selects one of the pre-defined dropout masks"""
        if self.training:
            # Sample new masks when needed
            if self.drop_mask_cnt + self.batch_size > self.N_drop_masks:
                self.drop_mask_cnt = 0
                self.drop_masks = self.drop(
                    torch.ones(
                        self.N_drop_masks, self.hidden_size, device=w.device
                    )
                ).data

            # Sampling the mask
            drop_mask = self.drop_masks[
                self.drop_mask_cnt : self.drop_mask_cnt + self.batch_size
            ]
            self.drop_mask_cnt = self.drop_mask_cnt + self.batch_size

        else:
            self.drop_mask_te = self.drop_mask_te.to(w.device)
            drop_mask = self.drop_mask_te

        return drop_mask

    def _change_batch_size(self, x):
        """This function changes the batch size when it is different from
        the one detected in the initialization method. This might happen in
        the case of multi-gpu or when we have different batch sizes in train
        and test. We also update the h_init and drop masks.
        """
        if self.batch_size != x.shape[0]:
            self.batch_size = x.shape[0]

            if self.training:
                self.drop_masks = self.drop(
                    torch.ones(
                        self.N_drop_masks, self.hidden_size, device=x.device
                    )
                ).data


class QuasiRNNLayer(torch.nn.Module):
    """Applies a single layer Quasi-Recurrent Neural Network (QRNN) to an
    input sequence.

    Arguments
    ---------
    input_size : int
        The number of expected features in the input x.
    hidden_size : int
        The number of features in the hidden state h. If not specified,
        the input size is used.
    bidirectional : bool
        Whether to apply the RNN in both forward and backward directions.
    zoneout : float
        Whether to apply zoneout (i.e. failing to update elements in the
        hidden state) to the hidden state updates. Default: 0.
    output_gate : bool
        If True, performs QRNN-fo (applying an output gate to the output).
        If False, performs QRNN-f. Default: True.

    Example
    -------
    >>> import torch
    >>> model = QuasiRNNLayer(60, 256, bidirectional=True)
    >>> a = torch.rand([10, 120, 60])
    >>> b = model(a)
    >>> b[0].shape
    torch.Size([10, 120, 512])
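
    Internally, the gates are computed in parallel over time and combined by
    a recurrent pooling, roughly (a sketch; o_t exists only for QRNN-fo):

        z_t = tanh(W_z x_t);  f_t = sigmoid(W_f x_t);  o_t = sigmoid(W_o x_t)
        c_t = f_t * z_t + (1 - f_t) * c_{t-1}
        h_t = o_t * c_t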
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        bidirectional,
        zoneout=0.0,
        output_gate=True,
    ):
        super().__init__()

        self.hidden_size = hidden_size
        self.zoneout = zoneout
        self.output_gate = output_gate
        self.bidirectional = bidirectional

        stacked_hidden = (
            3 * self.hidden_size if self.output_gate else 2 * self.hidden_size
        )
        self.w = torch.nn.Linear(input_size, stacked_hidden, True)

        self.z_gate = nn.Tanh()
        self.f_gate = nn.Sigmoid()
        if self.output_gate:
            self.o_gate = nn.Sigmoid()

    def forgetMult(self, f, x, hidden):
        """Returns the hidden states for each time step.

        Arguments
        ---------
        f : torch.Tensor
        x : torch.Tensor
            Input tensors
        hidden : torch.Tensor
            First hidden state if any.

        Returns
        -------
        Hidden states for each step.
        """
        result = []
        htm1 = hidden
        hh = f * x

        for i in range(hh.shape[0]):
            h_t = hh[i, :, :]
            ft = f[i, :, :]

            # Blend in the previous hidden state, if one exists
            if htm1 is not None:
                h_t = h_t + (1 - ft) * htm1

            result.append(h_t)
            htm1 = h_t

        return torch.stack(result)

    def split_gate_inputs(self, y):
        """Splits the input gates."""
        if self.output_gate:
            z, f, o = y.chunk(3, dim=-1)
        else:
            z, f = y.chunk(2, dim=-1)
            o = None
        return z, f, o

    def forward(self, x, hidden=None):
        """Returns the output of the QRNN layer.

        Arguments
        ---------
        x : torch.Tensor
            Input to transform linearly.
        hidden : torch.Tensor
            Initial hidden state, if any.

        Returns
        -------
        h : torch.Tensor
        c : torch.Tensor
        r2   r   r   r   r   rf   Nr   )r6   r&   r7   permuter!   r   r   r   r   r"  r  r  r  r  r   empty
bernoulli_r   
get_devicedetach
contiguousr  r   )r,   r:   ro   	x_flippedr  r   r  r!  maskrz   rl   h_fwdh_bwdc_fwdc_bwdr   r   r   r=     sJ   
*










zQuasiRNNLayer.forward)r   Trp   )	rA   rB   rC   rD   r%   r  r"  r=   rE   r   r   r/   r   r  F  s    "
r  c                       s8   e Zd ZdZ						d fdd	Zdd	d
Z  ZS )QuasiRNNa  This is a implementation for the Quasi-RNN.

    https://arxiv.org/pdf/1611.01576.pdf

    Part of the code is adapted from:
    https://github.com/salesforce/pytorch-qrnn

    Arguments
    ---------
    hidden_size : int
        The number of features in the hidden state h. If not specified,
        the input size is used.
    input_shape : tuple
        The shape of an example input. Alternatively, use ``input_size``.
    input_size : int
        The size of the input. Alternatively, use ``input_shape``.
    num_layers : int
        The number of QRNN layers to produce.
    bias : bool
        Whether to add a bias term, only True supported.
    dropout : float
        The rate at which to zero out outputs.
    bidirectional : bool
        If true, one set of parameters will traverse forward, and the
        other set will traverse from end to start.
    **kwargs : dict
        Arguments to forward to QuasiRNN layers.

    Example
    -------
    >>> a = torch.rand([8, 120, 40])
    >>> model = QuasiRNN(
    ...     256, num_layers=4, input_shape=a.shape, bidirectional=True
    ... )
    >>> b, _ = model(a)
    >>> b.shape
    torch.Size([8, 120, 512])
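
    Extra keyword arguments (e.g. ``output_gate`` or ``zoneout``) are
    forwarded to every ``QuasiRNNLayer`` (a sketch):

    >>> model = QuasiRNN(64, input_shape=a.shape, output_gate=False)
    >>> b, h = model(a)
    >>> b.shape
    torch.Size([8, 120, 64])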
    """

    def __init__(
        self,
        hidden_size,
        input_shape=None,
        input_size=None,
        num_layers=1,
        bias=True,
        dropout=0,
        bidirectional=False,
        **kwargs,
    ):
        assert bias is True, "Removing underlying bias is not yet supported"
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout if dropout > 0 else None
        self.kwargs = kwargs

        if input_shape is None and input_size is None:
            raise ValueError("Expected one of input_shape or input_size.")

        # Computing the feature dimensionality
        if input_size is None:
            if len(input_shape) > 3:
                self.reshape = True
            input_size = torch.prod(torch.tensor(input_shape[2:]))

        layers = []
        for layer in range(self.num_layers):
            layers.append(
                QuasiRNNLayer(
                    (
                        input_size
                        if layer == 0
                        else (
                            self.hidden_size * 2
                            if self.bidirectional
                            else self.hidden_size
                        )
                    ),
                    self.hidden_size,
                    self.bidirectional,
                    **self.kwargs,
                )
            )
        self.qrnn = torch.nn.ModuleList(layers)

        if self.dropout:
            self.dropout = torch.nn.Dropout(self.dropout)

    def forward(self, x, hidden=None):
        """Applies the QuasiRNN to the input tensor x."""
        next_hidden = []

        for i, layer in enumerate(self.qrnn):
            x, h = layer(x, None if hidden is None else hidden[i])

            next_hidden.append(h)

            if self.dropout and i < len(self.qrnn) - 1:
                x = self.dropout(x)

        hidden = torch.cat(next_hidden, 0).view(
            self.num_layers, *next_hidden[0].shape[-2:]
        )

        return x, hidden


def rnn_init(module):
    """This function is used to initialize the RNN weight.
    Recurrent connection: orthogonal initialization.

    Arguments
    ---------
    module : torch.nn.Module
        Recurrent neural network module.

    Example
    -------
    >>> inp_tensor = torch.rand([4, 10, 20])
    >>> net = RNN(hidden_size=5, input_shape=inp_tensor.shape)
    >>> out_tensor = net(inp_tensor)
    >>> rnn_init(net)
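
    Note: orthogonal initialization is applied to the parameters whose name
    marks them as recurrent (``weight_hh`` in PyTorch RNNs, ``.u.weight`` in
    the (S)LiGRU layers); all other weights keep their default initialization.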
    """
    for name, param in module.named_parameters():
        if "weight_hh" in name or ".u.weight" in name:
            nn.init.orthogonal_(param)