o
    %ݫi")                     @   sZ   d Z ddlZddlZG dd dejjjZG dd dejjjZG dd dejjjZ	dS )	zA combination of Convolutional, Recurrent, and Fully-connected networks.

Authors
 * Mirco Ravanelli 2020
 * Peter Plantinga 2020
 * Ju-Chieh Chou 2020
 * Titouan Parcollet 2020
 * Abdel 2020
    Nc                       s^   e Zd ZdZddejjddddgddddejj	j
ddgdd	d
dddd
ddf fdd	Z  ZS )CRDNNa6
  This model is a combination of CNNs, RNNs, and DNNs.

    This model expects 3-dimensional input [batch, time, feats] and
    by default produces output of the size [batch, time, dnn_neurons].

    One exception is if ``using_2d_pooling`` or ``time_pooling`` is True.
    In this case, the time dimension will be downsampled.

    Arguments
    ---------
    input_size : int
        The length of the expected input at the third dimension.
    input_shape : tuple
        While input_size will suffice, this option can allow putting
        CRDNN into a sequential with other classes.
    activation : torch class
        A class used for constructing the activation layers for CNN and DNN.
    dropout : float
        Neuron dropout rate as applied to CNN, RNN, and DNN.
    cnn_blocks : int
        The number of convolutional neural blocks to include.
    cnn_channels : list of ints
        A list of the number of output channels for each CNN block.
    cnn_kernelsize : tuple of ints
        The size of the convolutional kernels.
    time_pooling : bool
        Whether to pool the utterance on the time axis before the RNN.
    time_pooling_size : int
        The number of elements to pool on the time axis.
    freq_pooling_size : int
        The number of elements to pool on the frequency axis.
    rnn_class : torch class
        The type of RNN to use in CRDNN network (LiGRU, LSTM, GRU, RNN)
    inter_layer_pooling_size : list of ints
        A list of the pooling sizes for each CNN block.
    using_2d_pooling: bool
        Whether using a 2D or 1D pooling after each CNN block.
    rnn_layers : int
        The number of recurrent RNN layers to include.
    rnn_neurons : int
        Number of neurons in each layer of the RNN.
    rnn_bidirectional : bool
        Whether this model will process just forward or in both directions.
    rnn_re_init : bool,
        If True, an orthogonal initialization will be applied to the recurrent
        weights.
    dnn_blocks : int
        The number of linear neural blocks to include.
    dnn_neurons : int
        The number of neurons in the linear layers.
    projection_dim : int
        The number of neurons in the projection layer.
        This layer is used to reduce the size of the flattened
        representation obtained after the CNN blocks.
    use_rnnp: bool
        If True, a linear projection layer is added between RNN layers.

    Example
    -------
    >>> inputs = torch.rand([10, 15, 60])
    >>> model = CRDNN(input_shape=inputs.shape)
    >>> outputs = model(inputs)
    >>> outputs.shape
    torch.Size([10, 15, 512])
    N333333?         )   r   F   i   Tc                    s  |d u r|d u rt d|d u rd d |g}t j|d |dkr*| jtjjjdd t|D ]}| j	jt
|| |||| ||d| d q.|rX| jtjjjdd	|	d
ddd |dkr| jtjjjdd | jjtjjj|dddd | jjtjjjdd | jj| dd |dkr|r| jtjjjdd t|D ]"}| j||d
||d | jtjjj|ddd | tjj|d qn| j|d|||||d |dkr| jtjjjdd t|D ]}| jjt|||d| d qd S )Nz-Must specify one of input_size or input_shapeinput_shaper   CNN
layer_nameblock_)channelskernel_sizeusing_2d_poolpooling_size
activationdropoutr   maxr      	pool_type
input_dimsr   	pool_axistime_poolingr	   
projectionTlinear)	n_neuronsbiascombine_dimsr   normactRNN)hidden_size
num_layersbidirectionalre_init)r   r    r!   p)r   r%   r&   r   r'   r(   DNN)neuronsr   r   r   )
ValueErrorsuper__init__appendsbnnet
containers
Sequentialranger   	CNN_Blockpooling	Pooling1dr   r   Linearnormalization	LayerNormtorchnnDropoutr+   	DNN_Block)self
input_sizer   r   r   
cnn_blockscnn_channelscnn_kernelsizer   time_pooling_sizefreq_pooling_size	rnn_classinter_layer_pooling_sizeusing_2d_pooling
rnn_layersrnn_neuronsrnn_bidirectionalrnn_re_init
dnn_blocksdnn_neuronsprojection_dimuse_rnnpblock_index_	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/CRDNN.pyr/   S   s   


zCRDNN.__init__)__name__
__module____qualname____doc__r<   r=   	LeakyReLUr1   r2   r$   LiGRUr/   __classcell__rV   rV   rT   rW   r      s0    Dr   c                       s4   e Zd ZdZddgejjdddf fdd	Z  ZS )r6   a/  CNN Block, based on VGG blocks.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of the input.
    channels : int
        Number of convolutional channels for the block.
    kernel_size : tuple
        Size of the 2d convolutional kernel
    activation : torch.nn.Module class
        A class to be used for instantiating an activation layer.
    using_2d_pool : bool
        Whether to use 2d pooling or only 1d pooling.
    pooling_size : int
        Size of pooling kernel, duplicated for 2d pooling.
    dropout : float
        Rate to use for dropping channels.

    Example
    -------
    >>> inputs = torch.rand(10, 15, 60)
    >>> block = CNN_Block(input_shape=inputs.shape, channels=32)
    >>> outputs = block(inputs)
    >>> outputs.shape
    torch.Size([10, 15, 30, 32])
    r   Fr   r   c                    s   t  j|d | jtjjj||dd | jtjjjdd | j| dd | jtjjj||dd | jtjjjdd | j| d	d |rW| jtjj	j
d
||fdddd n| jtjj	jd
d|dddd | jtjjj|ddd d S )Nr
   conv_1)out_channelsr   r   norm_1r   act_1conv_2norm_2act_2r   )r   r   )r   r   r   r7   r   r   r   )	drop_ratedrop)r.   r/   r0   r1   r2   r   Conv2dr:   r;   r7   	Pooling2dr8   r   	Dropout2d)r@   r   r   r   r   r   r   r   rT   rV   rW   r/      sL   
	

zCNN_Block.__init__	rX   rY   rZ   r[   r<   r=   r\   r/   r^   rV   rV   rT   rW   r6      s     r6   c                       s*   e Zd ZdZejjdf fdd	Z  ZS )r?   a*  Block for linear layers.

    Arguments
    ---------
    input_shape : tuple
        Expected shape of the input.
    neurons : int
        Size of the linear layers.
    activation : torch.nn.Module class
        Class definition to use for constructing activation layers.
    dropout : float
        Rate to use for dropping neurons.

    Example
    -------
    >>> inputs = torch.rand(10, 15, 128)
    >>> block = DNN_Block(input_shape=inputs.shape, neurons=64)
    >>> outputs = block(inputs)
    >>> outputs.shape
    torch.Size([10, 15, 64])
    r   c                    sd   t  j|d | jtjjj|dd | jtjjjdd | j| dd | jt	j
j|ddd d S )	Nr
   r   )r   r   r"   r   r#   r)   r   )r.   r/   r0   r1   r2   r   r9   r:   BatchNorm1dr<   r=   r>   )r@   r   r,   r   r   rT   rV   rW   r/   0  s   zDNN_Block.__init__rk   rV   rV   rT   rW   r?     s    r?   )
r[   r<   speechbrainr1   r2   r3   r4   r   r6   r?   rV   rV   rV   rW   <module>   s    
 8R