o
    %ݫiG0                     @   s   d Z ddlZddlmZ ddlm  mZ dd Zdd ZG dd dej	Z
G d	d
 d
ej	ZG dd dej	ZG dd dej	ZdS )z This file implements the CNN14 model from https://arxiv.org/abs/1912.10211

 Authors
 * Cem Subakan 2022
 * Francesco Paissan 2022
    Nc                 C   s<   t j| j t| dr| jdur| jjd dS dS dS )z+Initialize a Linear or Convolutional layer.biasN        )nninitxavier_uniform_weighthasattrr   datafill_)layer r   R/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/Cnn14.py
init_layer   s   

r   c                 C   s    | j jd | jjd dS )zInitialize a Batchnorm layer.r   g      ?N)r   r	   r
   r   )bnr   r   r   init_bn   s   r   c                       s2   e Zd ZdZ fddZdd Zd
dd	Z  ZS )	ConvBlocka  This class implements the convolutional block used in CNN14

    Arguments
    ---------
    in_channels : int
        Number of input channels
    out_channels : int
        Number of output channels
    norm_type : str in ['bn', 'in', 'ln']
        The type of normalization

    Example
    -------
    >>> convblock = ConvBlock(10, 20, 'ln')
    >>> x = torch.rand(5, 10, 20, 30)
    >>> y = convblock(x)
    >>> print(y.shape)
    torch.Size([5, 20, 10, 15])
    c                    s   t t|   tj||ddddd| _tj||ddddd| _|| _|dkr3t|| _	t|| _
n1|dkrJtj|ddd| _	tj|ddd| _
n|d	kr]td
|| _	td
|| _
ntd||   d S )N   r      r   F)in_channelsout_channelskernel_sizestridepaddingr   r   inTaffinetrack_running_statslnr   Unknown norm type {})superr   __init__r   Conv2dconv1conv2	norm_typeBatchNorm2dnorm1norm2InstanceNorm2d	GroupNorm
ValueErrorformatinit_weight)selfr   r   r&   	__class__r   r   r"   1   sB   
zConvBlock.__init__c                 C   s,   t | j t | j t| j t| j dS )zU
        Initializes the model convolutional layers and the batchnorm layers
        N)r   r$   r%   r   r(   r)   r/   r   r   r   r.   W   s   


zConvBlock.init_weight   r4   avgc                 C   s   t | | |}t | | |}|dkr#t j||d}|S |dkr0t j||d}|S |dkrHt j||d}t j||d}|| }|S td)ak  The forward pass for convblocks in CNN14

        Arguments
        ---------
        x : torch.Tensor
            input tensor with shape B x C_in x D1 x D2
            where B = Batchsize
                  C_in = Number of input channel
                  D1 = Dimensionality of the first spatial dim
                  D2 = Dimensionality of the second spatial dim
        pool_size : tuple with integer values
            Amount of pooling at each layer
        pool_type : str in ['max', 'avg', 'avg+max']
            The type of pooling

        Returns
        -------
        The output of one conv block
        max)r   r5   zavg+maxzIncorrect pooling type!)	Frelu_r(   r$   r)   r%   
max_pool2d
avg_pool2d	Exception)r/   x	pool_size	pool_typex1x2r   r   r   forward`   s   	zConvBlock.forward)r3   r5   __name__
__module____qualname____doc__r"   r.   rA   __classcell__r   r   r0   r   r      s
    &	r   c                       s4   e Zd ZdZ	d
 fdd	Zdd Zdd	 Z  ZS )Cnn14a  This class implements the Cnn14 model from https://arxiv.org/abs/1912.10211

    Arguments
    ---------
    mel_bins : int
        Number of mel frequency bins in the input
    emb_dim : int
        The dimensionality of the output embeddings
    norm_type: str in ['bn', 'in', 'ln']
        The type of normalization
    return_reps: bool (default=False)
        If True the model returns intermediate representations as well for interpretation
    l2i : bool
        If True, remove one of the outputs.

    Example
    -------
    >>> cnn14 = Cnn14(120, 256)
    >>> x = torch.rand(3, 400, 120)
    >>> h = cnn14.forward(x)
    >>> print(h.shape)
    torch.Size([3, 1, 256])
    r   Fc                    s   t t|   || _|| _|| _|dkrt|| _n!|dkr)tj	|ddd| _n|dkr5t
d|| _ntd|tdd|d	| _tdd
|d	| _td
d|d	| _tdd|d	| _tdd|d	| _td||d	| _|   d S )Nr   r   Tr   r   r   r    @   )r   r   r&         i   i   )r!   rH   r"   return_repsl2ir&   r   r'   norm0r*   r+   r,   r-   r   conv_block1conv_block2conv_block3conv_block4conv_block5conv_block6r.   )r/   mel_binsemb_dimr&   rL   rM   r0   r   r   r"      s@   
zCnn14.__init__c                 C   s   t | j dS )z8
        Initializes the model batch norm layer
        N)r   rN   r2   r   r   r   r.      s   zCnn14.init_weightc           	      C   sz  |  dkr|d}|dd}| |}|dd}| j|ddd}tj|d| jd}| j|ddd}tj|d| jd}| j	|ddd}tj|d| jd}| j
|ddd}tj|d| jd}| j|ddd}tj|d| jd}| j|ddd}tj|d| jd}tj|dd	}tj|d
d	\}}tj|d
d	}|| }| js|dS | jr|d|||ffS |d||||ffS )a  
        The forward pass for the CNN14 encoder

        Arguments
        ---------
        x : torch.Tensor
            input tensor with shape B x C_in x D1 x D2
            where B = Batchsize
                  C_in = Number of input channel
                  D1 = Dimensionality of the first spatial dim
                  D2 = Dimensionality of the second spatial dim

        Returns
        -------
        Outputs of CNN14 encoder
        r   r   r3   r5   )r=   r>   g?)ptrainingr   )dimr4   )rY   	unsqueeze	transposerN   rO   r7   dropoutrX   rP   rQ   rR   rS   rT   torchmeanr6   rL   rM   )	r/   r<   x4_outx3_outx2_outx1_outr?   _r@   r   r   r   rA      s4   


zCnn14.forward)r   FFrB   r   r   r0   r   rH      s    'rH   c                       s.   e Zd ZdZ	d fdd	Zd	ddZ  ZS )
CNN14PSIa  
    This class estimates a mel-domain saliency mask

    Arguments
    ---------
    dim : int
        Dimensionality of the embeddings

    Example
    -------
    >>> from speechbrain.lobes.models.Cnn14 import Cnn14
    >>> classifier_embedder = Cnn14(mel_bins=80, emb_dim=2048, return_reps=True)
    >>> x = torch.randn(2, 201, 80)
    >>> _, hs = classifier_embedder(x)
    >>> psimodel = CNN14PSI(2048)
    >>> xhat = psimodel.forward(hs)
    >>> print(xhat.shape)
    torch.Size([2, 1, 201, 80])
    rJ   c                    s   t    t||ddd| _t|d |ddd| _t||ddd| _t|d |ddd| _t||d	dd| _t|d
 |d	dd| _	t||ddd| _
t|dddd| _td| _d S )Nr   r3   r   r4         r4   rg   rg      rg   r      rg   r   r   r   rg   T)r!   r"   r   ConvTranspose2dconvt1convt2convt3convt4convt5convt6convt7convt8ReLUnonl)r/   rY   r0   r   r   r"     s   
zCNN14PSI.__init__Nc                 C   s   |  |d }| |}| |d }| |}|| }| |}| |}| |d }| |}|| }| |}| |}| |d }	| |	}	||	 }| |}| |}| |}
|
S )aa  
        Forward step. Given the classifier representations estimates a saliency map.

        Arguments
        ---------
        hs : torch.Tensor
            Classifier's representations.
        labels : None
            Unused

        Returns
        -------
        xhat : torch.Tensor
            Estimated saliency map (before sigmoid)
        r   r   r4   r   )	ro   rx   rp   rq   rr   rs   rt   ru   rv   )r/   hslabelsh1h2hh3h4h5h6xhatr   r   r   rA   $  s&   










zCNN14PSI.forward)rJ   )NrC   rD   rE   rF   r"   rA   rG   r   r   r0   r   rd      s
    rd   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	CNN14PSI_stfta  
    This class estimates a saliency map on the STFT domain, given classifier representations.

    Arguments
    ---------
    dim : int
        Dimensionality of the input representations.
    outdim : int
        Defines the number of output channels in the saliency map.

    Example
    -------
    >>> from speechbrain.lobes.models.Cnn14 import Cnn14
    >>> classifier_embedder = Cnn14(mel_bins=80, emb_dim=2048, return_reps=True)
    >>> x = torch.randn(2, 201, 80)
    >>> _, hs = classifier_embedder(x)
    >>> psimodel = CNN14PSI_stft(2048, 1)
    >>> xhat = psimodel.forward(hs)
    >>> print(xhat.shape)
    torch.Size([2, 1, 201, 513])
    rJ   r   c                    s   t    t||ddd| _t|d |ddd| _t||ddd| _t|d |ddd| _t||d dd	d| _t|d
 |d ddd| _	t|d |d dd	d| _
t|d |d
 dd	d| _t|d
 |ddd| _td| _d S )Nr   rh   r   r4   re   rg   ri   )r   rj   r3   rk   r   rl   )r   rj   rm   )r   r4   )r   rj   )r   rg   r   T)r!   r"   r   rn   ro   rp   rq   rr   rs   rt   ru   rv   convt9rw   rx   )r/   rY   outdimr0   r   r   r"   h  s   
zCNN14PSI_stft.__init__c           
      C   s   |  |d }| |}| |d }| |}|| }| |}| |}| |d }| |}|| }| |}| |}| |d }| |}|| }| |}| |}| |}| 	|}	|	S )a  
        Forward step to estimate the saliency map

        Arguments
        --------
        hs : torch.Tensor
            Classifier's representations.

        Returns
        --------
        xhat : torch.Tensor
            An Estimate for the saliency map
        r   r   r4   r   )
ro   rx   rp   rq   rr   rs   rt   ru   rv   r   )
r/   ry   r{   r|   r}   r~   r   r   r   r   r   r   r   rA   {  s(   











zCNN14PSI_stft.forward)rJ   r   r   r   r   r0   r   r   Q  s    r   )rF   r]   torch.nnr   torch.nn.functional
functionalr7   r   r   Moduler   rH   rd   r   r   r   r   r   <module>   s    	hzS