o
    ½e¦iG0  ã                   @   s†   d Z ddlZddlmZ ddlm  mZ dd„ Zdd„ ZG dd„ dej	ƒZ
G d	d
„ d
ej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZdS )z„ This file implements the CNN14 model from https://arxiv.org/abs/1912.10211

 Authors
 * Cem Subakan 2022
 * Francesco Paissan 2022
é    Nc                 C   s<   t j | j¡ t| dƒr| jdur| jj d¡ dS dS dS )z+Initialize a Linear or Convolutional layer.ÚbiasNç        )ÚnnÚinitÚxavier_uniform_ÚweightÚhasattrr   ÚdataÚfill_)Úlayer© r   ú\/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/Cnn14.pyÚ
init_layer   s   

þr   c                 C   s    | j j d¡ | jj d¡ dS )zInitialize a Batchnorm layer.r   g      ð?N)r   r	   r
   r   )Úbnr   r   r   Úinit_bn   s   r   c                       s2   e Zd ZdZ‡ fdd„Zdd„ Zd
dd	„Z‡  ZS )Ú	ConvBlockaÔ  This class implements the convolutional block used in CNN14

    Arguments
    ---------
    in_channels : int
        Number of input channels
    out_channels : int
        Number of output channels
    norm_type : str in ['bn', 'in', 'ln']
        The type of normalization

    Example
    -------
    >>> convblock = ConvBlock(10, 20, 'ln')
    >>> x = torch.rand(5, 10, 20, 30)
    >>> y = convblock(x)
    >>> print(y.shape)
    torch.Size([5, 20, 10, 15])
    c                    sÔ   t t| ƒ ¡  tj||ddddd| _tj||ddddd| _|| _|dkr3t |¡| _	t |¡| _
n1|dkrJtj|ddd| _	tj|ddd| _
n|d	kr]t d
|¡| _	t d
|¡| _
ntd |¡ƒ‚|  ¡  d S )N©é   r   ©é   r   F)Úin_channelsÚout_channelsÚkernel_sizeÚstrideÚpaddingr   r   ÚinT©ÚaffineÚtrack_running_statsÚlnr   úUnknown norm type {})Úsuperr   Ú__init__r   ÚConv2dÚconv1Úconv2Ú	norm_typeÚBatchNorm2dÚnorm1Únorm2ÚInstanceNorm2dÚ	GroupNormÚ
ValueErrorÚformatÚinit_weight)Úselfr   r   r&   ©Ú	__class__r   r   r"   1   sB   úúÿ
ÿzConvBlock.__init__c                 C   s,   t | jƒ t | jƒ t| jƒ t| jƒ dS )zU
        Initializes the model convolutional layers and the batchnorm layers
        N)r   r$   r%   r   r(   r)   ©r/   r   r   r   r.   W   s   


zConvBlock.init_weight©é   r4   Úavgc                 C   s˜   t  |  |  |¡¡¡}t  |  |  |¡¡¡}|dkr#t j||d}|S |dkr0t j||d}|S |dkrHt j||d}t j||d}|| }|S tdƒ‚)ak  The forward pass for convblocks in CNN14

        Arguments
        ---------
        x : torch.Tensor
            input tensor with shape B x C_in x D1 x D2
            where B = Batchsize
                  C_in = Number of input channel
                  D1 = Dimensionality of the first spatial dim
                  D2 = Dimensionality of the second spatial dim
        pool_size : tuple with integer values
            Amount of pooling at each layer
        pool_type : str in ['max', 'avg', 'avg+max']
            The type of pooling

        Returns
        -------
        The output of one conv block
        Úmax)r   r5   zavg+maxzIncorrect pooling type!)	ÚFÚrelu_r(   r$   r)   r%   Ú
max_pool2dÚ
avg_pool2dÚ	Exception)r/   ÚxÚ	pool_sizeÚ	pool_typeÚx1Úx2r   r   r   Úforward`   s   	øúÿzConvBlock.forward)r3   r5   ©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r"   r.   rA   Ú__classcell__r   r   r0   r   r      s
    &	r   c                       s4   e Zd ZdZ	d
‡ fdd„	Zdd„ Zdd	„ Z‡  ZS )ÚCnn14a«  This class implements the Cnn14 model from https://arxiv.org/abs/1912.10211

    Arguments
    ---------
    mel_bins : int
        Number of mel frequency bins in the input
    emb_dim : int
        The dimensionality of the output embeddings
    norm_type: str in ['bn', 'in', 'ln']
        The type of normalization
    return_reps: bool (default=False)
        If True the model returns intermediate representations as well for interpretation
    l2i : bool
        If True, remove one of the outputs.

    Example
    -------
    >>> cnn14 = Cnn14(120, 256)
    >>> x = torch.rand(3, 400, 120)
    >>> h = cnn14.forward(x)
    >>> print(h.shape)
    torch.Size([3, 1, 256])
    r   Fc                    sä   t t| ƒ ¡  || _|| _|| _|dkrt |¡| _n!|dkr)tj	|ddd| _n|dkr5t 
d|¡| _ntd |¡ƒ‚tdd|d	| _tdd
|d	| _td
d|d	| _tdd|d	| _tdd|d	| _td||d	| _|  ¡  d S )Nr   r   Tr   r   r   r    é@   )r   r   r&   é€   é   i   i   )r!   rH   r"   Úreturn_repsÚl2ir&   r   r'   Únorm0r*   r+   r,   r-   r   Úconv_block1Úconv_block2Úconv_block3Úconv_block4Úconv_block5Úconv_block6r.   )r/   Úmel_binsÚemb_dimr&   rL   rM   r0   r   r   r"      s@   
ÿÿÿÿÿÿÿzCnn14.__init__c                 C   s   t | jƒ dS )z8
        Initializes the model batch norm layer
        N)r   rN   r2   r   r   r   r.   Ä   s   zCnn14.init_weightc           	      C   sz  |  ¡ dkr| d¡}| dd¡}|  |¡}| dd¡}| j|ddd}tj|d| jd}| j|ddd}tj|d| jd}| j	|ddd}tj|d| jd}| j
|ddd}tj|d| jd}| j|ddd}tj|d| jd}| j|ddd}tj|d| jd}tj|dd	}tj|d
d	\}}tj|d
d	}|| }| js¥| d¡S | jr²| d¡|||ffS | d¡||||ffS )a¿  
        The forward pass for the CNN14 encoder

        Arguments
        ---------
        x : torch.Tensor
            input tensor with shape B x C_in x D1 x D2
            where B = Batchsize
                  C_in = Number of input channel
                  D1 = Dimensionality of the first spatial dim
                  D2 = Dimensionality of the second spatial dim

        Returns
        -------
        Outputs of CNN14 encoder
        r   r   r3   r5   )r=   r>   gš™™™™™É?)ÚpÚtrainingr   )Údimr4   )rY   Ú	unsqueezeÚ	transposerN   rO   r7   ÚdropoutrX   rP   rQ   rR   rS   rT   ÚtorchÚmeanr6   rL   rM   )	r/   r<   Úx4_outÚx3_outÚx2_outÚx1_outr?   Ú_r@   r   r   r   rA   Ê   s4   


zCnn14.forward)r   FFrB   r   r   r0   r   rH   „   s    ÿ'rH   c                       s.   e Zd ZdZ	d‡ fdd„	Zd	dd„Z‡  ZS )
ÚCNN14PSIaû  
    This class estimates a mel-domain saliency mask

    Arguments
    ---------
    dim : int
        Dimensionality of the embeddings

    Example
    -------
    >>> from speechbrain.lobes.models.Cnn14 import Cnn14
    >>> classifier_embedder = Cnn14(mel_bins=80, emb_dim=2048, return_reps=True)
    >>> x = torch.randn(2, 201, 80)
    >>> _, hs = classifier_embedder(x)
    >>> psimodel = CNN14PSI(2048)
    >>> xhat = psimodel.forward(hs)
    >>> print(xhat.shape)
    torch.Size([2, 1, 201, 80])
    rJ   c                    sÆ   t ƒ  ¡  t ||ddd¡| _t |d |ddd¡| _t ||ddd¡| _t |d |ddd¡| _t ||d	dd¡| _t |d
 |d	dd¡| _	t ||ddd¡| _
t |dddd¡| _t d¡| _d S )Nr   r3   r   r4   ©é   é   ©r4   rg   rg   ©é   rg   r   é   ©rg   r   r   ©r   rg   T)r!   r"   r   ÚConvTranspose2dÚconvt1Úconvt2Úconvt3Úconvt4Úconvt5Úconvt6Úconvt7Úconvt8ÚReLUÚnonl)r/   rY   r0   r   r   r"     s   
zCNN14PSI.__init__Nc                 C   sÂ   |   |d ¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |d ¡}	|  |	¡}	||	 }|  |¡}|  |¡}|  |¡}
|
S )aa  
        Forward step. Given the classifier representations estimates a saliency map.

        Arguments
        ---------
        hs : torch.Tensor
            Classifier's representations.
        labels : None
            Unused

        Returns
        -------
        xhat : torch.Tensor
            Estimated saliency map (before sigmoid)
        r   r   r4   r   )	ro   rx   rp   rq   rr   rs   rt   ru   rv   )r/   ÚhsÚlabelsÚh1Úh2ÚhÚh3Úh4Úh5Úh6Úxhatr   r   r   rA   $  s&   










zCNN14PSI.forward)rJ   )N©rC   rD   rE   rF   r"   rA   rG   r   r   r0   r   rd   þ   s
    þrd   c                       s*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )	ÚCNN14PSI_stftaŽ  
    This class estimates a saliency map on the STFT domain, given classifier representations.

    Arguments
    ---------
    dim : int
        Dimensionality of the input representations.
    outdim : int
        Defines the number of output channels in the saliency map.

    Example
    -------
    >>> from speechbrain.lobes.models.Cnn14 import Cnn14
    >>> classifier_embedder = Cnn14(mel_bins=80, emb_dim=2048, return_reps=True)
    >>> x = torch.randn(2, 201, 80)
    >>> _, hs = classifier_embedder(x)
    >>> psimodel = CNN14PSI_stft(2048, 1)
    >>> xhat = psimodel.forward(hs)
    >>> print(xhat.shape)
    torch.Size([2, 1, 201, 513])
    rJ   r   c                    sö   t ƒ  ¡  t ||ddd¡| _t |d |ddd¡| _t ||ddd¡| _t |d |ddd¡| _t ||d dd	d¡| _t |d
 |d ddd¡| _	t |d |d dd	d¡| _
t |d |d
 dd	d¡| _t |d
 |ddd¡| _t d¡| _d S )Nr   rh   r   r4   re   rg   ri   )r   rj   r3   rk   r   rl   )r   rj   rm   )r   r4   )r   rj   )r   rg   r   T)r!   r"   r   rn   ro   rp   rq   rr   rs   rt   ru   rv   Úconvt9rw   rx   )r/   rY   Úoutdimr0   r   r   r"   h  s   
ÿÿzCNN14PSI_stft.__init__c           
      C   sÌ   |   |d ¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |¡}|  	|¡}	|	S )a  
        Forward step to estimate the saliency map

        Arguments
        --------
        hs : torch.Tensor
            Classifier's representations.

        Returns
        --------
        xhat : torch.Tensor
            An Estimate for the saliency map
        r   r   r4   r   )
ro   rx   rp   rq   rr   rs   rt   ru   rv   r…   )
r/   ry   r{   r|   r}   r~   r   r€   r   r‚   r   r   r   rA   {  s(   











zCNN14PSI_stft.forward)rJ   r   rƒ   r   r   r0   r   r„   Q  s    r„   )rF   r]   Útorch.nnr   Útorch.nn.functionalÚ
functionalr7   r   r   ÚModuler   rH   rd   r„   r   r   r   r   Ú<module>   s    	hzS