o
    ½e¦i9A  ã                   @   sÀ   d Z ddlZddlmZ ddlm  mZ ddlmZ G dd„ dej	ƒZ
G dd„ dej	ƒZdd	„ ZG d
d„ dej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZdS )zÖThis file implements the necessary classes and functions to implement Listen-to-Interpret (L2I) interpretation method from https://arxiv.org/abs/2202.11479v2

 Authors
 * Cem Subakan 2022
 * Francesco Paissan 2022
é    N)ÚResBlockAudioc                       s4   e Zd ZdZddg d¢f‡ fdd„	Zdd„ Z‡  ZS )	ÚPsia7  Convolutional Layers to estimate NMF Activations from Classifier Representations

    Arguments
    ---------
    n_comp : int
        Number of NMF components (or equivalently number of neurons at the output per timestep)
    T : int
        The targeted length along the time dimension
    in_emb_dims : List with int elements
        A list with length 3 that contains the dimensionality of the input dimensions
        The list needs to match the number of channels in the input classifier representations
        The last entry should be the smallest entry

    Example
    -------
    >>> inp = [torch.ones(2, 150, 6, 2), torch.ones(2, 100, 6, 2), torch.ones(2, 50, 12, 5)]
    >>> psi = Psi(n_comp=100, T=120, in_emb_dims=[150, 100, 50])
    >>> h = psi(inp)
    >>> print(h.shape)
    torch.Size([2, 100, 120])
    éd   é¯  )i   i   i   c                    s¶   t ƒ  ¡  || _tjdd| _tj|dfd| _t|ƒ}tj|d |ddd| _	tj|d |ddd| _
tj||ddd| _t tj|d |dddt |¡t ¡ ¡| _t ¡ | _d S )	N©é   r   )Úscale_factoré   )Úsizer   é   Úsame©Úkernel_sizeÚpadding)ÚsuperÚ__init__Úin_emb_dimsÚnnÚUpsamplingBilinear2dÚupsampÚupsamp_timeÚminÚConv2dÚc1Úc2Úout_convÚ
SequentialÚBatchNorm2dÚReLUÚconvÚact)ÚselfÚn_compÚTr   Úout_c©Ú	__class__© úZ/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/L2I.pyr   &   s$   
ÿÿýzPsi.__init__c           	      C   sx  d}t | jƒD ]\}}|| jd | j| ksJ d| ƒ‚q|d jd |d jd ks2J d| ƒ‚|d jd |d jd ksFJ d| ƒ‚d|d jd  |d jd d kspJ d| d|d jd › d	|d jd ›  ƒ‚|\}}}|  |¡}|  |¡}|  |  |¡¡}|  |  |¡¡}t |d
¡}t |d
¡}t	j
|||fdd}|  |¡}|  |¡}|  |  |¡¡ d¡}|S )a  This forward function returns the NMF time activations given classifier activations

        Arguments
        ---------
        inp: list
            A length 3 list of classifier input representations.

        Returns
        -------
        NMF time activations
        zcin PSI doesn't match. The embedding dimensions need to be consistent with the list self.in_emb_dimsr	   zNr. of channels r   r   zSpatial dimension r   z 1st (idx 0) element has shape z" second element (idx 1) has shape )r   r	   r   r   )Úaxis)Ú	enumerater   Úshaper   r    r   r   ÚFÚpadÚtorchÚcatr   r   r   Úsqueeze)	r!   ÚinpÚerrorÚiÚ
in_emb_dimÚx1Úx2Úx3Úxr'   r'   r(   Úforward>   s4   ÿ((&ÿ"þÿ




zPsi.forward©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r9   Ú__classcell__r'   r'   r%   r(   r      s    r   c                       s2   e Zd ZdZd‡ fdd„	Zdd„ Zd	d
„ Z‡  ZS )ÚNMFDecoderAudioaÈ  This class implements an NMF decoder

    Arguments
    ---------
    n_comp : int
        Number of NMF components
    n_freq : int
        The number of frequency bins in the NMF dictionary
    device : str
        The device to run the model

    Example
    -------
    >>> NMF_dec = NMFDecoderAudio(20, 210, device='cpu')
    >>> H = torch.rand(1, 20, 150)
    >>> Xhat = NMF_dec.forward(H)
    >>> print(Xhat.shape)
    torch.Size([1, 210, 150])
    r   é  Úcudac                    s4   t ƒ  ¡  tjdt ||¡ dd| _t ¡ | _d S )Ngš™™™™™¹?T)Úrequires_grad)	r   r   r   Ú	Parameterr.   ÚrandÚWr   Úactiv)r!   r"   Ún_freqÚdevicer%   r'   r(   r   †   s
   
ÿzNMFDecoderAudio.__init__c                 C   s.   |   |¡}|   | j¡ d¡}t d||¡}|S )aŽ  The forward pass for NMF given the activations H

        Arguments
        ---------
        H : torch.Tensor
            The activations Tensor with shape B x n_comp x T
            where B = Batchsize
                  n_comp = number of NMF components
                  T = number of timepoints

        Returns
        -------
        output : torch.Tensor
            The NMF outputs
        r   zbij, bjk -> bik)rG   rF   Ú	unsqueezer.   Úeinsum)r!   ÚHÚtempÚoutputr'   r'   r(   r9   Ž   s   
zNMFDecoderAudio.forwardc                 C   s   | j }|  |¡S )z(This function returns the NMF dictionary)rF   rG   )r!   rF   r'   r'   r(   Úreturn_W¦   s   
zNMFDecoderAudio.return_W)r   rA   rB   )r;   r<   r=   r>   r   r9   rO   r?   r'   r'   r%   r(   r@   q   s
    r@   c                 C   s^   | j j}| d¡dkr-ztj | jj¡ | jj 	d¡ W dS  t
y,   td|ƒ Y dS w dS )zˆ
    Applies Xavier initialization to network weights.

    Arguments
    ---------
    m : nn.Module
        Module to initialize.
    ÚConvéÿÿÿÿr   zSkipping initialization of N)r&   r;   Úfindr   ÚinitÚxavier_uniform_ÚweightÚdataÚbiasÚfill_ÚAttributeErrorÚprint)ÚmÚ	classnamer'   r'   r(   Úweights_init¬   s   	ÿür]   c                       s4   e Zd ZdZ					d‡ fdd„	Zd	d
„ Z‡  ZS )ÚPsiOptimizedas  Convolutional Layers to estimate NMF Activations from Classifier Representations, optimized for log-spectra.

    Arguments
    ---------
    dim : int
        Dimension of the hidden representations (input to the classifier).
    K : int
        Number of NMF components (or equivalently number of neurons at the output per timestep)
    numclasses : int
        Number of possible classes.
    use_adapter : bool
        `True` if you wish to learn an adapter for the latent representations.
    adapter_reduce_dim: bool
        `True` if the adapter should compress the latent representations.

    Example
    -------
    >>> inp = torch.randn(1, 256, 26, 32)
    >>> psi = PsiOptimized(dim=256, K=100, use_adapter=False, adapter_reduce_dim=False)
    >>> h, inp_ad= psi(inp)
    >>> print(h.shape, inp_ad.shape)
    torch.Size([1, 1, 417, 100]) torch.Size([1, 256, 26, 32])
    é€   r   é2   FTc                    s  t ƒ  ¡  || _|| _|r(t|ƒ| _|r(t ||ddd¡| _t 	||ddd¡| _
t t 	||ddd¡t d¡t |¡t 	||ddd¡t ¡ t |¡t 	||ddd¡t ¡ t |¡t 	||ddd¡t ¡ t |¡t 	|dddd¡t ¡ t d|¡t ¡ ¡| _|  t¡ d S )Né   r   r	   r   Té   rA   )r   r   Úuse_adapterÚadapter_reduce_dimr   Úadapterr   r   ÚdownÚConvTranspose2dÚupr   r   r   ÚLinearÚdecoderÚapplyr]   )r!   ÚdimÚKÚ
numclassesrc   rd   r%   r'   r(   r   ×   s6   


ðzPsiOptimized.__init__c                 C   sT   | j r	|  |¡}n|}| jr!|  |¡}|  |¡}|  |¡}||fS |  |¡}||fS )aY  
        Computes forward step.

        Arguments
        ---------
        hs : torch.Tensor
            Latent representations (input to the classifier). Expected shape `torch.Size([B, C, H, W])`.

        Returns
        -------
        NMF activations and adapted representations. Shape `torch.Size([B, 1, T, 100])`. : torch.Tensor
        )rc   re   rd   rf   rh   rj   )r!   ÚhsÚhcatÚz_q_x_stÚoutr'   r'   r(   r9   þ   s   



þzPsiOptimized.forward)r_   r   r`   FTr:   r'   r'   r%   r(   r^   ¾   s    ú'r^   c                       s*   e Zd ZdZd	‡ fdd„	Zdd„ Z‡  ZS )
ÚThetaaÙ  This class implements a linear classifier on top of NMF activations

    Arguments
    ---------
    n_comp : int
        Number of NMF components
    T : int
        Number of Timepoints in the NMF activations
    num_classes : int
        Number of classes that the classifier works with

    Example
    -------
    >>> theta = Theta(30, 120, 50)
    >>> H = torch.rand(1, 30, 120)
    >>> c_hat = theta.forward(H)
    >>> print(c_hat.shape)
    torch.Size([1, 50])
    r   r   r`   c                    sB   t ƒ  ¡  tj|ddd| _t tj||ddtjdd¡| _d S )Nr	   F)rW   )rl   )r   r   r   ri   Úhard_attr   ÚSoftmaxÚ
classifier)r!   r"   r#   Únum_classesr%   r'   r(   r   /  s
   

ÿzTheta.__init__c                 C   s   |   |¡ d¡}|  |¡}|S )aª  We first collapse the time axis, and then pass through the linear layer

        Arguments
        ---------
        H : torch.Tensor
            The activations Tensor with shape B x n_comp x T
            where B = Batchsize
                  n_comp = number of NMF components
                  T = number of timepoints

        Returns
        -------
        theta_out : torch.Tensor
            Classifier output
        r   )rt   r0   rv   )r!   rL   Ú	theta_outr'   r'   r(   r9   :  s   
zTheta.forward)r   r   r`   r:   r'   r'   r%   r(   rs     s    rs   c                       s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )Ú
NMFEncodera  This class implements an NMF encoder with a convolutional network

    Arguments
    ---------
    n_freq : int
        The number of frequency bins in the NMF dictionary
    n_comp : int
        Number of NMF components

    Example
    -------
    >>> nmfencoder = NMFEncoder(513, 100)
    >>> X = torch.rand(1, 513, 240)
    >>> Hhat = nmfencoder(X)
    >>> print(Hhat.shape)
    torch.Size([1, 100, 240])
    c                    sZ   t ƒ  ¡  t tj|ddddt ¡ tjdddddt ¡ tjd|dddt ¡ ¡| _d S )Né   é   r   r   r_   )r   r   r   r   ÚConv1dr   Úconvenc)r!   rH   r"   r%   r'   r(   r   b  s   

úzNMFEncoder.__init__c                 C   s
   |   |¡S )aL  
        Arguments
        ---------
        X : torch.Tensor
            The input spectrogram Tensor with shape B x n_freq x T
            where B = Batchsize
                  n_freq = nfft for the input spectrogram
                  T = number of timepoints

        Returns
        -------
        NMF encoded outputs.
        )r}   )r!   ÚXr'   r'   r(   r9   m  s   
zNMFEncoder.forwardr:   r'   r'   r%   r(   ry   O  s    ry   c                       ó,   e Zd ZdZd	‡ fdd„	Zd
dd„Z‡  ZS )ÚCNN14PSI_stfta†  
    This class estimates a saliency map on the STFT domain, given classifier representations.

    Arguments
    ---------
    dim : int
        Dimensionality of the input representations.
    K : int
        Defines the number of output channels in the saliency map.

    Example
    -------
    >>> from speechbrain.lobes.models.Cnn14 import Cnn14
    >>> classifier_embedder = Cnn14(mel_bins=80, emb_dim=2048, return_reps=True)
    >>> x = torch.randn(2, 201, 80)
    >>> _, hs = classifier_embedder(x)
    >>> psimodel = CNN14PSI_stft(2048, 20)
    >>> xhat = psimodel.forward(hs)
    >>> print(xhat.shape)
    torch.Size([2, 20, 207])
    r_   r   c                    sö   t ƒ  ¡  t ||ddd¡| _t |d |ddd¡| _t ||ddd¡| _t |d |ddd¡| _t ||d ddd¡| _t |d |d ddd¡| _	t |d |d ddd¡| _
t |d |d ddd¡| _t |d |ddd¡| _t d	¡| _d S )
Nr   r   r	   é   ra   é   r{   r   T)r   r   r   ÚConvTranspose1dÚconvt1Úconvt2Úconvt3Úconvt4Úconvt5Úconvt6Úconvt7Úconvt8Úconvt9r   Únonl©r!   rl   rm   r%   r'   r(   r   •  s   
zCNN14PSI_stft.__init__Nc                 C   sø   dd„ |D ƒ}|   |d ¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |d ¡}	|  |	¡}	||	 }|  |¡}|  |¡}|  |¡}|  |¡}|  	|¡}
|  |
¡}
t
 |
¡}
|
S )á—  
        Forward step. Estimates NMF activations to be used to get the saliency mask.

        Arguments
        --------
        hs : torch.Tensor
            Classifier's representations.
        labels : torch.Tensor
            Predicted labels for classifier's representations.

        Returns
        --------
        xhat : torch.Tensor
            The estimated NMF activation coefficients
        c                 S   s   g | ]}|  d ¡‘qS )rQ   )Úmean)Ú.0Úhr'   r'   r(   Ú
<listcomp>µ  s    z)CNN14PSI_stft.forward.<locals>.<listcomp>r   r	   r   r   )r„   r   r…   r†   r‡   rˆ   r‰   rŠ   r‹   rŒ   r,   Úrelu©r!   ro   ÚlabelsÚh1Úh2r’   Úh3Úh4Úh5Úh6Úxhatr'   r'   r(   r9   ¤  s0   














zCNN14PSI_stft.forward©r_   r   ©Nr:   r'   r'   r%   r(   r€   ~  s    r€   c                       r   )ÚCNN14PSI_stft_2da‰  
    This class estimates the NMF activations to create a saliency map using the L2I framework

    Arguments
    ---------
    dim : int
        Dimensionality of the input representations.
    K : int
        Defines the number of output channels in the saliency map.

    Example
    -------
    >>> from speechbrain.lobes.models.Cnn14 import Cnn14
    >>> classifier_embedder = Cnn14(mel_bins=80, emb_dim=2048, return_reps=True)
    >>> x = torch.randn(2, 201, 80)
    >>> _, hs = classifier_embedder(x)
    >>> psimodel = CNN14PSI_stft_2d(2048, 20)
    >>> xhat = psimodel.forward(hs)
    >>> print(xhat.shape)
    torch.Size([2, 20, 207])
    r_   r   c                    sö   t ƒ  ¡  t ||ddd¡| _t |d |ddd¡| _t ||ddd¡| _t |d |ddd¡| _t ||d dd	d¡| _t |d
 |d ddd¡| _	t |d |d dd	d¡| _
t |d |d
 dd	d¡| _t |d
 |ddd¡| _t d¡| _d S )Nr   )r   ra   r	   r   )r   ra   ra   )r‚   ra   )r   r‚   r   r{   )r   r   )ra   r   )r   r‚   )r   ra   )r   r   )r   r‚   )r	   ra   r   T)r   r   r   rg   r„   r…   r†   r‡   rˆ   r‰   rŠ   r‹   rŒ   r   r   rŽ   r%   r'   r(   r   ñ  s   
ÿÿzCNN14PSI_stft_2d.__init__Nc                 C   sô   |   |d ¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |d ¡}|  |¡}|| }|  |¡}|  |¡}|  |d ¡}	|  |	¡}	||	 }|  |¡}|  |¡}|  |¡}|  |¡}|  	|¡}
|  |
¡}
|
 
d¡}
t |
¡}
|
S )r   r   r	   r   r   rQ   )r„   r   r…   r†   r‡   rˆ   r‰   rŠ   r‹   rŒ   r   r,   r”   r•   r'   r'   r(   r9     s0   















zCNN14PSI_stft_2d.forwardrž   rŸ   r:   r'   r'   r%   r(   r    Ú  s    r    )r>   r.   Útorch.nnr   Útorch.nn.functionalÚ
functionalr,   Úspeechbrain.lobes.models.PIQr   ÚModuler   r@   r]   r^   rs   ry   r€   r    r'   r'   r'   r(   Ú<module>   s    b;\5/\