o
    ei0                     @   sf   d Z ddlZddlmZ ddlZddlmZ G dd dejZG dd dejZG d	d
 d
ejZ	dS )zThis module mixes information from different tokens via HyperMixing.
It can be viewed as a linear-time drop-in replacement for (self-)attention.

source: https://arxiv.org/abs/2203.03691

Authors
 * Florian Mai 2023
 * Juan Pablo Zuluaga 2023
    N)Optional)nnc                       s   e Zd ZdZ				ddedededed	ed
eddf fddZdd Z				ddee	j
 dee	j
 dee dee	j
 fddZ  ZS )HyperMixinga  This class implements multi-head HyperMixing.
    It is an implementation of the token-mixing component in HyperMixer, a linear
    time drop-in replacement for self-attention. In contrast to the original HyperMixer,
    this module supports multiple heads, which improves the expressiveness of the model
    while decreasing the number of parameters.

    Reference: https://arxiv.org/abs/2203.03691

    Arguments
    ---------
    input_output_dim : int
        number of features in keys, queries, and values
    hypernet_size : int
        determines the size of the hidden layer of the token-mixing MLP.
    tied : bool
        If True, then the generated weight matrices of the token-mixing MLP are tied.
    num_heads : int
        parallel token-mixing MLPs.
    fix_tm_hidden_size : bool
        If True, the hidden-layer size is equal to hypernet_size rather than hypernet_size / num_heads.
    max_length : int
        Maximum number of input tokens. Needed for generating sufficiently large position embeddings.

    Example
    -------
    >>> import torch
    >>> inputs = torch.rand([8, 60, 512])
    >>> net = HyperMixing(512, 2048, num_heads=8)
    >>> outputs, attn = net(inputs, inputs, inputs)
    >>> outputs.shape
    torch.Size([8, 60, 512])
    F     input_output_dimhypernet_sizetied	num_headsfix_tm_hidden_size
max_lengthreturnNc                    s\   t    || _t|||||d| _t | _t|| _	|| _
ddlm} |||| _d S )N)r	   r
   keep_output_sizer   )PositionalEncoding)super__init__r   HyperNetworkhyperr   GELU
activation	LayerNorm
layer_normr
   0speechbrain.lobes.models.transformer.Transformerr   positional_encoding)selfr   r   r	   r
   r   r   r   	__class__ Z/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/nnet/hypermixing.pyr   4   s    
	

zHyperMixing.__init__c                 C   s,   t ||}||}t ||dd}|S )z(function to stick MLP1 together manuallyr      )torchbmm	transpose)r   outW1W2r   r   r   r   _mlp_pass_from_componentsS   s   z%HyperMixing._mlp_pass_from_componentsT	attn_maskkey_padding_maskreturn_attn_weightspos_embsc                 C   s*  |}| d}	| d}
|durt|d }|| }|| | }| |\}}|dur>||d }||d }|dd}||	| j	 | j
| j	 |
f}||	| j	 |
df}||	| j	 |
df}| |||| j}||	| j
|
f}|dd}| |}tj|	|
|
f|jd}||fS )a   
        The signature of this method is deliberately chosen to be the same as for
        sb.nnet.attention.MultiHeadAttention for compatibility within SpeechBrain.

        NOTE: key, value, attn_mask and pos_embs have no effect. Query is used for
        all three. Thus, the module should only be used to replace self-attention at the moment.

        Arguments
        ----------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
            Currently unused. All
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
            Currently unused.
        attn_mask : torch.Tensor, optional
            NOTE: Currently has NO effect.
        key_padding_mask : torch.Tensor, optional
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        return_attn_weights: torch.Tensor, optional
            NOTE: Currently has NO effect.
        pos_embs: torch.Tensor, optional
            NOTE: Currently has NO effect.

        Outputs
        -------
        attn_output : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_output_weights : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
            NOTE: always returns all zeros.
        r   r   Nr   )device)sizer    logical_not	unsqueezefloatr   r   r"   reshaper
   r   r&   r   r   zerosr,   )r   querykeyvaluer'   r(   r)   r*   r#   bsizeseq_len
float_mask	hyp_inputr$   r%   dummy_att_weightsr   r   r   forwardZ   s>   8



zHyperMixing.forward)Fr   Fr   )NNTN)__name__
__module____qualname____doc__intboolr   r&   r   r    Tensorr;   __classcell__r   r   r   r   r      sF    %r   c                       sF   e Zd ZdZ			ddededdf fd	d
ZdejfddZ  Z	S )r   a  This class implements The HyperNetwork. It is an approach of using a one network,
    also known as a hypernetwork, to generate the weights for another network.
    Here, it is used to generate the labels of linear layers.

    Reference: https://arxiv.org/abs/1609.09106

    Arguments
    ----------
    input_output_dim : int
        Dimension of the linear layers
    hypernet_size:
        Dimension of the HyperNetwork
    tied : bool, optional
        Define whether weights of layer 1 and layer 2 are shared
    num_heads: int, optional
        Number of heads, akin to heads in MultiHeadAttention
    keep_output_size: bool, optional
        Set whether to keep the same output size independent of number of heads
    Fr   Tr   r   r   Nc                    sR   t t|   || _t|||||d| _| jr| j| _d S t|||||d| _d S )N)output_sizenum_mlpsr   )r   r   r   r	   ParallelMLPsw1_genw2_gen)r   r   r   r	   r
   r   r   r   r   r      s$   zHyperNetwork.__init__input_tensorc                 C   s.   |  |}| jr|}||fS | |}||fS )a  Forward computation for a HyperNetwork.

        Arguments
        ----------
        input_tensor : [batchsize, max_positions, d]
            The HyperNetwork is supposed to generate an MLP of the form W_2(GELU(W1 x)), where
            W1 : N -> k and W2 : k -> N, so it has to return tensors W1 and W2

        Outputs
        -------
        W1 : torch.Tensor
            Generated weights of Layer 1
        W2 : torch.Tensor
            Generated weights of Layer 2
        )rG   r	   rH   )r   rI   r$   r%   r   r   r   r;      s   

zHyperNetwork.forward)Fr   T)
r<   r=   r>   r?   r@   r   r    rB   r;   rC   r   r   r   r   r      s    r   c                       s4   e Zd ZdZ			d
	d fddZdd	 Z  ZS )rF   a  Class that implements the MultiHead HyperMixer or HyperConformer.

    Arguments
    ----------
    input_size : int
        Dimension of the linear layers
    hidden_size: int
        Dimension of the hidden layer
    output_size : int
        Dimension of the HyperNetwork
    num_mlps : int
        Number of heads, akin to heads in MultiHeadAttention
    keep_output_size : bool, optional
        Set whether to keep the same output size independent of number of heads
    Nr   Tr   c                    sF  t t|   |d u r|}|| _|| _|| dksJ || dks#J || dks+J || }|s5|| }|| }|| _|| _|| _t	t
|||| _t	t
||| _t	t
|||| _t	t
||| _tjj| jtdd tjj| jtdd tjj| jtdd tjj| jtdd t | _d S )Nr   g       @)gain)r   rF   r   original_in_sizeoriginal_out_size
input_sizerD   rE   r   	Parameterr    emptyfc1_weights
fc1_biasesfc2_weights
fc2_biasesinitxavier_uniform_mathsqrtr   r   )r   rM   hidden_sizerD   rE   r   r   r   r   r   #  s8   zParallelMLPs.__init__c                 C   s|   | d}| d}|||| j| jf}td|| j| jdd }| 	|}td|| j
| jdd }|S )zPerforms the forward computation of multi parallel MLPs.

        Arguments
        ----------
        x : tensor
            Input tensor

        Outputs
        -------
        x : torch.Tensor
            return output tensor
        r   r   zblmf,mhf->bmlhr   zbmlh,mfh->bmlf)r-   r1   rE   rM   r    einsumrP   rQ   r/   r   rR   rS   )r   xr6   r7   r   r   r   r;   S  s   


zParallelMLPs.forward)Nr   T)r   N)r<   r=   r>   r?   r   r;   rC   r   r   r   r   rF     s    0rF   )
r?   rV   typingr   r    r   Moduler   r   rF   r   r   r   r   <module>   s    
 5L