o
    %ݫi                     @   s  d Z ddlZddlmZmZmZmZmZ ddlZ	ddl
Z
ddlmZ ddlm  mZ ddlmZ ddlmZ eeZG dd dejZG dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dZdeegef deegef fddZ e dd de!de!d e
j"d!e
j#def
d"d#Z$d$d% Z%G d&d' d'ejZ&d(d) Z'dS )*zLibrary implementing attention modules.

Authors
 * Ju-Chieh Chou 2020
 * Jianyuan Zhong 2020
 * Loren Lugosch 2020
 * Samuele Cornell 2020
 * Shucong Zhang 2024
    N)AnyCallableDictOptionalTuple)length_to_mask)
get_loggerc                       s2   e Zd ZdZd	 fdd	Zdd Zdd Z  ZS )
ContentBasedAttentiona  This class implements content-based attention module for seq2seq
    learning.

    Reference: NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN
    AND TRANSLATE, Bahdanau et.al. https://arxiv.org/pdf/1409.0473.pdf

    Arguments
    ---------
    enc_dim : int
        Size of encoder layer.
    dec_dim : int
        Size of decoder layer.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.
    scaling : float
        The factor controls the sharpening degree (default: 1.0).

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = ContentBasedAttention(enc_dim=20, dec_dim=25, attn_dim=30, output_dim=5)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
          ?c                    sf   t    t||| _t||| _tj|ddd| _t||| _|| _tj	dd| _
|   d S )N   Fbiasdim)super__init__nnLinearmlp_encmlp_decmlp_attnmlp_outscalingSoftmaxsoftmaxreset)selfenc_dimdec_dimattn_dim
output_dimr   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/nnet/attention.pyr   8   s   
zContentBasedAttention.__init__c                 C      d| _ d| _d| _dS z)Reset the memory in the attention module.N)enc_lenprecomputed_enc_hmaskr   r$   r$   r%   r   G      
zContentBasedAttention.resetc                 C   s   | j du r| || _ t||d|jd| _| |d}| t	
| j | d}|| jdktj }| || j }t	|d|d}| |}||fS )  Returns the output of the attention module.

        Arguments
        ---------
        enc_states : torch.Tensor
            The tensor to be attended.
        enc_len : torch.Tensor
            The real length (without padding) of enc_states for each sentence.
        dec_states : torch.Tensor
            The query tensor.

        Returns
        -------
        The output of the attention module.
        Nr   max_lendevicer   r   )r)   r   r   sizer0   r*   r   	unsqueezer   torchtanhsqueezemasked_fillnpinfr   r   bmmr   )r   
enc_statesr(   
dec_statesdec_hattncontextr$   r$   r%   forwardM   s    

zContentBasedAttention.forwardr
   __name__
__module____qualname____doc__r   r   r?   __classcell__r$   r$   r"   r%   r	      s
    r	   c                       sD   e Zd ZU dZeej ed< 	d
 fdd	Zdd Z	dd	 Z
  ZS )LocationAwareAttentiona{  This class implements location-aware attention module for seq2seq learning.

    Reference: Attention-Based Models for Speech Recognition, Chorowski et.al.
    https://arxiv.org/pdf/1506.07503.pdf

    Arguments
    ---------
    enc_dim : int
        Size of encoder.
    dec_dim : int
        Size of decoder.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.
    conv_channels : int
        Number of channel for location feature.
    kernel_size : int
        Kernel size of convolutional layer for location feature.
    scaling : float
        The factor controls the sharpening degree (default: 1.0).

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = LocationAwareAttention(
    ...     enc_dim=20,
    ...     dec_dim=25,
    ...     attn_dim=30,
    ...     output_dim=5,
    ...     conv_channels=10,
    ...     kernel_size=100)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    r)   r
   c                    s   t    t||| _t||| _tj|ddd| _tjd|d| d |dd| _t||| _	tj|ddd| _t||| _
|| _tjdd| _|   d S )Nr   Fr      )kernel_sizepaddingr   r   r   )r   r   r   r   r   r   r   Conv1dconv_locmlp_locr   r   r   r   r   )r   r   r   r    r!   conv_channelsrI   r   r"   r$   r%   r      s"   


zLocationAwareAttention.__init__c                 C   s   d| _ d| _d| _d| _dS )z%Reset the memory in attention module.N)r(   r)   r*   	prev_attnr+   r$   r$   r%   r      s   
zLocationAwareAttention.resetc                 C   s   | j du r$| || _ t||d|jd| _| jd|  d | _| 	| jd}| 
|dd}| |d}| t| j | | d}|| jdktj }| || j }| | _t|d|d}| |}||fS )r-   Nr   r.   rH   r   r   )r)   r   r   r1   r0   r*   floatr2   rO   rL   rM   	transposer   r   r3   r4   r5   r6   r7   r8   r   r   detachr9   r   )r   r:   r(   r;   	attn_convr<   r=   r>   r$   r$   r%   r?      s(   


zLocationAwareAttention.forwardr@   )rB   rC   rD   rE   r   r3   Tensor__annotations__r   r   r?   rF   r$   r$   r"   r%   rG   u   s   
 '
!rG   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )KeyValueAttentionae  This class implements a single-headed key-value attention module for seq2seq
    learning.

    Reference: "Attention Is All You Need" by Vaswani et al., sec. 3.2.1

    Arguments
    ---------
    enc_dim : int
        Size of the encoder feature vectors from which keys and values are computed.
    dec_dim : int
        Size of the decoder feature vectors from which queries are computed.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = KeyValueAttention(enc_dim=20, dec_dim=25, attn_dim=30, output_dim=5)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    c                    sV   t    t||| _t||| _t||| _tt	|
 | _|   d S N)r   r   r   r   
key_linearquery_linearvalue_linearr3   sqrttensorrP   r   r   )r   r   r   r    r!   r"   r$   r%   r     s   
zKeyValueAttention.__init__c                 C   r&   r'   )valueskeysr*   r+   r$   r$   r%   r   !  r,   zKeyValueAttention.resetc                 C   s   | j du r | || _ | || _t||d|jdd| _| 	|d}t
| j || j }|| jdktj }|ddd}t
|| jd}||fS )r-   Nr   r.   rH   r   )r^   rX   rZ   r]   r   r1   r0   r2   r*   rY   r3   matmulr   r6   r7   r8   r   rQ   r5   )r   r:   r(   r;   queryscoresnormalized_scoresoutr$   r$   r%   r?   '  s   
zKeyValueAttention.forwardrA   r$   r$   r"   r%   rV      s
    rV   c                       sX   e Zd ZdZejfdedejf fddZe	 defddZ
d	ejfd
dZ  ZS )RelPosEncXLa  Relative positional encoding for the :class:`~RelPosMHAXL`.

    Arguments
    ---------
    emb_dim : int
        Size of the embedding, which controls the size of the last dimension
        of the positional embedding as well
    dtype : torch.dtype, optional
        If unspecified, defaults to `torch.float32`. Controls the data type of
        the output embedding (but does not affect the precision of the
        computations, which remain `torch.float32`).
    emb_dimdtypec                    sT   t    || _ttjd| jdtjdtd| j   }| 	d| || _
d S )Nr   rH   )rf        @inv_freq)r   r   re   r3   exparangefloat32mathlogregister_buffer	emb_dtype)r   re   rf   rh   r"   r$   r%   r   U  s   

zRelPosEncXL.__init__seq_lenc           
      C   sB  | j }| jj}t  tjd|| jftj|d}|d }|d }tjd|tj|d	d}t
|| j }||dddddf< t|| j |dddddf< ||dddddf< t| | j |dddddf< t|d	d}|dd 	d}tj||gdd}	|	|}	W d   |	S 1 sw   Y  |	S )	ab  
        Builds the positional embedding tensor for a given sequence length.

        Arguments
        ---------
        seq_len : int
            The length of the sequence to create the position embedding for.

        Returns
        -------
        torch.Tensor
            Positional embedding tensor of shape `[1, 2*seq_len-1, embed_dim]`
        rH   rf   r0   r   r   r   N)r   r   )ro   rh   r0   r3   no_grademptyre   rk   rj   r2   sincosflipcatto)
r   rp   ro   r0   tot_pepe_past	pe_future	positions	sinusoidsper$   r$   r%   make_pea  s>   

"$
zRelPosEncXL.make_pexc                 C   s   | j |ddS )a  
        Builds the positional embedding tensor. Similar to
        :meth:`~RelPosEncXL.make_pe` but uses the shape information from the
        provided tensor.

        Arguments
        ---------
        x : torch.Tensor
            input tensor with shape batch_size, seq_len, embed_dim

        Returns
        -------
        pos_emb : torch.Tensor
            Positional embedding tensor of shape `[1, 2*seq_len-1, embed_dim]`
        r   )rp   )r   r1   r   r   r$   r$   r%   r?     s   zRelPosEncXL.forward)rB   rC   rD   rE   r3   rk   intrf   r   rr   r   rT   r?   rF   r$   r$   r"   r%   rd   G  s    2rd   c                       sJ   e Zd ZdZ				d fdd	Zdd Zd	d
 Z			dddZ  ZS )RelPosMHAXLa  This class implements the relative multihead implementation similar to that in Transformer XL
    https://arxiv.org/pdf/1901.02860.pdf

    Arguments
    ---------
    embed_dim : int
        Size of the encoder feature vectors from which keys and values are computed.
    num_heads: int
        Number of attention heads.
    dropout : float, optional
        Dropout rate.
    vbias: bool, optional
        Whether to use bias for computing value.
    vdim: int, optional
        Size for value. Default is embed_dim (Note each head is embed_dim // num_heads).
    mask_pos_future: bool, optional
        Whether to mask future positional encodings values.
        Must be true for causal applications e.g. decoder.

    Example
    -------
    >>> inputs = torch.rand([6, 60, 512])
    >>> pos_emb = torch.rand([1, 2*60-1, 512])
    >>> net = RelPosMHAXL(num_heads=8, embed_dim=inputs.shape[-1])
    >>> outputs, attn = net(inputs, inputs, inputs, pos_emb)
    >>> outputs.shape
    torch.Size([6, 60, 512])
            FNc                    s  t    || _|d ur|n|| _| j|k| _|| _|| _|| _|| _|| | _	| j| | _
| j	| | jks:J d| j
| | jksFJ d| jdu rcttd| || _tt| j|| _nttd| || _|r|tt| j| _nd | _t|| _t| j|| _tj||dd| _tt| j	| j| _tt| j	| j| _t|  jtjkrd| _ntd | _|    d	t!"| j | _#d S )
N(embed_dim must be divisible by num_heads#vdim must be divisible by num_headsFrH      r   r8   r   )$r   r   	embed_dimvdim_qkv_same_embed_dimmask_pos_futurevbias	num_headsdropouthead_dim	vhead_dimr   	Parameterr3   rs   qk_proj_weightv_proj_weightin_proj_weightvalue_bias_weightDropoutdropout_attr   out_proj
linear_pos
pos_bias_u
pos_bias_vnext
parametersrf   float16attn_fill_valuerP   _reset_parametersrl   r[   scale)r   r   r   r   r   r   r   r"   r$   r%   r     sR   
	

zRelPosMHAXL.__init__c                 C   sx   | j rtjj| j ntjj| j tjj| j | jd ur*tjj	| j
d tjj| j tjj| j d S Nr   )r   r3   r   initxavier_uniform_r   r   r   r   	constant_r   r   r   r+   r$   r$   r%   r     s   
zRelPosMHAXL._reset_parametersc                 C   s   |  \}}}}tjjj|dd}|||d|}|ddddddf ||||}| jrYtj| d| df|jd}|t	|| d| d ddddddf  }|d	d|d d f S )
zRelative shift implementation.)r   r   )padr   Nr   rH   r   r0   .)
r1   r3   r   
functionalr   viewr   onesr0   tril)r   r   bhqlenpos_lenr   r$   r$   r%   	rel_shift  s   & 4zRelPosMHAXL.rel_shiftTc              	   C   s  |j d }|j d }	|j d }
| jrz||u st||rA||u s&t||rAtj|| j|d| j	| j
d jddd\}}}n;| jjddd\}}}tj|||d| j	| j
}tj|||d| j	| j
}tj|||d| j	| j
}nt| jdur|| jdd| j	| j }| |dd| j	| j
}|| jdd| j	| j
 dd}|| jdd| j	| j
 dd}t|| j |dddd}t|| j |dddd}| |}|| }|dur|jdkr|dd|
|	}n	|d| j	|
|	}|jtjkr||| j}n||7 }|dur%|||dd|	| j}tj |dtj!d}| "|}|durG|jtjkrF||d	}n	 |durX|||dd|	d	}t||dd}|dd# |d| j| j	 }| $|}|r||fS |S )
aX	  Compute attention.

        Arguments
        ---------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        pos_embs : torch.Tensor
            bidirectional sinusoidal positional embedding tensor (1, 2*S-1, E) where S is the max length between source and target sequence lengths,
            and E is the embedding dimension.
        key_padding_mask : torch.Tensor
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        attn_mask : torch.Tensor
            2D mask (L, S) where L is the target sequence length, S is
            the source sequence length.
            3D mask (N*num_heads, L, S) where N is the batch
            size, L is the target sequence length, S is the source sequence
            length. attn_mask ensure that position i is allowed to attend the
            unmasked positions. If a ByteTensor is provided, the non-zero
            positions are not allowed to attend while the zero positions will
            be unchanged. If a BoolTensor is provided, positions with True is
            not allowed to attend while False values will be unchanged. If a
            FloatTensor is provided, it will be added to the attention weight.
        return_attn_weights : bool
            Whether to additionally return the attention weights.

        Returns
        -------
        out : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_score : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
        r   r   r   r   r   rH   N)r   rf   r   )%shaper   r3   equalr   r   linearr   r   r   r   chunkNotImplementedErrorr   r   r   r   r   r   r   rQ   r   r_   r   permuter   ndimrf   boolr6   r   Fr   rk   r   
contiguousr   )r   r`   keyvaluepos_embskey_padding_mask	attn_maskreturn_attn_weightsbszklenr   qweightkweightvweightp_kq_with_bias_uq_with_bias_v	matrix_ac	matrix_bd
attn_scorer   rc   r$   r$   r%   r?   %  s   
:














zRelPosMHAXL.forward)r   FNF)NNT)	rB   rC   rD   rE   r   r   r   r?   rF   r$   r$   r"   r%   r     s    !?r   c                
       sd   e Zd ZdZ						d fdd	Z				ddeej d	eej d
edeej fddZ	  Z
S )MultiheadAttentiona  The class is a wrapper of MultiHead Attention for torch.nn.MultiHeadAttention.

    Reference: https://pytorch.org/docs/stable/nn.html

    Arguments
    ---------
    nhead : int
        parallel attention heads.
    d_model : int
        The size of the model layers.
    dropout : float
        a Dropout layer on attn_output_weights (default: 0.0).
    bias : bool
        add bias as module parameter (default: True).
    add_bias_kv : bool
        add bias to the key and value sequences at dim=0.
    add_zero_attn : bool
        add a new batch of zeros to the key and value sequences at dim=1.
    kdim : int
        total number of features in key (default: None).
    vdim : int
        total number of features in value (default: None).

    Example
    -------
    >>> inputs = torch.rand([8, 60, 512])
    >>> net = MultiheadAttention(nhead=8, d_model=inputs.shape[-1])
    >>> outputs, attn = net(inputs, inputs, inputs)
    >>> outputs.shape
    torch.Size([8, 60, 512])
    r   TFNc	           	   
      s*   t    tj||||||||d| _d S )N)r   r   r   r   add_bias_kvadd_zero_attnkdimr   )r   r   r   r   att)	r   nheadd_modelr   r   r   r   r   r   r"   r$   r%   r     s   
zMultiheadAttention.__init__r   r   r   r   c           
      C   s   | ddd}| ddd}| ddd}|dur$|dur"||7 }n|}| j||||||d\}}	| ddd}|r>||	fS |S )a	  Compute attention.

        Arguments
        ---------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        attn_mask : torch.Tensor, optional
            2D mask (L, S) where L is the target sequence length, S is
            the source sequence length.
            3D mask (N*num_heads, L, S) where N is the batch
            size, L is the target sequence length, S is the source sequence
            length. attn_mask ensure that position i is allowed to attend the
            unmasked positions. If a ByteTensor is provided, the non-zero
            positions are not allowed to attend while the zero positions will
            be unchanged. If a BoolTensor is provided, positions with True is
            not allowed to attend while False values will be unchanged. If a
            FloatTensor is provided, it will be added to the attention weight.
        key_padding_mask : torch.Tensor, optional
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        return_attn_weights : bool, optional
            True to additionally return the attention weights, False otherwise.
        pos_embs : torch.Tensor, optional
            Positional embeddings added to the attention map of shape (L, S, E) or (L, S, 1).

        Returns
        -------
        attn_output : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_output_weights : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
            This is returned only if `return_attn_weights=True` (True by default).
        r   r   rH   N)r   r   need_weights)r   r   )
r   r`   r   r   r   r   r   r   outputattention_weightsr$   r$   r%   r?     s&   9


zMultiheadAttention.forward)r   TFFNN)NNTN)rB   rC   rD   rE   r   r   r3   rT   r   r?   rF   r$   r$   r"   r%   r     s,    $r   c                       s4   e Zd ZdZdddejf fdd	Zdd Z  ZS )PositionalwiseFeedForwardu  The class implements the positional-wise feed forward module in
    “Attention Is All You Need”.

    Arguments
    ---------
    d_ffn: int
        Hidden layer size.
    input_shape : tuple, optional
        Expected shape of the input. Alternatively use ``input_size``.
    input_size : int, optional
        Expected size of the input. Alternatively use ``input_shape``.
    dropout: float, optional
        Dropout rate.
    activation: torch.nn.Module, optional
        activation functions to be applied (Recommendation: ReLU, GELU).

    Example
    -------
    >>> inputs = torch.rand([8, 60, 512])
    >>> net = PositionalwiseFeedForward(256, input_size=inputs.shape[-1])
    >>> outputs = net(inputs)
    >>> outputs.shape
    torch.Size([8, 60, 512])
    Nr   c              	      s`   t    |d u r|d u rtd|d u r|d }tt||| t|t||| _d S )Nz)Expected one of input_shape or input_sizer   )r   r   
ValueErrorr   
Sequentialr   r   ffn)r   d_ffninput_shape
input_sizer   
activationr"   r$   r%   r     s   



z"PositionalwiseFeedForward.__init__c                 C   s*   | ddd}| |}| ddd}|S )z8Applies PositionalwiseFeedForward to the input tensor x.r   r   rH   )r   r   r   r$   r$   r%   r?     s   
z!PositionalwiseFeedForward.forward)	rB   rC   rD   rE   r   ReLUr   r?   rF   r$   r$   r"   r%   r   s  s    r   c                       s6   e Zd ZdZdededejdejf fddZ  Z	S )PrecomputedRoPESinusoidsa  
    A cache for the sines and cosines needed to rotate the vectors for rotary
    position embeddings (RoPE).
    This stores the nonzero entries from eq(15) from
    https://arxiv.org/pdf/2104.09864

    Arguments
    ---------
    max_length : int
        The allowed max length of the input sequence.
        For a fixed setting of the other arguments, the computation takes
        O(max_length) time.
    input_size : int
        Size of each vector in the input sequence, i.e. the dimension of each
        attention head.
    dtype : torch.dtype
        The dtype of the tensors.
    device : torch.device
        The Torch device to put the tensors on.

    Example
    -------
    >>> precomputed = PrecomputedRoPESinusoids(3, 8, torch.float32, torch.device('cpu'))
    >>> precomputed.cosines.shape
    torch.Size([3, 8])
    >>> precomputed.sines.shape == precomputed.cosines.shape
    True
    >>> precomputed.cosines
    tensor([[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
            [ 0.5403,  0.5403,  0.9950,  0.9950,  0.9999,  0.9999,  1.0000,  1.0000],
            [-0.4161, -0.4161,  0.9801,  0.9801,  0.9998,  0.9998,  1.0000,  1.0000]])
    >>> precomputed.sines
    tensor([[-0.0000,  0.0000, -0.0000,  0.0000, -0.0000,  0.0000, -0.0000,  0.0000],
            [-0.8415,  0.8415, -0.0998,  0.0998, -0.0100,  0.0100, -0.0010,  0.0010],
            [-0.9093,  0.9093, -0.1987,  0.1987, -0.0200,  0.0200, -0.0020,  0.0020]])
    >>> precomputed.index_swap
    tensor([1, 0, 3, 2, 5, 4, 7, 6])
    
max_lengthr   rf   r0   c              	      sN  t    |tjkrtjntj}|d dksJ || _ttjd|d||dt	d|   }tj||d}tjd|||d}t
||}	t|	}
tj|
|
gdd||}
t|	}tj||gdd||}dtj|||d |  }tj|dd d |d d d gddd}| d	|
| | d
|| | d| d S )NrH   r   rq   rg   r   r   r   r   cosinessines
index_swap)r   r   r3   float64rk   r   ri   rj   rl   rm   outerru   stackreshapert   rn   rx   )r   r   r   rf   r0   internal_dtypeangles
dimensionstimestimes_anglesr   unsigned_sinesunsigned_repeated_sinesr   r   r"   r$   r%   r     sH   


	z!PrecomputedRoPESinusoids.__init__)
rB   rC   rD   rE   r   r3   rf   r0   r   rF   r$   r$   r"   r%   r     s    'r   c                   @   s:   e Zd ZdZdedeegef fddZdefddZd	S )
MemoiseAtLeastSizea  
    Memoises a function which has as its first argument a value that indicates a
    minimum value to call the underlying function with.

    Arguments
    ---------
    function: Callable
        The function to call.
    round_up: Callable[[Any], Any]
        A function that rounds up.
        The fewer values this rounds up to, the less likely it is that the
        function will be called repeatedly.
    functionround_upc                 C   s   || _ || _i | _d S rW   )r   r   memo)r   r   r   r$   r$   r%   r   )  s   
zMemoiseAtLeastSize.__init__r1   c                 G   s\   || j vs| j | d |k r'| |}||k rJ || j|g|R  f| j |< | j | d S )Nr   r   )r   r   r   )r   r1   argsrounded_sizer$   r$   r%   __call__1  s
   
zMemoiseAtLeastSize.__call__N)rB   rC   rD   rE   r   r   r   r   r$   r$   r$   r%   r     s    r   r   returnc                    s   dt dtf fdd}|S )a  
    Decorator that memoises a function which has as its first argument a value
    that indicates a minimum value to call the underlying function with.
    If the memo has stored the result from a matching previous function call,
    The stored result will be returned instead of calling the function again.

    Arguments
    ---------
    round_up: Callable[[Any], Any]
        A function that rounds up.
        This will be called with the first argument passed in.
        The underlying function will receive, instead of this first argument,
        the rounded-up version.
        The fewer values this rounds up to, the less likely it is that the
        function will be called repeatedly.

    Returns
    -------
    The passed function but with MemoiseAtLeastSize capability.
    r   r   c                    s
   t |  S )z2
        Set the function to be memoised.
        )r   )r   r   r$   r%   with_functionQ  s   
z'memoise_at_least.<locals>.with_function)r   r   )r   r   r$   r   r%   memoise_at_least9  s   r   c                 C   s   dt tt|  S )NrH   )r   rl   ceillog2)lengthr$   r$   r%   <lambda>Z  s    r   r   r   rf   r0   c                 C   s0   t tt| }| d| ksJ t| |||S )a)  
    Return an object of type PrecomputedRoPESinusoids that is valid for the
    length, input_size, dtype and device.
    Consider a single (input_size, dtype, device), which are usually fixed for
    one model.
    The sinusoids will be recomputed only if they are not yet available for such
    a long length (because of the decorator applied to the function).
    Each time they are precomputed, the length is rounded up to the next power
    of two.

    As a consequence, the total number of calls during one program run is
    upper-bounded by ceil(log2(max_length)) where max_length is the highest
    length that is seen in the program run.
    On realistic lengths, the total number of calls is likely only a few.
    The total number of time steps for which sinusoids are precomputed during
    the program run is O(max_length).

    Arguments
    ---------
    length : int
        The length of the input sequence.
    input_size : int
        Size of each vector in the input sequence, i.e. the dimension of each
        attention head.
    dtype : torch.dtype
        The dtype of the tensors.
    device : torch.device
        The Torch device to put the tensors on.

    Return
    ------
    An object of type PrecomputedRoPESinusoids that is valid for the length,
    input_size, dtype and device.
    rH   )r   roundrl   r   r   )r   r   rf   r0   length_powerr$   r$   r%   _get_precomputed_valuesZ  s   (r  c           	      C   sz   | j \}}}}|d dksJ t||| j| j}|jd| }|jd| }tj| d|jd}| |	d ||	d  S )z~
    Perform the rotation for RoPE on each of the vectors in x.
    Details about RoPE: https://arxiv.org/pdf/2104.09864.
    rH   r   Nr   )r   indexr   )
r   r  rf   r0   r   r   r3   index_selectr   r2   )	r   _batch_sizer   
_num_headsr   precomputedr   r   swapped_pairsr$   r$   r%   _rope_rotate  s   r  c                       sB   e Zd ZdZ			d fdd	Zdd Z					dd
dZ  ZS )RoPEMHAa  This is an implementation of multihead self-attention with RoPE positional embeddings. As it relies on Torch for self-attention, it is
    significantly faster than RelPosMHAXL while offering the same or better levels of accuracy.

    Details about RoPE: https://arxiv.org/pdf/2104.09864.


    Arguments
    ---------
    embed_dim : int
        Size of the encoder feature vectors from which keys and values are computed.
    num_heads: int
        Number of attention heads.
    dropout : float, optional
        Dropout rate.
    vbias: bool, optional
        Whether to use bias for computing value.
    vdim: int, optional
        Size for value. Default is embed_dim (Note each head is embed_dim // num_heads).

    Example
    -------
    >>> max_len = 64
    >>> inputs = torch.rand([6, 60, 512])
    >>> num_heads = 8
    >>> net = RoPEMHA(num_heads=num_heads, embed_dim=inputs.shape[-1])
    >>> outputs, attn = net(inputs, inputs, inputs)
    >>> outputs.shape
    torch.Size([6, 60, 512])
    r   FNc                    sZ  t    || _|d ur|n|| _| j|k| _|| _|| _|| _|| | _| j| | _	| j| | jks7J d| j	| | jksCJ d| jdu r`t
td| || _t
t| j|| _nt
td| || _|ryt
t| j| _nd | _t
|| _t
| j|| _t|  jtjkrd| _ntd | _|   dt| j | _d S )	Nr   r   FrH   r   r   r8   r   ) r   r   r   r   r   r   r   r   r   r   r   r   r3   rs   r   r   r   r   r   r   r   r   r   r   rf   r   r   rP   r   rl   r[   r   )r   r   r   r   r   r   r"   r$   r%   r     sB   


zRoPEMHA.__init__c                 C   s\   | j rtjj| j ntjj| j tjj| j | jd ur,tjj	| j
d d S d S r   )r   r3   r   r   r   r   r   r   r   r   r   r+   r$   r$   r%   r     s   
zRoPEMHA._reset_parametersTc              	   C   s  |du sJ d|j d }|j d }	| jr}||u st||rD||u s)t||rDtj|| j|d| j	| j
d jddd\}}}n;| jjddd\}
}}tj||
|d| j	| j
}tj|||d| j	| j
}tj|||d| j	| j
}nt| jdur|| jdd| j	| j }t|}t|}t||	| j	||}tj|dddd|dddd|dddd|| j| jd	}|dd |d| j| j	 }| |}|r|dfS |S )
a  Compute attention through Pytorch attention.

        Arguments
        ---------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        key_padding_mask : torch.Tensor
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        attn_mask : torch.BoolTensor
            2D mask (L, S) where L is the target sequence length, S is
            the source sequence length. The positions with the value of True will be ignored while the position with the value of False will be unchanged.
        pos_embs : torch.Tensor
            Not used by this class. It is kept for compliance.
        return_attn_weights : bool
            Whether to additionally return the attention weights.

        Returns
        -------
        out : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_score : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
        Nzpos_embs is not supportedr   r   r   r   r   rH   )r`   r   r   r   	dropout_pr   )r   r   r3   r   r   r   r   r   r   r   r   r   r   r   r   r   r  masks_unionr   scaled_dot_product_attentionr   r   r   rQ   r   r   )r   r`   r   r   r   r   r   r   r   r   r   r   r   	q_rotated	k_rotatedfinal_masksr   rc   r$   r$   r%   r?     s^   0






zRoPEMHA.forward)r   FN)NNNT)rB   rC   rD   rE   r   r   r?   rF   r$   r$   r"   r%   r	    s    "5r	  c                 C   s   d}|dur| | dd|| |||}|}|dur*| dd||| |||}|}|dur8|dur8t||}|durAt|}|S )a  This is an utility function combining standard key_padding_mask and
    attn_mask from SpeechBrain into a single one for scaled_dot_product_attention. This function does not support weighting of the attn_score. Hence, if one wish to use float values as masks, they should not use this function.

    Arguments
    ---------
    bsz : int
        Batch size dimension.
    klen : int
        Time dimension of the key tensor. (Sequence length).
    num_heads : int
        Number of heads of the attention module using these masks.
    attn_mask : torch.BoolTensor
        2D mask (L, S) where L is the target sequence length, S is
        the source sequence length. The positions with the value of True will be ignored while the position with the value of False will be unchanged.
    key_padding_mask : torch.BoolTensor
        (B, S) where B is the batch size, S is the source sequence
        length. The positions with the value of True will be ignored while the position with the value of False will be unchanged.

    Returns
    -------
    out : torch.BoolTensor
        (bsz, num_heads, klen, klen) where False values are masked and True are unmasked (opposite of the input tensors).

    Nr   )r   expandr3   
logical_orlogical_not)r   r   r   r   r   
final_maskr$   r$   r%   r  l  s    
r  )(rE   rl   typingr   r   r   r   r   numpyr7   r3   torch.nnr   torch.nn.functionalr   r   speechbrain.dataio.dataior   speechbrain.utils.loggerr   rB   loggerModuler	   rG   rV   rd   r   r   r   r   r   r   r   rf   r0   r  r  r	  r  r$   r$   r$   r%   <module>   sX    
\ Ma  = =j

!, N