o
    ½e¦i’Â  ã                   @   sŽ  d Z ddlZddlmZmZmZmZmZ ddlZ	ddl
Z
ddlmZ ddlm  mZ ddlmZ ddlmZ eeƒZG dd„ dejƒZG dd	„ d	ejƒZG d
d„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dƒZdeegef deegef fdd„Z e dd„ ƒde!de!d e
j"d!e
j#def
d"d#„ƒZ$d$d%„ Z%G d&d'„ d'ejƒZ&d(d)„ Z'dS )*z¢Library implementing attention modules.

Authors
 * Ju-Chieh Chou 2020
 * Jianyuan Zhong 2020
 * Loren Lugosch 2020
 * Samuele Cornell 2020
 * Shucong Zhang 2024
é    N)ÚAnyÚCallableÚDictÚOptionalÚTuple)Úlength_to_mask)Ú
get_loggerc                       s2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
ÚContentBasedAttentiona”  This class implements content-based attention module for seq2seq
    learning.

    Reference: NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN
    AND TRANSLATE, Bahdanau et.al. https://arxiv.org/pdf/1409.0473.pdf

    Arguments
    ---------
    enc_dim : int
        Size of encoder layer.
    dec_dim : int
        Size of decoder layer.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.
    scaling : float
        The factor controls the sharpening degree (default: 1.0).

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = ContentBasedAttention(enc_dim=20, dec_dim=25, attn_dim=30, output_dim=5)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    ç      ð?c                    sf   t ƒ  ¡  t ||¡| _t ||¡| _tj|ddd| _t ||¡| _|| _tj	dd| _
|  ¡  d S )Né   F©Úbiaséÿÿÿÿ©Údim)ÚsuperÚ__init__ÚnnÚLinearÚmlp_encÚmlp_decÚmlp_attnÚmlp_outÚscalingÚSoftmaxÚsoftmaxÚreset)ÚselfÚenc_dimÚdec_dimÚattn_dimÚ
output_dimr   ©Ú	__class__© úX/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/nnet/attention.pyr   8   s   
zContentBasedAttention.__init__c                 C   ó   d| _ d| _d| _dS ©z)Reset the memory in the attention module.N)Úenc_lenÚprecomputed_enc_hÚmask©r   r$   r$   r%   r   G   ó   
zContentBasedAttention.resetc                 C   sª   | j du r|  |¡| _ t|| d¡|jd| _|  | d¡¡}|  t	 
| j | ¡¡ d¡}| | jdktj ¡}|  || j ¡}t	 | d¡|¡ d¡}|  |¡}||fS )á  Returns the output of the attention module.

        Arguments
        ---------
        enc_states : torch.Tensor
            The tensor to be attended.
        enc_len : torch.Tensor
            The real length (without padding) of enc_states for each sentence.
        dec_states : torch.Tensor
            The query tensor.

        Returns
        -------
        The output of the attention module.
        Nr   ©Úmax_lenÚdevicer   r   )r)   r   r   Úsizer0   r*   r   Ú	unsqueezer   ÚtorchÚtanhÚsqueezeÚmasked_fillÚnpÚinfr   r   Úbmmr   )r   Ú
enc_statesr(   Ú
dec_statesÚdec_hÚattnÚcontextr$   r$   r%   ÚforwardM   s    
ÿÿþ
zContentBasedAttention.forward©r
   ©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r?   Ú__classcell__r$   r$   r"   r%   r	      s
    r	   c                       sD   e Zd ZU dZeej ed< 	d
‡ fdd„	Zdd„ Z	dd	„ Z
‡  ZS )ÚLocationAwareAttentiona{  This class implements location-aware attention module for seq2seq learning.

    Reference: Attention-Based Models for Speech Recognition, Chorowski et.al.
    https://arxiv.org/pdf/1506.07503.pdf

    Arguments
    ---------
    enc_dim : int
        Size of encoder.
    dec_dim : int
        Size of decoder.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.
    conv_channels : int
        Number of channel for location feature.
    kernel_size : int
        Kernel size of convolutional layer for location feature.
    scaling : float
        The factor controls the sharpening degree (default: 1.0).

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = LocationAwareAttention(
    ...     enc_dim=20,
    ...     dec_dim=25,
    ...     attn_dim=30,
    ...     output_dim=5,
    ...     conv_channels=10,
    ...     kernel_size=100)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    r)   r
   c                    s¤   t ƒ  ¡  t ||¡| _t ||¡| _tj|ddd| _tjd|d| d |dd| _t ||¡| _	tj|ddd| _t ||¡| _
|| _tjdd| _|  ¡  d S )Nr   Fr   é   )Úkernel_sizeÚpaddingr   r   r   )r   r   r   r   r   r   r   ÚConv1dÚconv_locÚmlp_locr   r   r   r   r   )r   r   r   r    r!   Úconv_channelsrI   r   r"   r$   r%   r   Ÿ   s"   


ûzLocationAwareAttention.__init__c                 C   s   d| _ d| _d| _d| _dS )z%Reset the memory in attention module.N)r(   r)   r*   Ú	prev_attnr+   r$   r$   r%   r   À   s   
zLocationAwareAttention.resetc                 C   sö   | j du r$|  |¡| _ t|| d¡|jd| _| jd| ¡   d¡ | _|  	| j d¡¡}|  
| dd¡¡}|  | d¡¡}|  t | j | | ¡¡ d¡}| | jdktj ¡}|  || j ¡}| ¡ | _t | d¡|¡ d¡}|  |¡}||fS )r-   Nr   r.   rH   r   r   )r)   r   r   r1   r0   r*   Úfloatr2   rO   rL   rM   Ú	transposer   r   r3   r4   r5   r6   r7   r8   r   r   Údetachr9   r   )r   r:   r(   r;   Ú	attn_convr<   r=   r>   r$   r$   r%   r?   Ç   s(   
ÿÿþ

zLocationAwareAttention.forwardr@   )rB   rC   rD   rE   r   r3   ÚTensorÚ__annotations__r   r   r?   rF   r$   r$   r"   r%   rG   u   s   
 '
ø!rG   c                       s0   e Zd ZdZ‡ fdd„Zdd„ Zdd„ Z‡  ZS )ÚKeyValueAttentionae  This class implements a single-headed key-value attention module for seq2seq
    learning.

    Reference: "Attention Is All You Need" by Vaswani et al., sec. 3.2.1

    Arguments
    ---------
    enc_dim : int
        Size of the encoder feature vectors from which keys and values are computed.
    dec_dim : int
        Size of the decoder feature vectors from which queries are computed.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = KeyValueAttention(enc_dim=20, dec_dim=25, attn_dim=30, output_dim=5)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    c                    sV   t ƒ  ¡  t ||¡| _t ||¡| _t ||¡| _t t 	|¡ 
¡ ¡| _|  ¡  d S ©N)r   r   r   r   Ú
key_linearÚquery_linearÚvalue_linearr3   ÚsqrtÚtensorrP   r   r   )r   r   r   r    r!   r"   r$   r%   r     s   
zKeyValueAttention.__init__c                 C   r&   r'   )ÚvaluesÚkeysr*   r+   r$   r$   r%   r   !  r,   zKeyValueAttention.resetc                 C   s¨   | j du r |  |¡| _ |  |¡| _t|| d¡|jd d¡| _|  	|¡ d¡}t
 | j |¡| j }| | jdktj ¡}| d¡ dd¡}t
 || j¡ d¡}||fS )r-   Nr   r.   rH   r   )r^   rX   rZ   r]   r   r1   r0   r2   r*   rY   r3   Úmatmulr   r6   r7   r8   r   rQ   r5   )r   r:   r(   r;   ÚqueryÚscoresÚnormalized_scoresÚoutr$   r$   r%   r?   '  s   
ÿþzKeyValueAttention.forwardrA   r$   r$   r"   r%   rV   ú   s
    rV   c                       sX   e Zd ZdZejfdedejf‡ fdd„Ze 	¡ defdd„ƒZ
d	ejfd
d„Z‡  ZS )ÚRelPosEncXLaÕ  Relative positional encoding for the :class:`~RelPosMHAXL`.

    Arguments
    ---------
    emb_dim : int
        Size of the embedding, which controls the size of the last dimension
        of the positional embedding as well
    dtype : torch.dtype, optional
        If unspecified, defaults to `torch.float32`. Controls the data type of
        the output embedding (but does not affect the precision of the
        computations, which remain `torch.float32`).
    Úemb_dimÚdtypec                    sT   t ƒ  ¡  || _t tjd| jdtjdt d¡| j   ¡}|  	d|¡ || _
d S )Nr   rH   )rf   ç     ˆÃ@Úinv_freq)r   r   re   r3   ÚexpÚarangeÚfloat32ÚmathÚlogÚregister_bufferÚ	emb_dtype)r   re   rf   rh   r"   r$   r%   r   U  s   
ÿÿ
zRelPosEncXL.__init__Úseq_lenc           
      C   sB  | j }| jj}t ¡ Œ tjd|| jftj|d}|d }|d }tjd|tj|d 	d¡}t 
|| j ¡}||dd…ddd…f< t || j ¡|dd…ddd…f< ||dd…ddd…f< t | | j ¡|dd…ddd…f< t |d¡ 	d¡}|dd…  	d¡}tj||gdd}	|	 |¡}	W d  ƒ |	S 1 sšw   Y  |	S )	ab  
        Builds the positional embedding tensor for a given sequence length.

        Arguments
        ---------
        seq_len : int
            The length of the sequence to create the position embedding for.

        Returns
        -------
        torch.Tensor
            Positional embedding tensor of shape `[1, 2*seq_len-1, embed_dim]`
        rH   ©rf   r0   r   r   r   N)r   r   )ro   rh   r0   r3   Úno_gradÚemptyre   rk   rj   r2   ÚsinÚcosÚflipÚcatÚto)
r   rp   ro   r0   Útot_peÚpe_pastÚ	pe_futureÚ	positionsÚ	sinusoidsÚper$   r$   r%   Úmake_pea  s>   

ýüû"$
äâzRelPosEncXL.make_peÚxc                 C   s   | j | d¡dS )aº  
        Builds the positional embedding tensor. Similar to
        :meth:`~RelPosEncXL.make_pe` but uses the shape information from the
        provided tensor.

        Arguments
        ---------
        x : torch.Tensor
            input tensor with shape batch_size, seq_len, embed_dim

        Returns
        -------
        pos_emb : torch.Tensor
            Positional embedding tensor of shape `[1, 2*seq_len-1, embed_dim]`
        r   )rp   )r   r1   ©r   r€   r$   r$   r%   r?   ”  s   zRelPosEncXL.forward)rB   rC   rD   rE   r3   rk   Úintrf   r   rr   r   rT   r?   rF   r$   r$   r"   r%   rd   G  s    2rd   c                       sJ   e Zd ZdZ				d‡ fdd„	Zdd„ Zd	d
„ Z			ddd„Z‡  ZS )ÚRelPosMHAXLa÷  This class implements the relative multihead implementation similar to that in Transformer XL
    https://arxiv.org/pdf/1901.02860.pdf

    Arguments
    ---------
    embed_dim : int
        Size of the encoder feature vectors from which keys and values are computed.
    num_heads: int
        Number of attention heads.
    dropout : float, optional
        Dropout rate.
    vbias: bool, optional
        Whether to use bias for computing value.
    vdim: int, optional
        Size for value. Default is embed_dim (Note each head is embed_dim // num_heads).
    mask_pos_future: bool, optional
        Whether to mask future positional encodings values.
        Must be true for causal applications e.g. decoder.

    Example
    -------
    >>> inputs = torch.rand([6, 60, 512])
    >>> pos_emb = torch.rand([1, 2*60-1, 512])
    >>> net = RelPosMHAXL(num_heads=8, embed_dim=inputs.shape[-1])
    >>> outputs, attn = net(inputs, inputs, inputs, pos_emb)
    >>> outputs.shape
    torch.Size([6, 60, 512])
    ç        FNc                    s¢  t ƒ  ¡  || _|d ur|n|| _| j|k| _|| _|| _|| _|| _|| | _	| j| | _
| j	| | jks:J dƒ‚| j
| | jksFJ dƒ‚| jdu rct t d| |¡¡| _t t | j|¡¡| _nt t d| |¡¡| _|r|t t | j¡¡| _nd | _t |¡| _t | j|¡| _tj||dd| _t t | j	| j¡¡| _t t | j	| j¡¡| _t|  ¡ ƒjtjkr¼d| _ntdƒ | _|   ¡  d	t! "| j¡ | _#d S )
Nú(embed_dim must be divisible by num_headsú#vdim must be divisible by num_headsFrH   é   r   éÿÿr8   r   )$r   r   Ú	embed_dimÚvdimÚ_qkv_same_embed_dimÚmask_pos_futureÚvbiasÚ	num_headsÚdropoutÚhead_dimÚ	vhead_dimr   Ú	Parameterr3   rs   Úqk_proj_weightÚv_proj_weightÚin_proj_weightÚvalue_bias_weightÚDropoutÚdropout_attr   Úout_projÚ
linear_posÚ
pos_bias_uÚ
pos_bias_vÚnextÚ
parametersrf   Úfloat16Úattn_fill_valuerP   Ú_reset_parametersrl   r[   Úscale)r   r‰   rŽ   r   r   rŠ   rŒ   r"   r$   r%   r   Æ  sR   
	
ÿÿ
ÿÿÿÿzRelPosMHAXL.__init__c                 C   sx   | j rtjj | j¡ ntjj | j¡ tjj | j¡ | jd ur*tjj 	| j
d¡ tjj | j¡ tjj | j¡ d S ©Nr„   )r‹   r3   r   ÚinitÚxavier_uniform_r•   r“   r”   r   Ú	constant_r–   r›   rœ   r+   r$   r$   r%   r¡     s   
zRelPosMHAXL._reset_parametersc                 C   sÊ   |  ¡ \}}}}tjjj|dd}| ||d|¡}|dd…dd…dd…f  ||||¡}| jrYtj|  d¡|  d¡f|jd}|t 	||  d¡|  d¡ ¡dddd…dd…f  }|d	d|d d …f S )
zRelative shift implementation.)r   r   )Úpadr   Nr   rH   r‡   ©r0   .)
r1   r3   r   Ú
functionalr§   ÚviewrŒ   Úonesr0   Útril)r   r€   ÚbÚhÚqlenÚpos_lenr«   r$   r$   r%   Ú	rel_shift  s   & 4zRelPosMHAXL.rel_shiftTc              	   C   s  |j d }|j d }	|j d }
| jrz||u st ||¡rA||u s&t ||¡rAtj || j¡ |d| j	| j
d ¡jddd\}}}n;| jjddd\}}}tj ||¡ |d| j	| j
¡}tj ||¡ |d| j	| j
¡}tj ||¡ |d| j	| j
¡}nt‚| jdurŽ|| j dd| j	| j¡ }|  |¡ dd| j	| j
¡}|| j dd| j	| j
¡  dd¡}|| j dd| j	| j
¡  dd¡}t || j | dddd¡¡}t || j | dddd¡¡}|  |¡}|| }|dur|jdkr÷| dd|
|	¡}n	| d| j	|
|	¡}|jtjkr| || j¡}n||7 }|dur%| | |dd|	¡| j¡}tj |dtj!d}|  "|¡}|durG|jtjkrF| |d	¡}n	 |durX| | |dd|	¡d	¡}t || dd¡¡}| dd¡ #¡  |d| j| j	 ¡}|  $|¡}|r||fS |S )
aX	  Compute attention.

        Arguments
        ---------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        pos_embs : torch.Tensor
            bidirectional sinusoidal positional embedding tensor (1, 2*S-1, E) where S is the max length between source and target sequence lengths,
            and E is the embedding dimension.
        key_padding_mask : torch.Tensor
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        attn_mask : torch.Tensor
            2D mask (L, S) where L is the target sequence length, S is
            the source sequence length.
            3D mask (N*num_heads, L, S) where N is the batch
            size, L is the target sequence length, S is the source sequence
            length. attn_mask ensure that position i is allowed to attend the
            unmasked positions. If a ByteTensor is provided, the non-zero
            positions are not allowed to attend while the zero positions will
            be unchanged. If a BoolTensor is provided, positions with True is
            not allowed to attend while False values will be unchanged. If a
            FloatTensor is provided, it will be added to the attention weight.
        return_attn_weights : bool
            Whether to additionally return the attention weights.

        Returns
        -------
        out : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_score : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
        r   r   r   r‡   r   rH   N)r   rf   r„   )%Úshaper‹   r3   Úequalr   r©   Úlinearr•   rª   rŽ   r   ÚchunkÚNotImplementedErrorr“   r”   r‘   r   r–   rš   r›   rQ   rœ   r_   r¢   Úpermuter±   Úndimrf   Úboolr6   r    ÚFr   rk   r˜   Ú
contiguousr™   )r   r`   ÚkeyÚvalueÚpos_embsÚkey_padding_maskÚ	attn_maskÚreturn_attn_weightsÚbszÚklenr¯   ÚqweightÚkweightÚvweightÚp_kÚq_with_bias_uÚq_with_bias_vÚ	matrix_acÚ	matrix_bdÚ
attn_scorer€   rc   r$   r$   r%   r?   %  sž   
:

þ
ÿÿÿÿ

ÿ
ÿþþÿÿ


ÿ
þ


þÿ
ý
zRelPosMHAXL.forward)r„   FNF)NNT)	rB   rC   rD   rE   r   r¡   r±   r?   rF   r$   r$   r"   r%   rƒ   ¨  s    !ù?ørƒ   c                
       sd   e Zd ZdZ						d‡ fdd„	Z				ddeej d	eej d
edeej fdd„Z	‡  Z
S )ÚMultiheadAttentionaß  The class is a wrapper of MultiHead Attention for torch.nn.MultiHeadAttention.

    Reference: https://pytorch.org/docs/stable/nn.html

    Arguments
    ---------
    nhead : int
        parallel attention heads.
    d_model : int
        The size of the model layers.
    dropout : float
        a Dropout layer on attn_output_weights (default: 0.0).
    bias : bool
        add bias as module parameter (default: True).
    add_bias_kv : bool
        add bias to the key and value sequences at dim=0.
    add_zero_attn : bool
        add a new batch of zeros to the key and value sequences at dim=1.
    kdim : int
        total number of features in key (default: None).
    vdim : int
        total number of features in value (default: None).

    Example
    -------
    >>> inputs = torch.rand([8, 60, 512])
    >>> net = MultiheadAttention(nhead=8, d_model=inputs.shape[-1])
    >>> outputs, attn = net(inputs, inputs, inputs)
    >>> outputs.shape
    torch.Size([8, 60, 512])
    r„   TFNc	           	   
      s*   t ƒ  ¡  tj||||||||d| _d S )N)r‰   rŽ   r   r   Úadd_bias_kvÚadd_zero_attnÚkdimrŠ   )r   r   r   rÍ   Úatt)	r   ÚnheadÚd_modelr   r   rÎ   rÏ   rÐ   rŠ   r"   r$   r%   r     s   
øzMultiheadAttention.__init__rÀ   r¿   rÁ   r¾   c           
      C   s€   |  ddd¡}|  ddd¡}|  ddd¡}|dur$|dur"||7 }n|}| j||||||d\}}	|  ddd¡}|r>||	fS |S )a	  Compute attention.

        Arguments
        ---------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        attn_mask : torch.Tensor, optional
            2D mask (L, S) where L is the target sequence length, S is
            the source sequence length.
            3D mask (N*num_heads, L, S) where N is the batch
            size, L is the target sequence length, S is the source sequence
            length. attn_mask ensure that position i is allowed to attend the
            unmasked positions. If a ByteTensor is provided, the non-zero
            positions are not allowed to attend while the zero positions will
            be unchanged. If a BoolTensor is provided, positions with True is
            not allowed to attend while False values will be unchanged. If a
            FloatTensor is provided, it will be added to the attention weight.
        key_padding_mask : torch.Tensor, optional
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        return_attn_weights : bool, optional
            True to additionally return the attention weights, False otherwise.
        pos_embs : torch.Tensor, optional
            Positional embeddings added to the attention map of shape (L, S, E) or (L, S, 1).

        Returns
        -------
        attn_output : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_output_weights : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
            This is returned only if `return_attn_weights=True` (True by default).
        r   r   rH   N)rÀ   r¿   Úneed_weights)r·   rÑ   )
r   r`   r¼   r½   rÀ   r¿   rÁ   r¾   ÚoutputÚattention_weightsr$   r$   r%   r?     s&   9

ú
zMultiheadAttention.forward)r„   TFFNN)NNTN)rB   rC   rD   rE   r   r   r3   rT   r¹   r?   rF   r$   r$   r"   r%   rÍ   ã  s,    $÷øûúùørÍ   c                       s4   e Zd ZdZdddejf‡ fdd„	Zdd„ Z‡  ZS )ÚPositionalwiseFeedForwardu  The class implements the positional-wise feed forward module in
    â€œAttention Is All You Needâ€.

    Arguments
    ---------
    d_ffn: int
        Hidden layer size.
    input_shape : tuple, optional
        Expected shape of the input. Alternatively use ``input_size``.
    input_size : int, optional
        Expected size of the input. Alternatively use ``input_shape``.
    dropout: float, optional
        Dropout rate.
    activation: torch.nn.Module, optional
        activation functions to be applied (Recommendation: ReLU, GELU).

    Example
    -------
    >>> inputs = torch.rand([8, 60, 512])
    >>> net = PositionalwiseFeedForward(256, input_size=inputs.shape[-1])
    >>> outputs = net(inputs)
    >>> outputs.shape
    torch.Size([8, 60, 512])
    Nr„   c              	      s`   t ƒ  ¡  |d u r|d u rtdƒ‚|d u r|d }t t ||¡|ƒ t |¡t ||¡¡| _d S )Nz)Expected one of input_shape or input_sizer   )r   r   Ú
ValueErrorr   Ú
Sequentialr   r—   Úffn)r   Úd_ffnÚinput_shapeÚ
input_sizer   Ú
activationr"   r$   r%   r     s   



üz"PositionalwiseFeedForward.__init__c                 C   s*   |  ddd¡}|  |¡}|  ddd¡}|S )z8Applies PositionalwiseFeedForward to the input tensor x.r   r   rH   )r·   rÚ   r   r$   r$   r%   r?   ¤  s   
z!PositionalwiseFeedForward.forward)	rB   rC   rD   rE   r   ÚReLUr   r?   rF   r$   r$   r"   r%   r×   s  s    úr×   c                       s6   e Zd ZdZdededejdejf‡ fdd„Z‡  Z	S )ÚPrecomputedRoPESinusoidsa  
    A cache for the sines and cosines needed to rotate the vectors for rotary
    position embeddings (RoPE).
    This stores the nonzero entries from eq(15) from
    https://arxiv.org/pdf/2104.09864

    Arguments
    ---------
    max_length : int
        The allowed max length of the input sequence.
        For a fixed setting of the other arguments, the computation takes
        O(max_length) time.
    input_size : int
        Size of each vector in the input sequence, i.e. the dimension of each
        attention head.
    dtype : torch.dtype
        The dtype of the tensors.
    device : torch.device
        The Torch device to put the tensors on.

    Example
    -------
    >>> precomputed = PrecomputedRoPESinusoids(3, 8, torch.float32, torch.device('cpu'))
    >>> precomputed.cosines.shape
    torch.Size([3, 8])
    >>> precomputed.sines.shape == precomputed.cosines.shape
    True
    >>> precomputed.cosines
    tensor([[ 1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000],
            [ 0.5403,  0.5403,  0.9950,  0.9950,  0.9999,  0.9999,  1.0000,  1.0000],
            [-0.4161, -0.4161,  0.9801,  0.9801,  0.9998,  0.9998,  1.0000,  1.0000]])
    >>> precomputed.sines
    tensor([[-0.0000,  0.0000, -0.0000,  0.0000, -0.0000,  0.0000, -0.0000,  0.0000],
            [-0.8415,  0.8415, -0.0998,  0.0998, -0.0100,  0.0100, -0.0010,  0.0010],
            [-0.9093,  0.9093, -0.1987,  0.1987, -0.0200,  0.0200, -0.0020,  0.0020]])
    >>> precomputed.index_swap
    tensor([1, 0, 3, 2, 5, 4, 7, 6])
    Ú
max_lengthrÝ   rf   r0   c              	      sN  t ƒ  ¡  |tjkrtjntj}|d dksJ ‚|| _t tjd|d||dt 	d¡|   ¡}tj||d}tjd|||d}t 
||¡}	t |	¡}
tj|
|
gdd ||¡}
t |	¡}tj||gdd ||¡}dtj|||d |  }tj|dd d… |d d d… gdd d¡}|  d	|
 |¡¡ |  d
| |¡¡ |  d|¡ d S )NrH   r   rq   rg   r¨   r   r   r   ÚcosinesÚsinesÚ
index_swap)r   r   r3   Úfloat64rk   rá   ri   rj   rl   rm   Úouterru   ÚstackÚreshapert   rn   rx   )r   rá   rÝ   rf   r0   Úinternal_dtypeÚanglesÚ
dimensionsÚtimesÚtimes_anglesrâ   Úunsigned_sinesÚunsigned_repeated_sinesrã   rä   r"   r$   r%   r   Ø  sH   
ÿÿÿ
ÿ
ÿþÿý	ÿþz!PrecomputedRoPESinusoids.__init__)
rB   rC   rD   rE   r‚   r3   rf   r0   r   rF   r$   r$   r"   r%   rà   °  s    'þýüûrà   c                   @   s:   e Zd ZdZdedeegef fdd„Zdefdd„Zd	S )
ÚMemoiseAtLeastSizea  
    Memoises a function which has as its first argument a value that indicates a
    minimum value to call the underlying function with.

    Arguments
    ---------
    function: Callable
        The function to call.
    round_up: Callable[[Any], Any]
        A function that rounds up.
        The fewer values this rounds up to, the less likely it is that the
        function will be called repeatedly.
    ÚfunctionÚround_upc                 C   s   || _ || _i | _d S rW   )rñ   rò   Úmemo)r   rñ   rò   r$   r$   r%   r   )  s   
zMemoiseAtLeastSize.__init__r1   c                 G   s\   || j vs| j | d |k r'|  |¡}||k rJ ‚|| j|g|¢R Ž f| j |< | j | d S )Nr   r   )ró   rò   rñ   )r   r1   ÚargsÚrounded_sizer$   r$   r%   Ú__call__1  s
   
zMemoiseAtLeastSize.__call__N)rB   rC   rD   rE   r   r   r   rö   r$   r$   r$   r%   rð     s    rð   rò   Úreturnc                    s   dt dtf‡ fdd„}|S )a  
    Decorator that memoises a function which has as its first argument a value
    that indicates a minimum value to call the underlying function with.
    If the memo has stored the result from a matching previous function call,
    The stored result will be returned instead of calling the function again.

    Arguments
    ---------
    round_up: Callable[[Any], Any]
        A function that rounds up.
        This will be called with the first argument passed in.
        The underlying function will receive, instead of this first argument,
        the rounded-up version.
        The fewer values this rounds up to, the less likely it is that the
        function will be called repeatedly.

    Returns
    -------
    The passed function but with MemoiseAtLeastSize capability.
    rñ   r÷   c                    s
   t | ˆ ƒS )z2
        Set the function to be memoised.
        )rð   )rñ   ©rò   r$   r%   Úwith_functionQ  s   
z'memoise_at_least.<locals>.with_function)r   rð   )rò   rù   r$   rø   r%   Úmemoise_at_least9  s   rú   c                 C   s   dt t t | ¡¡ƒ S )NrH   )r‚   rl   ÚceilÚlog2)Úlengthr$   r$   r%   Ú<lambda>Z  s    rþ   rý   rÝ   rf   r0   c                 C   s0   t tt | ¡ƒƒ}| d| ksJ ‚t| |||ƒS )a)  
    Return an object of type PrecomputedRoPESinusoids that is valid for the
    length, input_size, dtype and device.
    Consider a single (input_size, dtype, device), which are usually fixed for
    one model.
    The sinusoids will be recomputed only if they are not yet available for such
    a long length (because of the decorator applied to the function).
    Each time they are precomputed, the length is rounded up to the next power
    of two.

    As a consequence, the total number of calls during one program run is
    upper-bounded by ceil(log2(max_length)) where max_length is the highest
    length that is seen in the program run.
    On realistic lengths, the total number of calls is likely only a few.
    The total number of time steps for which sinusoids are precomputed during
    the program run is O(max_length).

    Arguments
    ---------
    length : int
        The length of the input sequence.
    input_size : int
        Size of each vector in the input sequence, i.e. the dimension of each
        attention head.
    dtype : torch.dtype
        The dtype of the tensors.
    device : torch.device
        The Torch device to put the tensors on.

    Return
    ------
    An object of type PrecomputedRoPESinusoids that is valid for the length,
    input_size, dtype and device.
    rH   )r‚   Úroundrl   rü   rà   )rý   rÝ   rf   r0   Úlength_powerr$   r$   r%   Ú_get_precomputed_valuesZ  s   (r  c           	      C   sz   | j \}}}}|d dksJ ‚t||| j| jƒ}|jd|… }|jd|… }tj| d|jd}| | 	d¡ || 	d¡  S )z~
    Perform the rotation for RoPE on each of the vectors in x.
    Details about RoPE: https://arxiv.org/pdf/2104.09864.
    rH   r   Nr   )r   Úindexr   )
r²   r  rf   r0   râ   rã   r3   Úindex_selecträ   r2   )	r€   Ú_batch_sizerý   Ú
_num_headsr   Úprecomputedrâ   rã   Úswapped_pairsr$   r$   r%   Ú_rope_rotate‡  s   r  c                       sB   e Zd ZdZ			d‡ fdd„	Zdd„ Z					dd
d„Z‡  ZS )ÚRoPEMHAaï  This is an implementation of multihead self-attention with RoPE positional embeddings. As it relies on Torch for self-attention, it is
    significantly faster than RelPosMHAXL while offering the same or better levels of accuracy.

    Details about RoPE: https://arxiv.org/pdf/2104.09864.


    Arguments
    ---------
    embed_dim : int
        Size of the encoder feature vectors from which keys and values are computed.
    num_heads: int
        Number of attention heads.
    dropout : float, optional
        Dropout rate.
    vbias: bool, optional
        Whether to use bias for computing value.
    vdim: int, optional
        Size for value. Default is embed_dim (Note each head is embed_dim // num_heads).

    Example
    -------
    >>> max_len = 64
    >>> inputs = torch.rand([6, 60, 512])
    >>> num_heads = 8
    >>> net = RoPEMHA(num_heads=num_heads, embed_dim=inputs.shape[-1])
    >>> outputs, attn = net(inputs, inputs, inputs)
    >>> outputs.shape
    torch.Size([6, 60, 512])
    r„   FNc                    sZ  t ƒ  ¡  || _|d ur|n|| _| j|k| _|| _|| _|| _|| | _| j| | _	| j| | jks7J dƒ‚| j	| | jksCJ dƒ‚| jdu r`t
 t d| |¡¡| _t
 t | j|¡¡| _nt
 t d| |¡¡| _|ryt
 t | j¡¡| _nd | _t
 |¡| _t
 | j|¡| _t|  ¡ ƒjtjkr˜d| _ntdƒ | _|  ¡  dt | j¡ | _d S )	Nr…   r†   FrH   r‡   rˆ   r8   r   ) r   r   r‰   rŠ   r‹   r   rŽ   r   r   r‘   r   r’   r3   rs   r“   r”   r•   r–   r—   r˜   r   r™   r   rž   rf   rŸ   r    rP   r¡   rl   r[   r¢   )r   r‰   rŽ   r   r   rŠ   r"   r$   r%   r   ¾  sB   

ÿÿ
ÿÿzRoPEMHA.__init__c                 C   s\   | j rtjj | j¡ ntjj | j¡ tjj | j¡ | jd ur,tjj 	| j
d¡ d S d S r£   )r‹   r3   r   r¤   r¥   r•   r“   r”   r   r¦   r–   r+   r$   r$   r%   r¡   ó  s   
ÿzRoPEMHA._reset_parametersTc              	   C   s¾  |du sJ dƒ‚|j d }|j d }	| jr}||u st ||¡rD||u s)t ||¡rDtj || j¡ |d| j	| j
d ¡jddd\}}}n;| jjddd\}
}}tj ||
¡ |d| j	| j
¡}tj ||¡ |d| j	| j
¡}tj ||¡ |d| j	| j
¡}nt‚| jdur‘|| j dd| j	| j¡ }t|ƒ}t|ƒ}t||	| j	||ƒ}tj| dddd¡| dddd¡| dddd¡|| j| jd	}| dd¡ ¡  |d| j| j	 ¡}|  |¡}|rÝ|dfS |S )
a  Compute attention through Pytorch attention.

        Arguments
        ---------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        key_padding_mask : torch.Tensor
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        attn_mask : torch.BoolTensor
            2D mask (L, S) where L is the target sequence length, S is
            the source sequence length. The positions with the value of True will be ignored while the position with the value of False will be unchanged.
        pos_embs : torch.Tensor
            Not used by this class. It is kept for compliance.
        return_attn_weights : bool
            Whether to additionally return the attention weights.

        Returns
        -------
        out : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_score : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
        Nzpos_embs is not supportedr   r   r   r‡   r   rH   )r`   r¼   r½   rÀ   Ú	dropout_pr¢   )r²   r‹   r3   r³   r   r©   r´   r•   rª   rŽ   r   rµ   r¶   r   r–   r‘   r  Úmasks_unionrº   Úscaled_dot_product_attentionr·   r   r¢   rQ   r»   r™   )r   r`   r¼   r½   r¿   rÀ   r¾   rÁ   rÂ   rÃ   rÄ   rÅ   rÆ   Ú	q_rotatedÚ	k_rotatedÚfinal_masksr€   rc   r$   r$   r%   r?   ý  s^   0

þ
ÿÿÿÿ
ÿÿú

ý
zRoPEMHA.forward)r„   FN)NNNT)rB   rC   rD   rE   r   r¡   r?   rF   r$   r$   r"   r%   r	  Ÿ  s    "ú5ør	  c                 C   s†   d}|dur|  | dd|¡ | |||¡}|}|dur*|  dd||¡ | |||¡}|}|dur8|dur8t ||¡}|durAt |¡}|S )a¯  This is an utility function combining standard key_padding_mask and
    attn_mask from SpeechBrain into a single one for scaled_dot_product_attention. This function does not support weighting of the attn_score. Hence, if one wish to use float values as masks, they should not use this function.

    Arguments
    ---------
    bsz : int
        Batch size dimension.
    klen : int
        Time dimension of the key tensor. (Sequence length).
    num_heads : int
        Number of heads of the attention module using these masks.
    attn_mask : torch.BoolTensor
        2D mask (L, S) where L is the target sequence length, S is
        the source sequence length. The positions with the value of True will be ignored while the position with the value of False will be unchanged.
    key_padding_mask : torch.BoolTensor
        (B, S) where B is the batch size, S is the source sequence
        length. The positions with the value of True will be ignored while the position with the value of False will be unchanged.

    Returns
    -------
    out : torch.BoolTensor
        (bsz, num_heads, klen, klen) where False values are masked and True are unmasked (opposite of the input tensors).

    Nr   )rª   Úexpandr3   Ú
logical_orÚlogical_not)rÂ   rÃ   rŽ   rÀ   r¿   Ú
final_maskr$   r$   r%   r  l  s    ÿÿ
r  )(rE   rl   Útypingr   r   r   r   r   Únumpyr7   r3   Útorch.nnr   Útorch.nn.functionalr©   rº   Úspeechbrain.dataio.dataior   Úspeechbrain.utils.loggerr   rB   ÚloggerÚModuler	   rG   rV   rd   rƒ   rÍ   r×   rà   rð   rú   r‚   rf   r0   r  r  r	  r  r$   r$   r$   r%   Ú<module>   sX    
\ Ma  = =jÿ
þ
!ÿÿÿÿþ, N