o
    p’×iž|  ã                   @   sè   d Z ddlZddlmZ ddlZddlZddlmZ ddl	m  m
Z ddlmZ ddlmZ eeƒZG dd„ dejƒZG dd	„ d	ejƒZG d
d„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZdS )zŒLibrary implementing attention modules.

Authors
 * Ju-Chieh Chou 2020
 * Jianyuan Zhong 2020
 * Loren Lugosch 2020
 * Samuele Cornell 2020
é    N)ÚOptional)Úlength_to_mask)Ú
get_loggerc                       s2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
ÚContentBasedAttentiona”  This class implements content-based attention module for seq2seq
    learning.

    Reference: NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN
    AND TRANSLATE, Bahdanau et.al. https://arxiv.org/pdf/1409.0473.pdf

    Arguments
    ---------
    enc_dim : int
        Size of encoder layer.
    dec_dim : int
        Size of decoder layer.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.
    scaling : float
        The factor controls the sharpening degree (default: 1.0).

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = ContentBasedAttention(enc_dim=20, dec_dim=25, attn_dim=30, output_dim=5)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    ç      ð?c                    sf   t ƒ  ¡  t ||¡| _t ||¡| _tj|ddd| _t ||¡| _|| _tj	dd| _
|  ¡  d S )Né   F©Úbiaséÿÿÿÿ©Údim)ÚsuperÚ__init__ÚnnÚLinearÚmlp_encÚmlp_decÚmlp_attnÚmlp_outÚscalingÚSoftmaxÚsoftmaxÚreset)ÚselfÚenc_dimÚdec_dimÚattn_dimÚ
output_dimr   ©Ú	__class__© úX/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/speechbrain/nnet/attention.pyr   7   s   
zContentBasedAttention.__init__c                 C   ó   d| _ d| _d| _dS ©z)Reset the memory in the attention module.N)Úenc_lenÚprecomputed_enc_hÚmask©r   r    r    r!   r   F   ó   
zContentBasedAttention.resetc                 C   sª   | j du r|  |¡| _ t|| d¡|jd| _|  | d¡¡}|  t	 
| j | ¡¡ d¡}| | jdktj ¡}|  || j ¡}t	 | d¡|¡ d¡}|  |¡}||fS )á  Returns the output of the attention module.

        Arguments
        ---------
        enc_states : torch.Tensor
            The tensor to be attended.
        enc_len : torch.Tensor
            The real length (without padding) of enc_states for each sentence.
        dec_states : torch.Tensor
            The query tensor.

        Returns
        -------
        The output of the attention module.
        Nr   ©Úmax_lenÚdevicer
   r   )r%   r   r   Úsizer,   r&   r   Ú	unsqueezer   ÚtorchÚtanhÚsqueezeÚmasked_fillÚnpÚinfr   r   Úbmmr   )r   Ú
enc_statesr$   Ú
dec_statesÚdec_hÚattnÚcontextr    r    r!   ÚforwardL   s    
ÿÿþ
zContentBasedAttention.forward©r   ©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r;   Ú__classcell__r    r    r   r!   r      s
    r   c                       sD   e Zd ZU dZeej ed< 	d
‡ fdd„	Zdd„ Z	dd	„ Z
‡  ZS )ÚLocationAwareAttentiona{  This class implements location-aware attention module for seq2seq learning.

    Reference: Attention-Based Models for Speech Recognition, Chorowski et.al.
    https://arxiv.org/pdf/1506.07503.pdf

    Arguments
    ---------
    enc_dim : int
        Size of encoder.
    dec_dim : int
        Size of decoder.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.
    conv_channels : int
        Number of channel for location feature.
    kernel_size : int
        Kernel size of convolutional layer for location feature.
    scaling : float
        The factor controls the sharpening degree (default: 1.0).

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = LocationAwareAttention(
    ...     enc_dim=20,
    ...     dec_dim=25,
    ...     attn_dim=30,
    ...     output_dim=5,
    ...     conv_channels=10,
    ...     kernel_size=100)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    r%   r   c                    s¤   t ƒ  ¡  t ||¡| _t ||¡| _tj|ddd| _tjd|d| d |dd| _t ||¡| _	tj|ddd| _t ||¡| _
|| _tjdd| _|  ¡  d S )Nr   Fr   é   )Úkernel_sizeÚpaddingr	   r
   r   )r   r   r   r   r   r   r   ÚConv1dÚconv_locÚmlp_locr   r   r   r   r   )r   r   r   r   r   Úconv_channelsrE   r   r   r    r!   r   ž   s"   


ûzLocationAwareAttention.__init__c                 C   s   d| _ d| _d| _d| _dS )z%Reset the memory in attention module.N)r$   r%   r&   Ú	prev_attnr'   r    r    r!   r   ¿   s   
zLocationAwareAttention.resetc                 C   sö   | j du r$|  |¡| _ t|| d¡|jd| _| jd| ¡   d¡ | _|  	| j d¡¡}|  
| dd¡¡}|  | d¡¡}|  t | j | | ¡¡ d¡}| | jdktj ¡}|  || j ¡}| ¡ | _t | d¡|¡ d¡}|  |¡}||fS )r)   Nr   r*   rD   r
   r   )r%   r   r   r-   r,   r&   Úfloatr.   rK   rH   rI   Ú	transposer   r   r/   r0   r1   r2   r3   r4   r   r   Údetachr5   r   )r   r6   r$   r7   Ú	attn_convr8   r9   r:   r    r    r!   r;   Æ   s(   
ÿÿþ

zLocationAwareAttention.forwardr<   )r>   r?   r@   rA   r   r/   ÚTensorÚ__annotations__r   r   r;   rB   r    r    r   r!   rC   t   s   
 '
ø!rC   c                       s0   e Zd ZdZ‡ fdd„Zdd„ Zdd„ Z‡  ZS )ÚKeyValueAttentionae  This class implements a single-headed key-value attention module for seq2seq
    learning.

    Reference: "Attention Is All You Need" by Vaswani et al., sec. 3.2.1

    Arguments
    ---------
    enc_dim : int
        Size of the encoder feature vectors from which keys and values are computed.
    dec_dim : int
        Size of the decoder feature vectors from which queries are computed.
    attn_dim : int
        Size of the attention feature.
    output_dim : int
        Size of the output context vector.

    Example
    -------
    >>> enc_tensor = torch.rand([4, 10, 20])
    >>> enc_len = torch.ones([4]) * 10
    >>> dec_tensor = torch.rand([4, 25])
    >>> net = KeyValueAttention(enc_dim=20, dec_dim=25, attn_dim=30, output_dim=5)
    >>> out_tensor, out_weight = net(enc_tensor, enc_len, dec_tensor)
    >>> out_tensor.shape
    torch.Size([4, 5])
    c                    sV   t ƒ  ¡  t ||¡| _t ||¡| _t ||¡| _t t 	|¡ 
¡ ¡| _|  ¡  d S )N)r   r   r   r   Ú
key_linearÚquery_linearÚvalue_linearr/   ÚsqrtÚtensorrL   r   r   )r   r   r   r   r   r   r    r!   r     s   
zKeyValueAttention.__init__c                 C   r"   r#   )ÚvaluesÚkeysr&   r'   r    r    r!   r      r(   zKeyValueAttention.resetc                 C   s¨   | j du r |  |¡| _ |  |¡| _t|| d¡|jd d¡| _|  	|¡ d¡}t
 | j |¡| j }| | jdktj ¡}| d¡ dd¡}t
 || j¡ d¡}||fS )r)   Nr   r*   rD   r   )rY   rS   rU   rX   r   r-   r,   r.   r&   rT   r/   Úmatmulr   r2   r3   r4   r   rM   r1   )r   r6   r$   r7   ÚqueryÚscoresÚnormalized_scoresÚoutr    r    r!   r;   &  s   
ÿþzKeyValueAttention.forwardr=   r    r    r   r!   rR   ù   s
    rR   c                       sX   e Zd ZdZejfdedejf‡ fdd„Ze 	¡ defdd„ƒZ
d	ejfd
d„Z‡  ZS )ÚRelPosEncXLaÕ  Relative positional encoding for the :class:`~RelPosMHAXL`.

    Arguments
    ---------
    emb_dim : int
        Size of the embedding, which controls the size of the last dimension
        of the positional embedding as well
    dtype : torch.dtype, optional
        If unspecified, defaults to `torch.float32`. Controls the data type of
        the output embedding (but does not affect the precision of the
        computations, which remain `torch.float32`).
    Úemb_dimÚdtypec                    sT   t ƒ  ¡  || _t tjd| jdtjdt d¡| j   ¡}|  	d|¡ || _
d S )Nr   rD   )ra   g     ˆÃ@Úinv_freq)r   r   r`   r/   ÚexpÚarangeÚfloat32ÚmathÚlogÚregister_bufferÚ	emb_dtype)r   r`   ra   rb   r   r    r!   r   T  s   
ÿÿ
zRelPosEncXL.__init__Úseq_lenc           
      C   sB  | j }| jj}t ¡ Œ tjd|| jftj|d}|d }|d }tjd|tj|d 	d¡}t 
|| j ¡}||dd…ddd…f< t || j ¡|dd…ddd…f< ||dd…ddd…f< t | | j ¡|dd…ddd…f< t |d¡ 	d¡}|dd…  	d¡}tj||gdd}	|	 |¡}	W d  ƒ |	S 1 sšw   Y  |	S )	ab  
        Builds the positional embedding tensor for a given sequence length.

        Arguments
        ---------
        seq_len : int
            The length of the sequence to create the position embedding for.

        Returns
        -------
        torch.Tensor
            Positional embedding tensor of shape `[1, 2*seq_len-1, embed_dim]`
        rD   )ra   r,   r   r   r
   N)r   r   )ri   rb   r,   r/   Úno_gradÚemptyr`   re   rd   r.   ÚsinÚcosÚflipÚcatÚto)
r   rj   ri   r,   Útot_peÚpe_pastÚ	pe_futureÚ	positionsÚ	sinusoidsÚper    r    r!   Úmake_pe`  s>   

ýüû"$
äâzRelPosEncXL.make_peÚxc                 C   s   | j | d¡dS )aº  
        Builds the positional embedding tensor. Similar to
        :meth:`~RelPosEncXL.make_pe` but uses the shape information from the
        provided tensor.

        Arguments
        ---------
        x : torch.Tensor
            input tensor with shape batch_size, seq_len, embed_dim

        Returns
        -------
        pos_emb : torch.Tensor
            Positional embedding tensor of shape `[1, 2*seq_len-1, embed_dim]`
        r   )rj   )rx   r-   ©r   ry   r    r    r!   r;   “  s   zRelPosEncXL.forward)r>   r?   r@   rA   r/   re   Úintra   r   rk   rx   rP   r;   rB   r    r    r   r!   r_   F  s    2r_   c                       sJ   e Zd ZdZ				d‡ fdd„	Zdd„ Zd	d
„ Z			ddd„Z‡  ZS )ÚRelPosMHAXLa÷  This class implements the relative multihead implementation similar to that in Transformer XL
    https://arxiv.org/pdf/1901.02860.pdf

    Arguments
    ---------
    embed_dim : int
        Size of the encoder feature vectors from which keys and values are computed.
    num_heads: int
        Number of attention heads.
    dropout : float, optional
        Dropout rate.
    vbias: bool, optional
        Whether to use bias for computing value.
    vdim: int, optional
        Size for value. Default is embed_dim (Note each head is embed_dim // num_heads).
    mask_pos_future: bool, optional
        Whether to mask future positional encodings values.
        Must be true for causal applications e.g. decoder.

    Example
    -------
    >>> inputs = torch.rand([6, 60, 512])
    >>> pos_emb = torch.rand([1, 2*60-1, 512])
    >>> net = RelPosMHAXL(num_heads=8, embed_dim=inputs.shape[-1])
    >>> outputs, attn = net(inputs, inputs, inputs, pos_emb)
    >>> outputs.shape
    torch.Size([6, 60, 512])
    ç        FNc                    s¢  t ƒ  ¡  || _|d ur|n|| _| j|k| _|| _|| _|| _|| _|| | _	| j| | _
| j	| | jks:J dƒ‚| j
| | jksFJ dƒ‚| jdu rct t d| |¡¡| _t t | j|¡¡| _nt t d| |¡¡| _|r|t t | j¡¡| _nd | _t |¡| _t | j|¡| _tj||dd| _t t | j	| j¡¡| _t t | j	| j¡¡| _t|  ¡ ƒjtjkr¼d| _ntdƒ | _|   ¡  d	t! "| j¡ | _#d S )
Nz(embed_dim must be divisible by num_headsz#vdim must be divisible by num_headsFrD   é   r   iÿÿr4   r   )$r   r   Ú	embed_dimÚvdimÚ_qkv_same_embed_dimÚmask_pos_futureÚvbiasÚ	num_headsÚdropoutÚhead_dimÚ	vhead_dimr   Ú	Parameterr/   rl   Úqk_proj_weightÚv_proj_weightÚin_proj_weightÚvalue_bias_weightÚDropoutÚdropout_attr   Úout_projÚ
linear_posÚ
pos_bias_uÚ
pos_bias_vÚnextÚ
parametersra   Úfloat16Úattn_fill_valuerL   Ú_reset_parametersrf   rV   Úscale)r   r   r„   r…   rƒ   r€   r‚   r   r    r!   r   Å  sR   
	
ÿÿ
ÿÿÿÿzRelPosMHAXL.__init__c                 C   sx   | j rtjj | j¡ ntjj | j¡ tjj | j¡ | jd ur*tjj 	| j
d¡ tjj | j¡ tjj | j¡ d S )Nr}   )r   r/   r   ÚinitÚxavier_uniform_r‹   r‰   rŠ   rƒ   Ú	constant_rŒ   r‘   r’   r'   r    r    r!   r—     s   
zRelPosMHAXL._reset_parametersc                 C   sÊ   |  ¡ \}}}}tjjj|dd}| ||d|¡}|dd…dd…dd…f  ||||¡}| jrYtj|  d¡|  d¡f|jd}|t 	||  d¡|  d¡ ¡dddd…dd…f  }|d	d|d d …f S )
zRelative shift implementation.)r   r   )Úpadr
   Nr   rD   r~   )r,   .)
r-   r/   r   Ú
functionalrœ   Úviewr‚   Úonesr,   Útril)r   ry   ÚbÚhÚqlenÚpos_lenrŸ   r    r    r!   Ú	rel_shift  s   & 4zRelPosMHAXL.rel_shiftTc              	   C   s  |j d }|j d }	|j d }
| jrz||u st ||¡rA||u s&t ||¡rAtj || j¡ |d| j	| j
d ¡jddd\}}}n;| jjddd\}}}tj ||¡ |d| j	| j
¡}tj ||¡ |d| j	| j
¡}tj ||¡ |d| j	| j
¡}nt‚| jdurŽ|| j dd| j	| j¡ }|  |¡ dd| j	| j
¡}|| j dd| j	| j
¡  dd¡}|| j dd| j	| j
¡  dd¡}t || j | dddd¡¡}t || j | dddd¡¡}|  |¡}|| }|dur|jdkr÷| dd|
|	¡}n	| d| j	|
|	¡}|jtjkr| || j¡}n||7 }|dur%| | |dd|	¡| j¡}tj |dtj!d}|  "|¡}|durG|jtjkrF| |d	¡}n	 |durX| | |dd|	¡d	¡}t || dd¡¡}| dd¡ #¡  |d| j| j	 ¡}|  $|¡}|r||fS |S )
aX	  Compute attention.

        Arguments
        ---------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        pos_embs : torch.Tensor
            bidirectional sinusoidal positional embedding tensor (1, 2*S-1, E) where S is the max length between source and target sequence lengths,
            and E is the embedding dimension.
        key_padding_mask : torch.Tensor
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        attn_mask : torch.Tensor
            2D mask (L, S) where L is the target sequence length, S is
            the source sequence length.
            3D mask (N*num_heads, L, S) where N is the batch
            size, L is the target sequence length, S is the source sequence
            length. attn_mask ensure that position i is allowed to attend the
            unmasked positions. If a ByteTensor is provided, the non-zero
            positions are not allowed to attend while the zero positions will
            be unchanged. If a BoolTensor is provided, positions with True is
            not allowed to attend while False values will be unchanged. If a
            FloatTensor is provided, it will be added to the attention weight.
        return_attn_weights : bool
            Whether to additionally return the attention weights.

        Returns
        -------
        out : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_score : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
        r   r   r
   r~   r   rD   N)r   ra   r}   )%Úshaper   r/   Úequalr   r   Úlinearr‹   rž   r„   r†   ÚchunkÚNotImplementedErrorr‰   rŠ   r‡   rƒ   rŒ   r   r‘   rM   r’   rZ   r˜   Úpermuter¥   Úndimra   Úboolr2   r–   ÚFr   re   rŽ   Ú
contiguousr   )r   r[   ÚkeyÚvalueÚpos_embsÚkey_padding_maskÚ	attn_maskÚreturn_attn_weightsÚbszÚklenr£   ÚqweightÚkweightÚvweightÚp_kÚq_with_bias_uÚq_with_bias_vÚ	matrix_acÚ	matrix_bdÚ
attn_scorery   r^   r    r    r!   r;   $  sž   
:

þ
ÿÿÿÿ

ÿ
ÿþþÿÿ


ÿ
þ


þÿ
ý
zRelPosMHAXL.forward)r}   FNF)NNT)	r>   r?   r@   rA   r   r—   r¥   r;   rB   r    r    r   r!   r|   §  s    !ù?ør|   c                
       sd   e Zd ZdZ						d‡ fdd„	Z				ddeej d	eej d
edeej fdd„Z	‡  Z
S )ÚMultiheadAttentionaß  The class is a wrapper of MultiHead Attention for torch.nn.MultiHeadAttention.

    Reference: https://pytorch.org/docs/stable/nn.html

    Arguments
    ---------
    nhead : int
        parallel attention heads.
    d_model : int
        The size of the model layers.
    dropout : float
        a Dropout layer on attn_output_weights (default: 0.0).
    bias : bool
        add bias as module parameter (default: True).
    add_bias_kv : bool
        add bias to the key and value sequences at dim=0.
    add_zero_attn : bool
        add a new batch of zeros to the key and value sequences at dim=1.
    kdim : int
        total number of features in key (default: None).
    vdim : int
        total number of features in value (default: None).

    Example
    -------
    >>> inputs = torch.rand([8, 60, 512])
    >>> net = MultiheadAttention(nhead=8, d_model=inputs.shape[-1])
    >>> outputs, attn = net(inputs, inputs, inputs)
    >>> outputs.shape
    torch.Size([8, 60, 512])
    r}   TFNc	           	   
      s*   t ƒ  ¡  tj||||||||d| _d S )N)r   r„   r…   r	   Úadd_bias_kvÚadd_zero_attnÚkdimr€   )r   r   r   rÁ   Úatt)	r   ÚnheadÚd_modelr…   r	   rÂ   rÃ   rÄ   r€   r   r    r!   r     s   
øzMultiheadAttention.__init__r´   r³   rµ   r²   c           
      C   s€   |  ddd¡}|  ddd¡}|  ddd¡}|dur$|dur"||7 }n|}| j||||||d\}}	|  ddd¡}|r>||	fS |S )a	  Compute attention.

        Arguments
        ---------
        query : torch.Tensor
            (B, L, E) where L is the target sequence length,
            B is the batch size, E is the embedding dimension.
        key : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        value : torch.Tensor
            (B, S, E) where S is the source sequence length,
            B is the batch size, E is the embedding dimension.
        attn_mask : torch.Tensor, optional
            2D mask (L, S) where L is the target sequence length, S is
            the source sequence length.
            3D mask (N*num_heads, L, S) where N is the batch
            size, L is the target sequence length, S is the source sequence
            length. attn_mask ensure that position i is allowed to attend the
            unmasked positions. If a ByteTensor is provided, the non-zero
            positions are not allowed to attend while the zero positions will
            be unchanged. If a BoolTensor is provided, positions with True is
            not allowed to attend while False values will be unchanged. If a
            FloatTensor is provided, it will be added to the attention weight.
        key_padding_mask : torch.Tensor, optional
            (B, S) where B is the batch size, S is the source sequence
            length. If a ByteTensor is provided, the non-zero positions will
            be ignored while the position with the zero positions will be
            unchanged. If a BoolTensor is provided, the positions with the
            value of True will be ignored while the position with the value
            of False will be unchanged.
        return_attn_weights : bool, optional
            True to additionally return the attention weights, False otherwise.
        pos_embs : torch.Tensor, optional
            Positional embeddings added to the attention map of shape (L, S, E) or (L, S, 1).

        Returns
        -------
        attn_output : torch.Tensor
            (B, L, E) where L is the target sequence length, B is the
            batch size, E is the embedding dimension.
        attn_output_weights : torch.Tensor
            (B, L, S) where B is the batch size, L is the target
            sequence length, S is the source sequence length.
            This is returned only if `return_attn_weights=True` (True by default).
        r   r   rD   N)r´   r³   Úneed_weights)r«   rÅ   )
r   r[   r°   r±   r´   r³   rµ   r²   ÚoutputÚattention_weightsr    r    r!   r;     s&   9

ú
zMultiheadAttention.forward)r}   TFFNN)NNTN)r>   r?   r@   rA   r   r   r/   rP   r­   r;   rB   r    r    r   r!   rÁ   â  s,    $÷øûúùørÁ   c                       s4   e Zd ZdZdddejf‡ fdd„	Zdd„ Z‡  ZS )ÚPositionalwiseFeedForwardu  The class implements the positional-wise feed forward module in
    â€œAttention Is All You Needâ€.

    Arguments
    ---------
    d_ffn: int
        Hidden layer size.
    input_shape : tuple, optional
        Expected shape of the input. Alternatively use ``input_size``.
    input_size : int, optional
        Expected size of the input. Alternatively use ``input_shape``.
    dropout: float, optional
        Dropout rate.
    activation: torch.nn.Module, optional
        activation functions to be applied (Recommendation: ReLU, GELU).

    Example
    -------
    >>> inputs = torch.rand([8, 60, 512])
    >>> net = PositionalwiseFeedForward(256, input_size=inputs.shape[-1])
    >>> outputs = net(inputs)
    >>> outputs.shape
    torch.Size([8, 60, 512])
    Nr}   c              	      s`   t ƒ  ¡  |d u r|d u rtdƒ‚|d u r|d }t t ||¡|ƒ t |¡t ||¡¡| _d S )Nz)Expected one of input_shape or input_sizer
   )r   r   Ú
ValueErrorr   Ú
Sequentialr   r   Úffn)r   Úd_ffnÚinput_shapeÚ
input_sizer…   Ú
activationr   r    r!   r   Œ  s   



üz"PositionalwiseFeedForward.__init__c                 C   s*   |  ddd¡}|  |¡}|  ddd¡}|S )z8Applies PositionalwiseFeedForward to the input tensor x.r   r   rD   )r«   rÎ   rz   r    r    r!   r;   £  s   
z!PositionalwiseFeedForward.forward)	r>   r?   r@   rA   r   ÚReLUr   r;   rB   r    r    r   r!   rË   r  s    úrË   )rA   rf   Útypingr   Únumpyr3   r/   Útorch.nnr   Útorch.nn.functionalr   r®   Úspeechbrain.dataio.dataior   Úspeechbrain.utils.loggerr   r>   ÚloggerÚModuler   rC   rR   r_   r|   rÁ   rË   r    r    r    r!   Ú<module>   s*    	\ Ma  = 