o
    i                     @   s,   d Z ddlZddlZG dd dejjZdS )zFastformer attention definition.

Reference:
    Wu et al., "Fastformer: Additive Attention Can Be All You Need"
    https://arxiv.org/abs/2108.09084
    https://github.com/wuch15/Fastformer

    Nc                       s@   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Z  Z	S )FastSelfAttentionz'Fast self-attention used in Fastformer.c                    s   t    || dkrtd| d| d|| | _|| _tj||| _tj||| _	tj||| _
tj||| _tj||| _tj|| _d S )Nr   zHidden size (z1) is not an integer multiple of attention heads ())super__init__
ValueErrorattention_head_sizenum_attention_headstorchnnLinearquery	query_attkeykey_att	transformDropoutdropout)selfsizeattention_headsdropout_rate	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/branchformer/fastformer.pyr      s   

zFastSelfAttention.__init__c                 C   s   |  | j d S )N)applyinit_weights)r   r   r   r   espnet_initialization_fn'   s   z*FastSelfAttention.espnet_initialization_fnc                 C   sP   t |tjjr|jjjddd t |tjjr$|jd ur&|jj  d S d S d S )N        g{Gz?)meanstd)	
isinstancer	   r
   r   weightdatanormal_biaszero_)r   moduler   r   r   r   *   s
   zFastSelfAttention.init_weightsc                 C   s,   |j dd | j| jf }|j| ddS )zReshape and transpose to compute scores.

        Args:
            x: (batch, time, size = n_heads * attn_dim)

        Returns:
            (batch, n_heads, time, attn_dim)
        N      )shaper   r   reshape	transpose)r   xnew_x_shaper   r   r   transpose_for_scores0   s
   
z&FastSelfAttention.transpose_for_scoresc                 C   s  |j \}}}| |}| |}|dur|d}| |dd| jd  }|durNtt	t
jd|jd jj}	|||	}t
j|dd|d	}
nt
j|dd}
|
d}
| |}t
|
|dddd| j| j }| |}|d|d}|| }| || jd  dd}|durtt	t
jd|jd jj}	|||	}t
j|dd|d	}nt
j|dd}|d}| |}t
||}| |}|| dd}||j dd
 | j| j f }| | || }|S )zForward method.

        Args:
            xs_pad: (batch, time, size = n_heads * attn_dim)
            mask: (batch, 1, time), nonpadding is 1, padding is 0

        Returns:
            torch.Tensor: (batch, time, size)
        Nr   r)   r*   g      ?)dtyper(   )dimr   )r+   r   r   eqr   r-   r   floatnumpyfinfor	   tensorr1   minmasked_fillsoftmax	unsqueezer0   matmulr,   r   r   repeatr   r   )r   xs_padmask
batch_sizeseq_len_mixed_query_layermixed_key_layerquery_for_score	min_valuequery_weightquery_layerpooled_querypooled_query_repeatmixed_query_key_layerquery_key_scorequery_key_weight	key_layer
pooled_keyweighted_valuer   r   r   forward@   sb   







zFastSelfAttention.forward)
__name__
__module____qualname____doc__r   r   r   r0   rR   __classcell__r   r   r   r   r      s    r   )rV   r6   r	   r
   Moduler   r   r   r   r   <module>   s    	