o
    ´©i8[  ã                   @   sÚ   d Z ddlZddlZddlZddlmZ ddlmZmZ ddlm  m	Z
 ddlmZ ddlm  m  mZ G dd„ dejƒZG dd	„ d	ejƒZG d
d„ deƒZG dd„ deƒZG dd„ deƒZG dd„ dejjƒZdS )z&Multi-Head Attention layer definition.é    N)Únn)ÚOptionalÚTuple)Úmake_pad_maskc                       s8   e Zd ZdZ‡ fdd„Zdd„ Zdd„ Zdd	„ Z‡  ZS )
ÚMultiHeadedAttentionz±Multi-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.

    c                    s~   t t| ƒ ¡  || dksJ ‚|| | _|| _t ||¡| _t ||¡| _t ||¡| _	t ||¡| _
d| _tj|d| _dS )ú)Construct an MultiHeadedAttention object.r   N©Úp)Úsuperr   Ú__init__Úd_kÚhr   ÚLinearÚlinear_qÚlinear_kÚlinear_vÚ
linear_outÚattnÚDropoutÚdropout)ÚselfÚn_headÚn_featÚdropout_rate©Ú	__class__© úW/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/transformer/attention.pyr      s   
zMultiHeadedAttention.__init__c                 C   s†   |  d¡}|  |¡ |d| j| j¡}|  |¡ |d| j| j¡}|  |¡ |d| j| j¡}| dd¡}| dd¡}| dd¡}|||fS )a	  Transform query, key and value.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).

        Returns:
            torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
            torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
            torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).

        r   éÿÿÿÿé   é   )Úsizer   Úviewr   r   r   r   Ú	transpose©r   ÚqueryÚkeyÚvalueÚn_batchÚqÚkÚvr   r   r   Úforward_qkv-   s   

z MultiHeadedAttention.forward_qkvc           	      C   s    |  d¡}|dur(| d¡ d¡}tdƒ }| ||¡}tj|dd |d¡}ntj|dd}|  |¡}t ||¡}| 	dd¡ 
¡  |d| j| j ¡}|  |¡S )	aÒ  Compute attention context vector.

        Args:
            value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).

        Returns:
            torch.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).

        r   Nr   Úinfr   ©Údimç        r    )r!   Ú	unsqueezeÚeqÚfloatÚmasked_fillÚtorchÚsoftmaxr   Úmatmulr#   Ú
contiguousr"   r   r   r   )	r   r'   ÚscoresÚmaskr(   Ú	min_valuer   Úp_attnÚxr   r   r   Úforward_attentionE   s    
ÿÿ
 ÿ
z&MultiHeadedAttention.forward_attentionc           	      C   óB   |   |||¡\}}}t || dd¡¡t | j¡ }|  |||¡S )aË  Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

        éþÿÿÿr   ©r,   r5   r7   r#   ÚmathÚsqrtr   r>   ©	r   r%   r&   r'   r:   r)   r*   r+   r9   r   r   r   Úforwardh   s    zMultiHeadedAttention.forward)	Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r,   r>   rE   Ú__classcell__r   r   r   r   r      s    	#r   c                       sJ   e Zd Z‡ fdd„Zdd„ Zdejdejfdd„Zd	d
„ Zdd„ Z	‡  Z
S )ÚMultiHeadedAttentionExportc                    sR   t ƒ  ¡  |j| _|j| _|j| _|j| _|j| _|j| _d | _| j| j | _	d S ©N)
r
   r   r   r   r   r   r   r   r   Úall_head_size©r   Úmodelr   r   r   r   |   s   
z#MultiHeadedAttentionExport.__init__c           	      C   r?   )Nr@   r   rA   rD   r   r   r   rE   ‡   s    z"MultiHeadedAttentionExport.forwardr=   Úreturnc                 C   s6   |  ¡ d d… | j| jf }| |¡}| dddd¡S )Nr   r   r    r   é   )r!   r   r   r"   Úpermute)r   r=   Únew_x_shaper   r   r   Útranspose_for_scoresŒ   s   
z/MultiHeadedAttentionExport.transpose_for_scoresc                 C   sF   |   |¡}|  |¡}|  |¡}|  |¡}|  |¡}|  |¡}|||fS rL   )r   r   r   rT   )r   r%   r&   r'   r)   r*   r+   r   r   r   r,   ‘   s   






z&MultiHeadedAttentionExport.forward_qkvc                 C   ób   || }t j|dd}t  ||¡}| dddd¡ ¡ }| ¡ d d… | jf }| |¡}|  |¡S ©Nr   r.   r   r    r   rQ   r@   ©	r5   r6   r7   rR   r8   r!   rM   r"   r   ©r   r'   r9   r:   r   Úcontext_layerÚnew_context_layer_shaper   r   r   r>   š   ó   

z,MultiHeadedAttentionExport.forward_attention)rF   rG   rH   r   rE   r5   ÚTensorrT   r,   r>   rJ   r   r   r   r   rK   {   s    	rK   c                       s4   e Zd Z‡ fdd„Zdd„ Zdd„ Zdd„ Z‡  ZS )	Ú RelPosMultiHeadedAttentionExportc                    s(   t ƒ  |¡ |j| _|j| _|j| _d S rL   )r
   r   Ú
linear_posÚ
pos_bias_uÚ
pos_bias_vrN   r   r   r   r   §   s   z)RelPosMultiHeadedAttentionExport.__init__c                 C   s¨   |   |||¡\}}}| dd¡}|  |  |¡¡}	|| j  dd¡}
|| j  dd¡}t |
| dd¡¡}t ||	 dd¡¡}|  |¡}|| t	 
| j¡ }|  |||¡S )Nr   r    r@   r   )r,   r#   rT   r^   r_   r`   r5   r7   Ú	rel_shiftrB   rC   r   r>   )r   r%   r&   r'   Úpos_embr:   r)   r*   r+   r	   Úq_with_bias_uÚq_with_bias_vÚ	matrix_acÚ	matrix_bdr9   r   r   r   rE   ­   s   
z(RelPosMultiHeadedAttentionExport.forwardc                 C   s¾   t jg | ¡ d d… ¢d‘R |j|jd}t j||gdd}|jg | ¡ d d… ¢| d¡d ‘| d¡‘R Ž }|d d …d d …dd …f  |¡d d …d d …d d …d | d¡d d …f }|S )NrQ   r   ©ÚdeviceÚdtyper   r.   r    )r5   Úzerosr!   rh   ri   Úcatr"   Úview_as)r   r=   Úzero_padÚx_paddedr   r   r   ra   Ç   s   *4(ÿz*RelPosMultiHeadedAttentionExport.rel_shiftc                 C   rU   rV   rW   rX   r   r   r   r>   Ñ   r[   z2RelPosMultiHeadedAttentionExport.forward_attention)rF   rG   rH   r   rE   ra   r>   rJ   r   r   r   r   r]   ¦   s
    
r]   c                       ó2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
Ú%LegacyRelPositionMultiHeadedAttentiona®  Multi-Head Attention layer with relative position encoding (old version).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    Paper: https://arxiv.org/abs/1901.02860

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.

    Fc                    ó|   t ƒ  |||¡ || _tj||dd| _t t | j	| j
¡¡| _t t | j	| j
¡¡| _tjj | j¡ tjj | j¡ dS ©z4Construct an RelPositionMultiHeadedAttention object.F©ÚbiasN©r
   r   Ú	zero_triur   r   r^   Ú	Parameterr5   r\   r   r   r_   r`   ÚinitÚxavier_uniform_©r   r   r   r   rv   r   r   r   r   ì   ó   z.LegacyRelPositionMultiHeadedAttention.__init__c                 C   sè   t jg | ¡ dd… ¢d‘R |j|jd}t j||gdd}|jg | ¡ dd… ¢| d¡d ‘| d¡‘R Ž }|dd…dd…dd…f  |¡}| jrrt  	| d¡| d¡f¡}|t  
|| d¡| d¡ ¡dddd…dd…f  }|S )zÁCompute relative positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, time2).

        Returns:
            torch.Tensor: Output tensor.

        NrQ   r   rg   r   r.   r    ©r5   rj   r!   rh   ri   rk   r"   rl   rv   ÚonesÚtril©r   r=   rm   rn   r}   r   r   r   ra   ù   s   *
4 4z/LegacyRelPositionMultiHeadedAttention.rel_shiftc                 C   óÈ   |   |||¡\}}}| dd¡}| d¡}	|  |¡ |	d| j| j¡}
|
 dd¡}
|| j  dd¡}|| j  dd¡}t	 
|| dd¡¡}t	 
||
 dd¡¡}|  |¡}|| t | j¡ }|  |||¡S )aB  Compute 'Scaled Dot Product Attention' with rel. positional encoding.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            pos_emb (torch.Tensor): Positional embedding tensor (#batch, time1, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

        r   r    r   r   r@   ©r,   r#   r!   r^   r"   r   r   r_   r`   r5   r7   ra   rB   rC   r>   ©r   r%   r&   r'   rb   r:   r)   r*   r+   Ún_batch_posr	   rc   rd   re   rf   r9   r   r   r   rE     s   

z-LegacyRelPositionMultiHeadedAttention.forward©F©rF   rG   rH   rI   r   ra   rE   rJ   r   r   r   r   rp   Ý   s
    rp   c                       ro   )
ÚRelPositionMultiHeadedAttentionaµ  Multi-Head Attention layer with relative position encoding (new implementation).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    Paper: https://arxiv.org/abs/1901.02860

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.

    Fc                    rq   rr   ru   rz   r   r   r   r   I  r{   z(RelPositionMultiHeadedAttention.__init__c                 C   s  t jg | ¡ dd… ¢d‘R |j|jd}t j||gdd}|jg | ¡ dd… ¢| d¡d ‘| d¡‘R Ž }|dd…dd…dd…f  |¡dd…dd…dd…d| d¡d d …f }| jrŠt j	| d¡| d¡f|jd}|t  
|| d¡| d¡ ¡dddd…dd…f  }|S )	zùCompute relative positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
            time1 means the length of query vector.

        Returns:
            torch.Tensor: Output tensor.

        NrQ   r   rg   r   r.   r    )rh   r|   r   r   r   r   ra   V  s   *4(ÿ 4z)RelPositionMultiHeadedAttention.rel_shiftc                 C   r€   )aV  Compute 'Scaled Dot Product Attention' with rel. positional encoding.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            pos_emb (torch.Tensor): Positional embedding tensor
                (#batch, 2*time1-1, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

        r   r    r   r   r@   r   r‚   r   r   r   rE   o  s   

z'RelPositionMultiHeadedAttention.forwardr„   r…   r   r   r   r   r†   :  s
    r†   c                       sN  e Zd ZdZ		d"dededededd	f
‡ fd
d„Zd#dej	dedej	fdd„Z
	d#dej	dej	dej	dedej	f
dd„Z	d#dej	dej	dej	dedej	f
dd„Zdej	dej	dej	deej	ej	ej	f fdd„Z		d$dej	dej	dej	deej	 dej	f
dd„Z			d%dej	dej	dej	dej	dej	deej	 dedej	fd d!„Z‡  ZS )&Ú$RelPositionMultiHeadedAttentionChunkz°RelPositionMultiHeadedAttention definition.
    Args:
        num_heads: Number of attention heads.
        embed_size: Embedding size.
        dropout_rate: Dropout rate.
    r0   FÚ	num_headsÚ
embed_sizer   Úsimplified_attention_scorerP   Nc                    s  t ƒ  ¡  || | _|| _| j| |ksJ d||ffƒ‚tj ||¡| _tj ||¡| _tj ||¡| _	tj ||¡| _
|rKtj ||¡| _| j| _n6tjj||dd| _tj t || j¡¡| _tj t || j¡¡| _tjj | j¡ tjj | j¡ | j| _tjj|d| _d| _dS )r   z3embed_size (%d) must be divisible by num_heads (%d)Frs   r   N)r
   r   r   rˆ   r5   r   r   r   r   r   r   r^   Ú"compute_simplified_attention_scoreÚcompute_att_scorerw   r\   r_   r`   rx   ry   Úcompute_attention_scorer   r   r   )r   rˆ   r‰   r   rŠ   r   r   r   r   £  s,   

þ

z-RelPositionMultiHeadedAttentionChunk.__init__r   r=   Úleft_contextc                 C   sR   |j \}}}}|| }| ¡ \}}	}
}|j||||f||	|
| |f||d  dS )zòCompute relative positional encoding.
        Args:
            x: Input sequence. (B, H, T_1, 2 * T_1 - 1)
            left_context: Number of frames in left context.
        Returns:
            x: Output sequence. (B, H, T_1, T_2)
        r   )Ústorage_offset)ÚshapeÚstrideÚ
as_strided)r   r=   rŽ   Ú
batch_sizeÚn_headsÚtime1ÚnÚtime2Úbatch_strideÚn_heads_strideÚtime1_strideÚn_strider   r   r   ra   Ì  s   

ýz.RelPositionMultiHeadedAttentionChunk.rel_shiftr%   r&   Úpos_encc                 C   s`   |   |¡}t || dd¡¡}| j| dd¡ d¡ dd| d¡d¡|d}|| t 	| j
¡ S )aº  Simplified attention score computation.
        Reference: https://github.com/k2-fsa/icefall/pull/458
        Args:
            query: Transformed query tensor. (B, H, T_1, d_k)
            key: Transformed key tensor. (B, H, T_2, d_k)
            pos_enc: Positional embedding tensor. (B, 2 * T_1 - 1, size)
            left_context: Number of frames in left context.
        Returns:
            : Attention score. (B, H, T_1, T_2)
        r    rQ   r   ©rŽ   )r^   r5   r7   r#   ra   r1   Úrepeatr!   rB   rC   r   )r   r%   r&   rœ   rŽ   re   rf   r   r   r   r‹   ß  s   
"þzGRelPositionMultiHeadedAttentionChunk.compute_simplified_attention_scorec           
   	   C   sž   |   |¡ | d¡d| j| j¡}| dd¡}|| j  dd¡}|| j  dd¡}t 	|| dd¡¡}t 	|| 
dddd¡¡}	| j|	|d}	||	 t | j¡ S )aq  Attention score computation.
        Args:
            query: Transformed query tensor. (B, H, T_1, d_k)
            key: Transformed key tensor. (B, H, T_2, d_k)
            pos_enc: Positional embedding tensor. (B, 2 * T_1 - 1, size)
            left_context: Number of frames in left context.
        Returns:
            : Attention score. (B, H, T_1, T_2)
        r   r   r   r    r@   rQ   r   )r^   r"   r!   rˆ   r   r#   r_   r`   r5   r7   rR   ra   rB   rC   )
r   r%   r&   rœ   rŽ   r	   rc   rd   re   rf   r   r   r   r   û  s    z<RelPositionMultiHeadedAttentionChunk.compute_attention_scorer'   c                 C   sz   |  d¡}|  |¡ |d| j| j¡ dd¡}|  |¡ |d| j| j¡ dd¡}|  |¡ |d| j| j¡ dd¡}|||fS )a{  Transform query, key and value.
        Args:
            query: Query tensor. (B, T_1, size)
            key: Key tensor. (B, T_2, size)
            v: Value tensor. (B, T_2, size)
        Returns:
            q: Transformed query tensor. (B, H, T_1, d_k)
            k: Transformed key tensor. (B, H, T_2, d_k)
            v: Transformed value tensor. (B, H, T_2, d_k)
        r   r   r   r    )r!   r   r"   rˆ   r   r#   r   r   r$   r   r   r   r,     s
   
"""
z0RelPositionMultiHeadedAttentionChunk.forward_qkvr9   r:   Ú
chunk_maskc                 C   sž   |  d¡}| d¡ d¡}|dur| d¡ d¡|B }| |tdƒ¡}tj|dd |d¡}|  |¡}t ||¡}|  | 	dd¡ 
¡  |d| j| j ¡¡}|S )	af  Compute attention context vector.
        Args:
            value: Transformed value. (B, H, T_2, d_k)
            scores: Attention score. (B, H, T_1, T_2)
            mask: Source mask. (B, T_2)
            chunk_mask: Chunk mask. (T_1, T_1)
        Returns:
           attn_output: Transformed value weighted by attention score. (B, T_1, H * d_k)
        r   r   r    Nz-infr   r.   r0   )r!   r1   r4   r3   r5   r6   r   r7   r   r#   r8   r"   rˆ   r   )r   r'   r9   r:   rŸ   r“   r   Úattn_outputr   r   r   r>   -  s   

 ÿz6RelPositionMultiHeadedAttentionChunk.forward_attentionc                 C   s8   |   |||¡\}}	}
| j||	||d}| j|
|||dS )a  Compute scaled dot product attention with rel. positional encoding.
        Args:
            query: Query tensor. (B, T_1, size)
            key: Key tensor. (B, T_2, size)
            value: Value tensor. (B, T_2, size)
            pos_enc: Positional embedding tensor. (B, 2 * T_1 - 1, size)
            mask: Source mask. (B, T_2)
            chunk_mask: Chunk mask. (T_1, T_1)
            left_context: Number of frames in left context.
        Returns:
            : Output tensor. (B, T_1, H * d_k)
        r   )rŸ   )r,   rŒ   r>   )r   r%   r&   r'   rœ   r:   rŸ   rŽ   r)   r*   r+   r9   r   r   r   rE   M  s   z,RelPositionMultiHeadedAttentionChunk.forward)r0   F)r   rL   )Nr   )rF   rG   rH   rI   Úintr3   Úboolr   r5   r\   ra   r‹   r   r   r,   r   r>   rE   rJ   r   r   r   r   r‡   ›  s     ûþýüûú)ûþýüû
ú!ûþýüû
úÿÿÿ
þûþýüû
ú'øþýüûúùø	÷r‡   )rI   rB   Únumpyr5   r   Útypingr   r   Útorch.nn.functionalÚ
functionalÚFÚ*funasr.models.transformer.utils.nets_utilsr   Úfunasr.models.lora.layersÚmodelsÚloraÚlayersÚModuler   rK   r]   rp   r†   r‡   r   r   r   r   Ú<module>   s   f+7]a