o
    iW                    @   s  d Z ddlZddlZddlZddlm  mZ ddlm	Z	 ddlm
Z
 d,ddZG d	d
 d
ejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd dejjZG dd  d ejjZG d!d" d"ejjZG d#d$ d$ejjZd-d&d'Z	%d.d(d)Zd*d+ ZdS )/zAttention modules for RNN.    N)make_pad_mask)	to_device      c                 C   st   |  ddkrtd|| }|| }|dkr$td | ddd|f< ||  dk r8td | dd|df< | S )a  Apply monotonic attention constraint.

    This function apply the monotonic attention constraint
    introduced in `Deep Voice 3: Scaling
    Text-to-Speech with Convolutional Sequence Learning`_.

    Args:
        e (Tensor): Attention energy before applying softmax (1, T).
        last_attended_idx (int): The index of the inputs of the last attended [0, T].
        backward_window (int, optional): Backward window size in attention constraint.
        forward_window (int, optional): Forward window size in attetion constraint.

    Returns:
        Tensor: Monotonic constrained attention energy (1, T).

    .. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`:
        https://arxiv.org/abs/1710.07654

    r   r   z2Batch attention constraining is not yet supported.infN)sizeNotImplementedErrorfloat)elast_attended_idxbackward_windowforward_windowbackward_idxforward_idx r   _/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/language_model/rnn/attentions.py_apply_attention_constraint   s   r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )NoAttzNo attentionc                    s*   t t|   d | _d | _d | _d | _d S N)superr   __init__h_lengthenc_hpre_compute_enc_hcself	__class__r   r   r   /   s
   
zNoAtt.__init__c                 C      d| _ d| _d| _d| _dS zreset statesN)r   r   r   r   r   r   r   r   reset6      
zNoAtt.resetc                 C   s   t |}| jdu r|| _| jd| _|du r@dt|  }|||d }|	| j}t
j| j||| jd dd| _| j|fS )a  NoAtt forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B, T_max, D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: dummy (does not use)
        :param torch.Tensor att_prev: dummy (does not use)
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights
        :rtype: torch.Tensor
        Nr         ?dim)lenr   r   r   r   r   r	   new	unsqueezetotorchsumviewr   )r   
enc_hs_pad
enc_hs_lendec_zatt_prevbatchmaskr   r   r   forward=   s   
"
zNoAtt.forward__name__
__module____qualname____doc__r   r!   r4   __classcell__r   r   r   r   r   ,   s
    r   c                       4   e Zd ZdZd
 fdd	Zdd Zddd	Z  ZS )AttDota  Dot product attention

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param bool han_mode: flag to swith on mode of hierarchical attention
        and not store pre_compute_enc_h
    Fc                    sb   t t|   tj||| _tj||| _|| _|| _	|| _
d | _d | _d | _d | _|| _d S r   )r   r<   r   r+   nnLinearmlp_encmlp_decdunitseprojsatt_dimr   r   r   r3   han_moder   rB   rA   rC   rD   r   r   r   r   d   s   
zAttDot.__init__c                 C   r   r    r   r   r   r3   r   r   r   r   r!   r   r"   zAttDot.reset       @c           
      C   s   | d}| jdu s| jr!|| _| j d| _t| | j| _|du r-||| j	}n|
|| j	}tj| jt| |
|d| j dd}| jdu rWt|t|| _|| jtd  tj|| dd}tj| j|
|| jd dd}	|	|fS )a   AttDot forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: dummy (does not use)
        :param torch.Tensor att_prev: dummy (does not use)
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weight (B x T_max)
        :rtype: torch.Tensor
        r   Nr      r%   r   )r   r   rD   r   r   r+   tanhr?   	new_zerosrA   r-   r,   r@   rC   r3   r   r   masked_fill_r	   Fsoftmax)
r   r.   r/   r0   r1   scalingr2   r
   wr   r   r   r   r4   y   s$   
 
 zAttDot.forwardFrG   r5   r   r   r   r   r<   Z   
    	r<   c                       r;   )AttAdda  Additive attention

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param bool han_mode: flag to swith on mode of hierarchical attention
        and not store pre_compute_enc_h
    Fc                    sv   t t|   tj||| _tjj||dd| _tj|d| _|| _	|| _
|| _d | _d | _d | _d | _|| _d S NFbiasr   )r   rS   r   r+   r=   r>   r?   r@   gvecrA   rB   rC   r   r   r   r3   rD   rE   r   r   r   r      s   
zAttAdd.__init__c                 C   r   r    rF   r   r   r   r   r!      r"   zAttAdd.resetrG   c                 C   s   t |}| jdu s| jr|| _| jd| _| | j| _|du r)||| j}n|	|| j}| 
|	|d| j}| t| j| d}| jdu rVt|t|| _|| jtd  tj|| dd}	tj| j|		|| jd dd}
|
|	fS )a-  AttAdd forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param torch.Tensor att_prev: dummy (does not use)
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights (B x T_max)
        :rtype: torch.Tensor
        Nr   rH   r   r%   )r'   r   rD   r   r   r   r?   rJ   rA   r-   r@   rC   rW   r+   rI   squeezer3   r   r   rK   r	   rL   rM   r,   )r   r.   r/   r0   r1   rN   r2   dec_z_tiledr
   rO   r   r   r   r   r4      s    
 zAttAdd.forwardrP   rQ   r5   r   r   r   r   rS      rR   rS   c                       s<   e Zd ZdZd fdd	Zdd Z					
dddZ  ZS )AttLoca  location-aware attention module.

    Reference: Attention-Based Models for Speech Recognition
        (https://arxiv.org/pdf/1506.07503.pdf)

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode: flag to swith on mode of hierarchical attention
        and not store pre_compute_enc_h
    Fc                    s   t t|   tj||| _tjj||dd| _tjj||dd| _tjj	d|dd| d fd|fdd| _
tj|d| _|| _|| _|| _d | _d | _d | _d | _|| _d S NFrU   r   rH   r   paddingrV   )r   rZ   r   r+   r=   r>   r?   r@   mlp_attConv2dloc_convrW   rA   rB   rC   r   r   r   r3   rD   r   rB   rA   rC   aconv_chansaconv_filtsrD   r   r   r   r     s(   
zAttLoc.__init__c                 C   r   r    rF   r   r   r   r   r!     r"   zAttLoc.resetrG   Nr   r   c	                 C   s~  t |}	| jdu s| jr|| _| jd| _| | j| _|du r)||	| j}n|	|	| j}|du rKdt
|j|j|jd }|||d }| |	|	dd| j}
|
ddd}
| |
}
| |	|	d| j}| t|
| j | d}| jdu rt|t
|| _|| jtd  |durt||||}tj|| dd}tj | j|	|	| jd dd}||fS )	a  Calculate AttLoc forward propagation.

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param torch.Tensor att_prev: previous attention weight (B x T_max)
        :param float scaling: scaling parameter before applying softmax
        :param torch.Tensor forward_window:
            forward window size when constraining attention
        :param int last_attended_idx: index of the inputs of the last attended
        :param int backward_window: backward window size in attention constraint
        :param int forward_window: forward window size in attetion constraint
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights (B x T_max)
        :rtype: torch.Tensor
        Nr   r#   )devicedtyper$   rH   r   r%   )!r'   r   rD   r   r   r   r?   rJ   rA   r-   r   r*   rd   re   r(   r)   r`   rX   	transposer^   r@   rC   rW   r+   rI   r3   r   rK   r	   r   rL   rM   r,   )r   r.   r/   r0   r1   rN   r   r   r   r2   att_convrY   r
   rO   r   r   r   r   r4   "  s0   
 
 zAttLoc.forwardrP   )rG   Nr   r   r5   r   r   r   r   rZ      s    rZ   c                       r;   )AttCova  Coverage mechanism attention

    Reference: Get To The Point: Summarization with Pointer-Generator Network
       (https://arxiv.org/abs/1704.04368)

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param bool han_mode: flag to swith on mode of hierarchical attention
        and not store pre_compute_enc_h
    Fc                    s   t t|   tj||| _tjj||dd| _tjd|| _tj|d| _	|| _
|| _|| _d | _d | _d | _d | _|| _d S rT   )r   rh   r   r+   r=   r>   r?   r@   wvecrW   rA   rB   rC   r   r   r   r3   rD   rE   r   r   r   r     s   
zAttCov.__init__c                 C   r   r    rF   r   r   r   r   r!     r"   zAttCov.resetrG   c                 C   sT  t |}| jdu s| jr|| _| jd| _| | j| _|du r)||| j}n|	|| j}|du rJt
|dt|  }|||d g}t|}| |d}| |	|d| j}| t|| j | d}	| jdu r~t
|t|| _|	| jtd  tj||	 dd}
||
g7 }tj| j|
	|| jd dd}||fS )a+  AttCov forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param list att_prev_list: list of previous attention weight
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: list of previous attention weights
        :rtype: list
        Nr   r#   r$   rH   r   r%   )r'   r   rD   r   r   r   r?   rJ   rA   r-   r   r   r	   r(   r)   r,   ri   r@   rC   rW   r+   rI   rX   r3   rK   rL   rM   )r   r.   r/   r0   att_prev_listrN   r2   cov_vecrY   r
   rO   r   r   r   r   r4     s,    

 zAttCov.forwardrP   rQ   r5   r   r   r   r   rh   r  s
    rh   c                       r;   )AttLoc2Da  2D location-aware attention

    This attention is an extended version of location aware attention.
    It take not only one frame before attention weights,
    but also earlier frames into account.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param int att_win: attention window size (default=5)
    :param bool han_mode:
        flag to swith on mode of hierarchical attention and not store pre_compute_enc_h
    Fc                    s   t t|   tj||| _tjj||dd| _tjj||dd| _tjj	d||d| d fd|fdd| _
tj|d| _|| _|| _|| _d | _d | _d | _|| _|| _d | _|| _d S r[   )r   rl   r   r+   r=   r>   r?   r@   r^   r_   r`   rW   rA   rB   rC   r   r   r   rb   att_winr3   rD   )r   rB   rA   rC   rm   rb   rc   rD   r   r   r   r     s,   
zAttLoc2D.__init__c                 C   r   r    rF   r   r   r   r   r!     r"   zAttLoc2D.resetrG   c                 C   s  t |}| jdu s| jr|| _| jd| _| | j| _|du r)||| j}n|	|| j}|du rTt
|dt|  }|||d }|dd| jd}| |d}|ddd}| |}| |	|d| j}| t|| j | d}	| jdu rt
|t|| _|	| jtd  tj||	 dd}
tj| j|
	|| jd dd}tj||
dgdd}|ddddf }||fS )aT  AttLoc2D forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param torch.Tensor att_prev: previous attention weight (B x att_win x T_max)
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights (B x att_win x T_max)
        :rtype: torch.Tensor
        Nr   r#   r$   rH   r   r%   ) r'   r   rD   r   r   r   r?   rJ   rA   r-   r   r   r	   r(   r)   expandrm   r`   rX   rf   r^   r@   rC   rW   r+   rI   r3   rK   rL   rM   r,   cat)r   r.   r/   r0   r1   rN   r2   rg   rY   r
   rO   r   r   r   r   r4     s2   
 
 zAttLoc2D.forwardrP   rQ   r5   r   r   r   r   rl     s
    rl   c                       r;   )	AttLocRecaP  location-aware recurrent attention

    This attention is an extended version of location aware attention.
    With the use of RNN,
    it take the effect of the history of attention weights into account.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode:
        flag to swith on mode of hierarchical attention and not store pre_compute_enc_h
    Fc                    s   t t|   tj||| _tjj||dd| _tjjd|dd| d fd|fdd| _	tjj
||dd| _tj|d| _|| _|| _|| _d | _d | _d | _d | _|| _d S r[   )r   rp   r   r+   r=   r>   r?   r@   r_   r`   LSTMCellatt_lstmrW   rA   rB   rC   r   r   r   r3   rD   ra   r   r   r   r   W  s(   
zAttLocRec.__init__c                 C   r   r    rF   r   r   r   r   r!   n  r"   zAttLocRec.resetrG   c                 C   s  t |}| jdu s| jr|| _| jd| _| | j| _|du r)||| j}n|	|| j}|du r\t
|dt|  }|||d }||| j}||| j}	||	f}
n|d }|d }
| |	|dd| j}t|}t|d|df	|d}| ||
\}}	| |	|d| j}| t|d| j | d}| jdu rt
|t|| _|| jtd  tj|| dd	}tj| j|	|| jd dd	}||||	fffS )
a  AttLocRec forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param tuple att_prev_states: previous attention weight and lstm states
                                      ((B, T_max), ((B, att_dim), (B, att_dim)))
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights and lstm states (w, (hx, cx))
                 ((B, T_max), ((B, att_dim), (B, att_dim)))
        :rtype: tuple
        Nr   r#   r$   r   r   rH   r   r%   )r'   r   rD   r   r   r   r?   rJ   rA   r-   r   r   r	   r(   r)   rC   r`   rL   relu
max_pool2drr   r@   rW   r+   rI   rX   r3   rK   rM   r,   )r   r.   r/   r0   att_prev_statesrN   r2   r1   att_hatt_c
att_statesrg   rY   r
   rO   r   r   r   r   r4   u  s@   


 zAttLocRec.forwardrP   rQ   r5   r   r   r   r   rp   G  s
    rp   c                       r;   )	AttCovLoca  Coverage mechanism location aware attention

    This attention is a combination of coverage and location-aware attentions.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode:
        flag to swith on mode of hierarchical attention and not store pre_compute_enc_h
    Fc                    s   t t|   tj||| _tjj||dd| _tjj||dd| _tjj	d|dd| d fd|fdd| _
tj|d| _|| _|| _|| _d | _d | _d | _|| _d | _|| _d S r[   )r   ry   r   r+   r=   r>   r?   r@   r^   r_   r`   rW   rA   rB   rC   r   r   r   rb   r3   rD   ra   r   r   r   r     s*   
zAttCovLoc.__init__c                 C   r   r    rF   r   r   r   r   r!     r"   zAttCovLoc.resetrG   c                 C   sx  t |}| jdu s| jr|| _| jd| _| | j| _|du r)||| j}n|	|| j}|du rJdt
|  }t||||d g}t|}| |	|dd| j}	|	ddd}	| |	}	| |	|d| j}
| t|	| j |
 d}| jdu rt|t
|| _|| jtd  tj|| dd}||g7 }tj| j|	|| jd dd}||fS )a.  AttCovLoc forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param list att_prev_list: list of previous attention weight
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: list of previous attention weights
        :rtype: list
        Nr   r#   r$   rH   r   r%   )r'   r   rD   r   r   r   r?   rJ   rA   r-   r   r	   r   r(   r)   r,   r`   rX   rf   r^   r@   rC   rW   r+   rI   r3   rK   rL   rM   )r   r.   r/   r0   rj   rN   r2   r3   rk   rg   rY   r
   rO   r   r   r   r   r4     s0   
 

 zAttCovLoc.forwardrP   rQ   r5   r   r   r   r   ry     s
    ry   c                       2   e Zd ZdZd	 fdd	Zdd Zdd Z  ZS )
AttMultiHeadDota  Multi head dot product attention

    Reference: Attention is all you need
        (https://arxiv.org/abs/1706.03762)

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int att_dim_k: dimension k in multi head attention
    :param int att_dim_v: dimension v in multi head attention
    :param bool han_mode: flag to swith on mode of hierarchical attention
        and not store pre_compute_k and pre_compute_v
    Fc                    s  t t|   tj | _tj | _tj | _t	j
|D ]-}|  jtj||g7  _|  jtjj||ddg7  _|  jtjj||ddg7  _qtjj|| |dd| _|| _|| _|| _|| _|| _dt| | _d | _d | _d | _d | _d | _|| _d S )NFrU   r#   )r   r{   r   r+   r=   
ModuleListmlp_qmlp_kmlp_vsixmovesranger>   mlp_orA   rB   aheads	att_dim_k	att_dim_vmathsqrtrN   r   r   pre_compute_kpre_compute_vr3   rD   r   rB   rA   r   r   r   rD   _r   r   r   r   =  s*    
zAttMultiHeadDot.__init__c                 C   "   d| _ d| _d| _d| _d| _dS r    r   r   r   r   r3   r   r   r   r   r!   T  
   
zAttMultiHeadDot.resetc           
   	      s  | d} jdu s jr&| _ j d _ fddtj jD  _ j	du s. jrG| _ j d _ fddtj jD  _	|du rS|
| j}n|| j}g }g }tj jD ]W}tj j| t j| ||d j dd}	 jdu rt|t| _|	 jtd	  |tj j|	 ddg7 }|tj j	| || | jd ddg7 }qe tj|dd}||fS )
a  AttMultiHeadDot forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param torch.Tensor att_prev: dummy (does not use)
        :return: attention weighted encoder state (B x D_enc)
        :rtype: torch.Tensor
        :return: list of previous attention weight (B x T_max) * aheads
        :rtype: list
        r   Nr   c                    s"   g | ]}t  j|  jqS r   )r+   rI   r~   r   .0hr   r   r   
<listcomp>o  s    z+AttMultiHeadDot.forward.<locals>.<listcomp>c                       g | ]
} j |  jqS r   r   r   r   r   r   r   r   w      rH   r%   r   )r   r   rD   r   r   r   r   r   r   r   rJ   rA   r-   r+   r,   rI   r}   r   r3   r   r   rK   r	   rL   rM   rN   r   ro   
r   r.   r/   r0   r1   r2   r   rO   r   r
   r   r   r   r4   \  s<   


0zAttMultiHeadDot.forwardrP   r5   r   r   r   r   r{   .  s
    r{   c                       rz   )
AttMultiHeadAdda^  Multi head additive attention

    Reference: Attention is all you need
        (https://arxiv.org/abs/1706.03762)

    This attention is multi head attention using additive attention for each head.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int att_dim_k: dimension k in multi head attention
    :param int att_dim_v: dimension v in multi head attention
    :param bool han_mode: flag to swith on mode of hierarchical attention
        and not store pre_compute_k and pre_compute_v
    Fc                    s.  t t|   tj | _tj | _tj | _tj | _	t
j|D ]:}|  jtj||g7  _|  jtjj||ddg7  _|  jtjj||ddg7  _|  j	tj|dg7  _	q%tjj|| |dd| _|| _|| _|| _|| _|| _dt| | _d | _d | _d | _d | _d | _|| _d S )NFrU   r   r#   )r   r   r   r+   r=   r|   r}   r~   r   rW   r   r   r   r>   r   rA   rB   r   r   r   r   r   rN   r   r   r   r   r3   rD   r   r   r   r   r     s.   
zAttMultiHeadAdd.__init__c                 C   r   r    r   r   r   r   r   r!     r   zAttMultiHeadAdd.resetc           
   
      s  | d} jdu s jr&| _ j d _ fddtj jD  _ j	du s. jrG| _ j d _ fddtj jD  _	|du rS|
| j}n|| j}g }g }tj jD ]Z} j| t j|  j| ||d j d}	 jdu rt|t| _|	 jtd  |tj j|	 dd	g7 }|tj j	| || | jd dd	g7 }qe tj|dd	}||fS )
a  AttMultiHeadAdd forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param torch.Tensor att_prev: dummy (does not use)
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: list of previous attention weight (B x T_max) * aheads
        :rtype: list
        r   Nr   c                    r   r   r~   r   r   r   r   r   r     r   z+AttMultiHeadAdd.forward.<locals>.<listcomp>c                    r   r   r   r   r   r   r   r     r   rH   r   r%   )r   r   rD   r   r   r   r   r   r   r   rJ   rA   r-   rW   r+   rI   r}   r   rX   r3   r   r   rK   r	   rL   rM   rN   r,   r   ro   r   r   r   r   r4     s:   
"
0zAttMultiHeadAdd.forwardrP   r5   r   r   r   r   r     s
    r   c                       s6   e Zd ZdZ	d
 fdd	Zdd Zddd	Z  ZS )AttMultiHeadLoca  Multi head location based attention

    Reference: Attention is all you need
        (https://arxiv.org/abs/1706.03762)

    This attention is multi head attention using location-aware attention for each head.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int att_dim_k: dimension k in multi head attention
    :param int att_dim_v: dimension v in multi head attention
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode: flag to swith on mode of hierarchical attention
        and not store pre_compute_k and pre_compute_v
    Fc	           
   
      s  t t|   tj | _tj | _tj | _tj | _	tj | _
tj | _tj|D ]b}	|  jtj||g7  _|  jtjj||ddg7  _|  jtjj||ddg7  _|  j	tj|dg7  _	|  j
tjjd|dd| d fd|fddg7  _
|  jtjj||ddg7  _q1tjj|| |dd| _|| _|| _|| _|| _|| _dt| | _d | _d | _d | _d | _d | _|| _d S NFrU   r   rH   r   r\   r#   ) r   r   r   r+   r=   r|   r}   r~   r   rW   r`   r^   r   r   r   r>   r_   r   rA   rB   r   r   r   r   r   rN   r   r   r   r   r3   rD   )
r   rB   rA   r   r   r   rb   rc   rD   r   r   r   r   r     sF    	
zAttMultiHeadLoc.__init__c                 C   r   r    r   r   r   r   r   r!   F  r   zAttMultiHeadLoc.resetrG   c              
      s.  | d} jdu s jr&| _ j d _ fddtj jD  _ j	du s. jrG| _ j d _ fddtj jD  _	|du rS|
| j}n|| j}|du rg }tj jD ]}dt|  }|t||||d g7 }qgg }	g }
tj jD ]{} j| || |dd j}|d	dd	} j| |} j| t j| |  j| ||d j d	} jdu rt|t| _| jtd
  |
tj|| ddg7 }
|	tj j	| |
| | jd ddg7 }	q  tj!|	dd}	|	|
fS )am  AttMultiHeadLoc forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param torch.Tensor att_prev:
            list of previous attention weight (B x T_max) * aheads
        :param float scaling: scaling parameter before applying softmax
        :return: attention weighted encoder state (B x D_enc)
        :rtype: torch.Tensor
        :return: list of previous attention weight (B x T_max) * aheads
        :rtype: list
        r   Nr   c                    r   r   r   r   r   r   r   r   c  r   z+AttMultiHeadLoc.forward.<locals>.<listcomp>c                    r   r   r   r   r   r   r   r   i  r   r#   r$   rH   r   r%   )"r   r   rD   r   r   r   r   r   r   r   rJ   rA   r-   r   r	   r   r(   r)   r`   rX   rf   r^   rW   r+   rI   r}   r   r3   rK   rL   rM   r,   r   ro   )r   r.   r/   r0   r1   rN   r2   r   r3   r   rO   r   rg   r
   r   r   r   r4   N  sR   
" 
	0zAttMultiHeadLoc.forwardrP   rQ   r5   r   r   r   r   r     s    /r   c                       s4   e Zd ZdZ	d	 fdd	Zdd Zdd Z  ZS )
AttMultiHeadMultiResLoca  Multi head multi resolution location based attention

    Reference: Attention is all you need
        (https://arxiv.org/abs/1706.03762)

    This attention is multi head attention using location-aware attention for each head.
    Furthermore, it uses different filter size for each head.

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int att_dim_k: dimension k in multi head attention
    :param int att_dim_v: dimension v in multi head attention
    :param int aconv_chans: maximum # channels of attention convolution
        each head use #ch = aconv_chans * (head + 1) / aheads
        e.g. aheads=4, aconv_chans=100 => filter size = 25, 50, 75, 100
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode: flag to swith on mode of hierarchical attention
        and not store pre_compute_k and pre_compute_v
    Fc	              
      s  t t|   tj | _tj | _tj | _tj | _	tj | _
tj | _tj|D ]j}	|  jtj||g7  _|  jtjj||ddg7  _|  jtjj||ddg7  _|  j	tj|dg7  _	||	d  | }
|  j
tjjd|dd|
 d fd|
fddg7  _
|  jtjj||ddg7  _q1tjj|| |dd| _|| _|| _|| _|| _|| _dt| | _d | _d | _d | _d | _d | _|| _d S r   ) r   r   r   r+   r=   r|   r}   r~   r   rW   r`   r^   r   r   r   r>   r_   r   rA   rB   r   r   r   r   r   rN   r   r   r   r   r3   rD   )r   rB   rA   r   r   r   rb   rc   rD   r   afiltsr   r   r   r     s@    
z AttMultiHeadMultiResLoc.__init__c                 C   r   r    r   r   r   r   r   r!     r   zAttMultiHeadMultiResLoc.resetc              
      s0  | d} jdu s jr&| _ j d _ fddtj jD  _ j	du s. jrG| _ j d _ fddtj jD  _	|du rS|
| j}n|| j}|du rg }tj jD ]}dt|  }|t||||d g7 }qgg }g }	tj jD ]|}
 j|
 ||
 |dd j}|d	dd	} j|
 |} j|
 t j|
 |  j|
 ||d j d	} jdu rt|t| _| jtd
  |	tj j| ddg7 }	|tj  j	|
 |	|
 | jd ddg7 }q !tj"|dd}||	fS )a-  AttMultiHeadMultiResLoc forward

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param torch.Tensor att_prev: list of previous attention weight
            (B x T_max) * aheads
        :return: attention weighted encoder state (B x D_enc)
        :rtype: torch.Tensor
        :return: list of previous attention weight (B x T_max) * aheads
        :rtype: list
        r   Nr   c                    r   r   r   r   r   r   r   r     r   z3AttMultiHeadMultiResLoc.forward.<locals>.<listcomp>c                    r   r   r   r   r   r   r   r     r   r#   r$   rH   r   r%   )#r   r   rD   r   r   r   r   r   r   r   rJ   rA   r-   r   r	   r   r(   r)   r`   rX   rf   r^   rW   r+   rI   r}   r   r3   rK   rL   rM   rN   r,   r   ro   )r   r.   r/   r0   r1   r2   r   r3   r   rO   r   rg   r
   r   r   r   r4     sR   
" 
	0zAttMultiHeadMultiResLoc.forwardrP   r5   r   r   r   r   r     s    ,r   c                       :   e Zd ZdZ fddZdd Z					dd
dZ  ZS )
AttForwarda  Forward attention module.

    Reference:
    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
        (https://arxiv.org/pdf/1807.06736.pdf)

    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    c                    s   t t|   tj||| _tjj||dd| _tjj||dd| _tjj	d|dd| d fd|fdd| _
tj|d| _|| _|| _|| _d | _d | _d | _d | _d S r[   )r   r   r   r+   r=   r>   r?   r@   r^   r_   r`   rW   rA   rB   rC   r   r   r   r3   )r   rB   rA   rC   rb   rc   r   r   r   r   7  s&   
zAttForward.__init__c                 C   r   r    rF   r   r   r   r   r!   L  r"   zAttForward.resetr#   Nr   r   c	                 C   s  t |}	| jdu r|| _| jd| _| | j| _|du r&||	| j}n||	| j}|du rD|j| dd  }d|dddf< | 	||	dd| j}
|

ddd}
| |
}
| |d}| t| j| |
 
d}| jdu rt|t|| _|| jtd  |durt||||}tj|| dd}t|ddddd	f }|| | }tjt|d
ddd}tj| j|d	 dd}||fS )aD  Calculate AttForward forward propagation.

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B x T_max x D_enc)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B x D_dec)
        :param torch.Tensor att_prev: attention weights of previous step
        :param float scaling: scaling parameter before applying softmax
        :param int last_attended_idx: index of the inputs of the last attended
        :param int backward_window: backward window size in attention constraint
        :param int forward_window: forward window size in attetion constraint
        :return: attention weighted encoder state (B, D_enc)
        :rtype: torch.Tensor
        :return: previous attention weights (B x T_max)
        :rtype: torch.Tensor
        Nr   rH   r#   r   r   r%   r   r   r$   ư>pr&   )r'   r   r   r   r   r?   rJ   rA   r-   r`   rX   rf   r^   r@   r)   rW   r+   rI   r3   r   r   rK   r	   r   rL   rM   pad	normalizeclampr,   )r   r.   r/   r0   r1   rN   r   r   r   r2   rg   rY   r
   rO   att_prev_shiftr   r   r   r   r4   S  s6   

 
zAttForward.forwardr#   Nr   r   r5   r   r   r   r   r   )  s    r   c                       r   )AttForwardTAa  Forward attention with transition agent module.

    Reference:
    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
        (https://arxiv.org/pdf/1807.06736.pdf)

    :param int eunits: # units of encoder
    :param int dunits: # units of decoder
    :param int att_dim: attention dimension
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param int odim: output dimension
    c                    s   t t|   tj||| _tjj||dd| _tj|| | d| _tjj||dd| _	tjj
d|dd| d fd|fdd| _tj|d| _|| _|| _|| _d | _d | _d | _d | _d| _d S )NFrU   r   rH   r   r\         ?)r   r   r   r+   r=   r>   r?   r@   mlp_tar^   r_   r`   rW   rA   eunitsrC   r   r   r   r3   trans_agent_prob)r   r   rA   rC   rb   rc   odimr   r   r   r     s*   
zAttForwardTA.__init__c                 C   s"   d | _ d | _d | _d | _d| _d S )Nr   )r   r   r   r3   r   r   r   r   r   r!     s
   
zAttForwardTA.resetr#   Nr   r   c
                 C   s  t |}
| jdu r|| _| jd| _| | j| _|du r&||
| j}n||
| j}|du rD|j| dd  }d|dddf< | 	||
dd| j}|
ddd}| |}| ||
d| j}| t|| j | 
d}| jdu rt|t|| _|| jtd  |durt||||	}tj|| dd}t|ddddd	f }| j| d| j |  | }tjt|d
ddd}tj| j||
| jd dd}t|  tj!|||gdd| _||fS )a  Calculate AttForwardTA forward propagation.

        :param torch.Tensor enc_hs_pad: padded encoder hidden state (B, Tmax, eunits)
        :param list enc_hs_len: padded encoder hidden state length (B)
        :param torch.Tensor dec_z: decoder hidden state (B, dunits)
        :param torch.Tensor att_prev: attention weights of previous step
        :param torch.Tensor out_prev: decoder outputs of previous step (B, odim)
        :param float scaling: scaling parameter before applying softmax
        :param int last_attended_idx: index of the inputs of the last attended
        :param int backward_window: backward window size in attention constraint
        :param int forward_window: forward window size in attetion constraint
        :return: attention weighted encoder state (B, dunits)
        :rtype: torch.Tensor
        :return: previous attention weights (B, Tmax)
        :rtype: torch.Tensor
        Nr   rH   r#   r   r   r%   r   r$   r   r   )"r'   r   r   r   r   r?   rJ   rA   r-   r`   rX   rf   r^   r@   rC   rW   r+   rI   r3   r   r   rK   r	   r   rL   rM   r   r   r   r   r,   sigmoidr   ro   )r   r.   r/   r0   r1   out_prevrN   r   r   r   r2   rg   rY   r
   rO   r   r   r   r   r   r4     s8   

 
 "zAttForwardTA.forwardr   r5   r   r   r   r   r     s    r   Fc                 C   s,  t j }t| dd}t| dd}t| dd}t| dd}t| dd}|dkrDt|D ]}	t| j| j| j|| j	|||}
|
|
 q+|S |dkr|rat| j| j| j| j| j| j| j| jdd		}
|
S t j }t|D ]"}t| j| | j| j|| | j	| || || || }
|
|
 qj|S td
|)af  Instantiates an attention module given the program arguments

    :param Namespace args: The arguments
    :param int num_att: number of attention modules
        (in multi-speaker case, it can be 2 or more)
    :param bool han_mode: switch on/off mode of hierarchical attention network (HAN)
    :rtype torch.nn.Module
    :return: The attention module
    num_encsr   r   Nawinrb   rc   T)rD   z0Number of encoders needs to be more than one. {})r+   r=   r|   getattrr   initial_attatyperB   rA   adimappendhan_type	han_headshan_dimhan_winhan_conv_chanshan_conv_filts
ValueErrorformat)argsnum_attrD   att_listr   r   r   rb   rc   iattidxr   r   r   att_for-  sb   




r   c	           
   	   C   sZ  | dkr	t  }	|	S | dkrt||||}	|	S | dkr#t||||}	|	S | dkr2t||||||}	|	S | dkrBt|||||||}	|	S | dkrQt||||||}	|	S | dkr^t||||}	|	S | dkrmt||||||}	|	S | d	kr|t||||||}	|	S | d
krt	||||||}	|	S | dkrt
||||||||}	|	S | dkrt||||||||}	|	S )a(  Instantiates a single attention module

    :param str atype: attention type
    :param int eprojs: # projection-units of encoder
    :param int dunits: # units of decoder
    :param int aheads: # heads of multi head attention
    :param int adim: attention dimension
    :param int awin: attention window size
    :param int aconv_chans: # channels of attention convolution
    :param int aconv_filts: filter size of attention convolution
    :param bool han_mode: flag to swith on mode of hierarchical attention
    :return: The attention module
    noattdotaddlocation
location2dlocation_recurrentcoveragecoverage_locationmulti_head_dotmulti_head_addmulti_head_locmulti_head_multi_res_loc)r   r<   rS   rZ   rl   rp   rh   ry   r{   r   r   r   )
r   rB   rA   r   r   r   rb   rc   rD   r   r   r   r   r   l  sP   	r   c                    s   t |trtjdd | D dd  } | S t |ttfr2tjdd t| D dd  } | S t |t	rItjdd | D dd  } | S t |t
tttfrt| d }g }tj|D ] tj fdd| D dd}||g7 }q`tj|dd  } | S tj| dd  } | S )	zConverts attention weights to a numpy array given the attention

    :param list att_ws: The attention weights
    :param torch.nn.Module att: The attention
    :rtype: np.ndarray
    :return: The numpy array of the attention weights
    c                 S   s   g | ]
}|d d df qS )Nr$   r   r   awr   r   r   r     r   z att_to_numpy.<locals>.<listcomp>r   r%   c                 S   s   g | ]\}}|| qS r   r   )r   r   r   r   r   r   r     s    c                 S   s   g | ]}|d  qS )r   r   r   r   r   r   r         r   c                    s   g | ]}|  qS r   r   r   r   r   r   r     r   )
isinstancerl   r+   stackcpunumpyrh   ry   	enumeraterp   r{   r   r   r   r'   r   r   r   )att_wsr   n_headsatt_ws_sorted_by_headatt_ws_headr   r   r   att_to_numpy  s,   
	 $
 
r   )r   r   )r   FrP   )r9   r   r   r+   torch.nn.functionalr=   
functionalrL   *funasr.models.transformer.utils.nets_utilsr   r   r   Moduler   r<   rS   rZ   rh   rl   rp   ry   r{   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s:    
.LO}`uynjl  ~ 
@
0