o
    ´©iH  ã                   @   sþ   d Z ddlZddlZddlm  mZ ddlmZ dd„ ZG dd„ dejj	ƒZ
G dd	„ d	e
ƒZG d
d„ dejj	ƒZG dd„ de
ƒZG dd„ dejj	ƒZG dd„ dejj	ƒZG dd„ dejj	ƒZG dd„ dejj	ƒZG dd„ dejj	ƒZG dd„ dejj	ƒZdS )zPositional Encoding Module.é    N)Úeinsumc                 C   s"   |d }|| v r|   |¡ dS dS )zîPerform pre-hook in load_state_dict for backward compatibility.

    Note:
        We saved self.pe until v.0.5.2 but we have omitted it later.
        Therefore, we remove the item "pe" from `state_dict` for backward compatibility.

    ÚpeN)Úpop)Ú
state_dictÚprefixÚlocal_metadataÚstrictÚmissing_keysÚunexpected_keysÚ
error_msgsÚk© r   úW/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/transformer/embedding.pyÚ	_pre_hook   s   ÿr   c                       s:   e Zd ZdZd‡ fdd„	Zdd„ Zdejfd	d
„Z‡  Z	S )ÚPositionalEncodingaa  Positional encoding.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
        reverse (bool): Whether to reverse the input position. Only for
        the class LegacyRelPositionalEncoding. We remove it in the current
        class RelPositionalEncoding.
    éˆ  Fc                    sd   t t| ƒ ¡  || _|| _t | j¡| _tj	j
|d| _d| _|  t d¡ d|¡¡ |  t¡ dS ©z'Construct an PositionalEncoding object.©ÚpNç        é   )Úsuperr   Ú__init__Úd_modelÚreverseÚmathÚsqrtÚxscaleÚtorchÚnnÚDropoutÚdropoutr   Ú	extend_peÚtensorÚexpandÚ"_register_load_state_dict_pre_hookr   )Úselfr   Údropout_rateÚmax_lenr   ©Ú	__class__r   r   r   0   s   zPositionalEncoding.__init__c                 C   sD  | j dur+| j  d¡| d¡kr+| j j|jks| j j|jkr)| j j|j|jd| _ dS t | d¡| j¡}| jrKtj	| d¡d ddtj
d d¡}ntj	d| d¡tj
d d¡}t tj	d| jdtj
dt d	¡| j   ¡}t || ¡|dd…ddd…f< t || ¡|dd…ddd…f< | d¡}|j|j|jd
| _ dS )úReset the positional encodings.Nr   ©ÚdtypeÚdeviceéÿÿÿÿg      ð¿©r-   r   é   ç     ˆÃ@©r.   r-   )r   Úsizer-   r.   Útor   Úzerosr   r   ÚarangeÚfloat32Ú	unsqueezeÚexpr   ÚlogÚsinÚcos)r&   Úxr   ÚpositionÚdiv_termr   r   r   r"   ;   s$   
&ÿÿ  
zPositionalEncoding.extend_per>   c                 C   s:   |   |¡ || j | jdd…d| d¡…f  }|  |¡S )ú¾Add positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
        Nr   ©r"   r   r   r4   r!   ©r&   r>   r   r   r   ÚforwardP   s   
	&
zPositionalEncoding.forward)r   F©
Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r"   r   ÚTensorrD   Ú__classcell__r   r   r)   r   r   $   s
    r   c                       s2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
ÚScaledPositionalEncodingzìScaled positional encoding module.

    See Sec. 3.2  https://arxiv.org/abs/1809.08895

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    r   c                    s*   t ƒ j|||d tj t d¡¡| _dS )úInitialize class.)r   r'   r(   ç      ð?N)r   r   r   r   Ú	Parameterr#   Úalpha©r&   r   r'   r(   r)   r   r   r   j   s   z!ScaledPositionalEncoding.__init__c                 C   s   t  d¡| j_dS )zReset parameters.rN   N)r   r#   rP   Údata©r&   r   r   r   Úreset_parameterso   s   z)ScaledPositionalEncoding.reset_parametersc                 C   s:   |   |¡ || j| jdd…d| d¡…f   }|  |¡S )ú¿Add positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).

        Nr   )r"   rP   r   r4   r!   rC   r   r   r   rD   s   s   

&
z ScaledPositionalEncoding.forward©r   )rF   rG   rH   rI   r   rT   rD   rK   r   r   r)   r   rL   ^   s
    rL   c                       sL   e Zd ZdZ					d‡ fdd„	Zd	d
„ Zdd„ Zdejfdd„Z	‡  Z
S )ÚLearnableFourierPosEncaV  Learnable Fourier Features for Positional Encoding.

    See https://arxiv.org/pdf/2106.02795.pdf

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
        gamma (float): init parameter for the positional kernel variance
            see https://arxiv.org/pdf/2106.02795.pdf.
        apply_scaling (bool): Whether to scale the input before adding the pos encoding.
        hidden_dim (int): if not None, we modulate the pos encodings with
            an MLP whose hidden layer has hidden_dim neurons.
    r   r   rN   FNc                    sØ   t t| ƒ ¡  || _|rt | j¡| _nd| _tj 	|¡| _
|| _|| _| jdu r/| jd | _|d dks9J dƒ‚tj t d|d ¡¡| _|  ¡  || _| jdurjtj tj ||¡tj ¡ tj ||¡¡| _dS dS )rM   rN   Nr1   r   z>d_model should be divisible by two in order to use this layer.r   )r   rW   r   r   r   r   r   r   r   r    r!   r(   ÚgammarO   ÚemptyÚw_rÚ_resetÚ
hidden_dimÚ
SequentialÚLinearÚGELUÚmlp)r&   r   r'   r(   rX   Úapply_scalingr\   r)   r   r   r   ’   s*   



ýÿzLearnableFourierPosEnc.__init__c                 C   s,   t  ddt | j¡ d| jd f¡| j_d S )Nr   r   r1   )r   Únormalr   r   rX   r   rZ   rR   rS   r   r   r   r[   ¸   s   ,zLearnableFourierPosEnc._resetc                 C   s   t jd| d¡t jd d¡ |¡}t  t  || j¡¡}t  	t  || j¡¡}t  
||fd¡}|t | j¡ }| jdu r@| d¡S |  | d¡¡S )r+   r   r   r0   r/   N)r   r7   r4   r8   r9   r5   r=   ÚmatmulrZ   r<   Úcatr   r   r   r\   r`   )r&   r>   Ú
position_vÚcosineÚsineÚpos_encr   r   r   r"   »   s   $

z LearnableFourierPosEnc.extend_per>   c                 C   s"   |   |¡}|| j | }|  |¡S )rA   )r"   r   r!   )r&   r>   r   r   r   r   rD   É   s   
	
zLearnableFourierPosEnc.forward)r   r   rN   FN)rF   rG   rH   rI   r   r[   r"   r   rJ   rD   rK   r   r   r)   r   rW   ‚   s    ù&rW   c                       s*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )ÚLegacyRelPositionalEncodingaK  Relative positional encoding module (old version).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    r   c                    s   t ƒ j|||dd dS )rM   T)r   r'   r(   r   N)r   r   rQ   r)   r   r   r   å   s   
üz$LegacyRelPositionalEncoding.__init__c                 C   sD   |   |¡ || j }| jdd…d| d¡…f }|  |¡|  |¡fS )a	  Compute positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding tensor (1, time, `*`).

        Nr   rB   ©r&   r>   Úpos_embr   r   r   rD   î   s   

z#LegacyRelPositionalEncoding.forwardrV   )rF   rG   rH   rI   r   rD   rK   r   r   r)   r   ri   ×   s    	ri   c                       s:   e Zd ZdZd
‡ fdd„	Zdd„ Zdejfdd	„Z‡  Z	S )ÚRelPositionalEncodingaR  Relative positional encoding module (new implementation).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    r   c                    sT   t t| ƒ ¡  || _t | j¡| _tjj	|d| _
d| _|  t d¡ d|¡¡ dS r   )r   rl   r   r   r   r   r   r   r   r    r!   r   r"   r#   r$   rQ   r)   r   r   r     s   zRelPositionalEncoding.__init__c                 C   sª  | j dur/| j  d¡| d¡d d kr/| j j|jks"| j j|jkr-| j j|j|jd| _ dS t | d¡| j¡}t | d¡| j¡}tjd| d¡tj	d 
d¡}t tjd| jdtj	dt d¡| j   ¡}t || ¡|dd…ddd…f< t || ¡|dd…ddd…f< t d| | ¡|dd…ddd…f< t d| | ¡|dd…ddd…f< t |dg¡ 
d¡}|dd…  
d¡}tj||gdd	}|j|j|jd
| _ dS )r+   Nr   r1   r,   r   r0   r2   r/   ©Údimr3   )r   r4   r-   r.   r5   r   r6   r   r7   r8   r9   r:   r   r;   r<   r=   Úfliprd   )r&   r>   Úpe_positiveÚpe_negativer?   r@   r   r   r   r   r"     s*   
ÿÿ  $$zRelPositionalEncoding.extend_per>   c                 C   sn   |   |¡ || j }| jdd…| j d¡d | d¡ d | j d¡d | d¡ …f }|  |¡|  |¡fS )rU   Nr   r1   rB   rj   r   r   r   rD   6  s   


6ÿÿzRelPositionalEncoding.forwardrV   rE   r   r   r)   r   rl   ÿ   s
    	 rl   c                       s@   e Zd ZdZd‡ fdd„	Zdd„ Zddejd	efd
d„Z	‡  Z
S )ÚStreamPositionalEncodingz´Streaming Positional encoding.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    r   c                    sx   t t| ƒ ¡  || _t | j¡| _tjj	|d| _
d| _t d¡ d|¡| _|  | j d¡| jj| jj¡ |  t¡ dS r   )r   rr   r   r   r   r   r   r   r   r    r!   r   r#   r$   Útmpr"   r4   r.   r-   r%   r   rQ   r)   r   r   r   S  s   z!StreamPositionalEncoding.__init__c                 C   sú   | j dur$| j  d¡|kr$| j j|ks| j j|kr"| j j||d| _ dS t || j¡}tjd|tj	d 
d¡}t tjd| jdtj	dt d¡| j   ¡}t || ¡|dd…ddd…f< t || ¡|dd…ddd…f< | 
d¡}|j||d| _ dS )	r+   Nr   r,   r   r0   r1   r2   r3   )r   r4   r-   r.   r5   r   r6   r   r7   r8   r9   r:   r   r;   r<   r=   )r&   Úlengthr.   r-   r   r?   r@   r   r   r   r"   ^  s    
ÿÿ  
z"StreamPositionalEncoding.extend_per   r>   Ú	start_idxc                 C   sP   |   | d¡| |j|j¡ || j | jdd…||| d¡ …f  }|  |¡S )rU   r   N)r"   r4   r.   r-   r   r   r!   )r&   r>   ru   r   r   r   rD   p  s   
*
z StreamPositionalEncoding.forwardrV   ©r   )rF   rG   rH   rI   r   r"   r   rJ   ÚintrD   rK   r   r   r)   r   rr   I  s
    	rr   c                   @   sF   e Zd ZdZddd„Zddejfdejded	ej	fd
d„Z
dd„ ZdS )ÚSinusoidalPositionEncoderú éP   çš™™™™™¹?c                 C   ó   d S ©Nr   ©r&   r   r'   r   r   r   Ú__int__‚  ó   z!SinusoidalPositionEncoder.__int__NÚ	positionsÚdepthr-   c           
      C   sº   |  d¡}| |¡}|j}t tjdg||d¡|d d  }t tj|d |d |¡|  ¡}t ||dg¡}t |g d¢¡t |g d	¢¡ }tj	t 
|¡t |¡gdd
}	|	 |¡S )Nr   é'  r,   r1   r   ©r.   r/   ©r   r/   r   ©r   r   r/   rm   )r4   Útyper.   r   r;   r#   r:   r7   Úreshaperd   r<   r=   )
r&   r   r‚   r-   Ú
batch_sizer.   Úlog_timescale_incrementÚinv_timescalesÚscaled_timeÚencodingr   r   r   Úencode…  s   


ÿÿÿ
z SinusoidalPositionEncoder.encodec                 C   sP   |  ¡ \}}}tjd|d |jdd d d …f }|  |||j¡ |j¡}|| S )Nr   r„   )r4   r   r7   r.   rŽ   r-   r5   )r&   r>   r‰   Ú	timestepsÚ	input_dimr   Úposition_encodingr   r   r   rD   ˜  s   "z!SinusoidalPositionEncoder.forward©rz   r{   ©rF   rG   rH   rI   r   r   r8   rJ   rw   r-   rŽ   rD   r   r   r   r   rx     s    
ÿÿÿ
ÿrx   c                   @   sH   e Zd ZdZddd„Zddejfdejded	ej	fd
d„Z
ddd„ZdS )ÚStreamSinusoidalPositionEncoderry   rz   r{   c                 C   r|   r}   r   r~   r   r   r   r   £  r€   z'StreamSinusoidalPositionEncoder.__int__Nr   r‚   r-   c           	      C   s®   |  d¡}| |¡}t tjdg|d¡|d d  }t t |d ¡ |¡|  ¡}t ||dg¡}t |g d¢¡t |g d¢¡ }tjt 	|¡t 
|¡gdd	}| |¡S )
Nr   rƒ   r0   r1   r   r/   r…   r†   rm   )r4   r‡   r   r;   r#   r:   r7   rˆ   rd   r<   r=   )	r&   r   r‚   r-   r‰   rŠ   r‹   rŒ   r   r   r   r   rŽ   ¦  s   

" ÿ
z&StreamSinusoidalPositionEncoder.encodec           	      C   s†   |  ¡ \}}}d}|d ur|d }|d  |7  < t d|| d ¡d d d …f }|  |||j¡ |j¡}||d d …||| …f  S )Nr   ru   r   )r4   r   r7   rŽ   r-   r5   r.   )	r&   r>   Úcacher‰   r   r   ru   r   r‘   r   r   r   rD   ´  s    z'StreamSinusoidalPositionEncoder.forwardr’   r}   r“   r   r   r   r   r”      s    
ÿÿÿ
ÿr”   c                	       sj   e Zd ZdZddedededdf‡ fd	d
„Zddejdeddfdd„Z	ddejdedejfdd„Z
‡  ZS )ÚStreamingRelPositionalEncodingz’Relative positional encoding.
    Args:
        size: Module size.
        max_len: Maximum input length.
        dropout_rate: Dropout rate.
    r   r   r4   r'   r(   ÚreturnNc                    sL   t ƒ  ¡  || _d| _tjj|d| _|  t 	d¡ 
d|¡¡ |  t¡ dS )z.Construct a RelativePositionalEncoding object.Nr   r   r   )r   r   r4   r   r   r   r    r!   r"   r#   r$   r%   r   )r&   r4   r'   r(   r)   r   r   r   Ç  s   
z'StreamingRelPositionalEncoding.__init__r   r>   Úleft_contextc                 C   sœ  |  d¡| }| jdur3| j  d¡|d d kr3| jj|jks&| jj|jkr1| jj|j|jd| _dS t || j ¡}t || j ¡}tjd|tjd 	d¡}t 
tjd| j dtjdt d¡| j    ¡}t || ¡|dd…ddd…f< t || ¡|dd…ddd…f< t |dg¡ 	d¡}t d| | ¡|dd…ddd…f< t d| | ¡|dd…ddd…f< |dd…  	d¡}tj||gdd	j|j|jd
| _dS )z—Reset positional encoding.
        Args:
            x: Input sequences. (B, T, ?)
            left_context: Number of frames in left context.
        r   Nr1   r3   r   r0   r2   r/   rm   r,   )r4   r   r-   r.   r5   r   r6   r7   r8   r9   r:   r   r;   r<   r=   ro   rd   )r&   r>   r˜   Útime1rp   rq   r?   r@   r   r   r   r"   Ó  s&   
&ÿ  $$&z(StreamingRelPositionalEncoding.extend_pec                 C   sj   | j ||d | d¡| }| jdd…| j d¡d | d | j d¡d | d¡ …f }|  |¡}|S )zóCompute positional encoding.
        Args:
            x: Input sequences. (B, T, ?)
            left_context: Number of frames in left context.
        Returns:
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), ?)
        )r˜   r   Nr1   )r"   r4   r   r!   )r&   r>   r˜   r™   rh   r   r   r   rD   ó  s
   @
z&StreamingRelPositionalEncoding.forward)r   r   rv   )rF   rG   rH   rI   rw   Úfloatr   r   rJ   r"   rD   rK   r   r   r)   r   r–   ¿  s
     $ r–   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚScaledSinuEmbeddingc                    sL   t ƒ  ¡  tj t d¡¡| _ddt d|d¡ ¡ |   }|  	d|¡ d S )Nr   rN   rƒ   r   r1   Úinv_freq)
r   r   r   r   rO   ÚonesÚscaler7   rš   Úregister_buffer)r&   rn   rœ   r)   r   r   r     s   
ÿÿzScaledSinuEmbedding.__init__c                 C   sZ   |j d |j}}tj||d | j¡}td|| jƒ}tj| ¡ | 	¡ fdd}|| j
 S )Nr   r„   zi , j -> i jr/   rm   )Úshaper.   r   r7   Útype_asrœ   r   rd   r<   r=   rž   )r&   r>   Únr.   ÚtÚsinuÚembr   r   r   rD     s
   
zScaledSinuEmbedding.forward)rF   rG   rH   r   rD   rK   r   r   r)   r   r›     s    
r›   )rI   r   r   Útorch.nn.functionalr   Ú
functionalÚFr   r   ÚModuler   rL   rW   ri   rl   rr   rx   r”   r–   r›   r   r   r   r   Ú<module>   s    :$U(J6!F