o
    i                     @   sp   d Z ddlZddlmZ ddlmZ G dd dejZG dd dejZG dd	 d	ejZ	G d
d dejZ
dS )zL This implementation is adapted from https://github.com/wenet-e2e/wespeaker.    N)make_pad_maskc                       (   e Zd ZdZ fddZdd Z  ZS )TAPzG
    Temporal average pooling, only first-order mean is considered
    c                       t t|   d S N)superr   __init__selfkwargs	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/whisper_lid/eres2net/pooling_layers.pyr         zTAP.__init__c                 C   s   |j dd}|jdd}|S )Ndim   	start_dim)meanflatten)r
   xpooling_meanr   r   r   forward   s   zTAP.forward__name__
__module____qualname____doc__r   r   __classcell__r   r   r   r   r          r   c                       r   )TSDPzR
    Temporal standard deviation pooling, only second-order std is considered
    c                    r   r   )r   r#   r   r	   r   r   r   r       r   zTSDP.__init__c                 C   s(   t t j|ddd }|jdd}|S )Nr   r   :0yE>r   r   )torchsqrtvarr   )r
   r   pooling_stdr   r   r   r   #   s   zTSDP.forwardr   r   r   r   r   r#      r"   r#   c                       r   )TSTPz
    Temporal statistics pooling, concatenate mean and std, which is used in
    x-vector
    Comment: simple concatenation can not make full use of both statistics
    c                    r   r   )r   r)   r   r	   r   r   r   r   1   r   zTSTP.__init__c                 C   s   t ||jd dd d d d d d f  |j}|| }tj|dd}tj|dd}|| }||d d | d| }|}	t|d }
|	jdd}	|
jdd}
t	|	|
fd}|S )Nr   )maxlen)axis   r$   r   r   )
r   shapetodevicer%   sum	unsqueezer&   r   cat)r
   r   olensmasksx_maskedsum_without_paddingcount_without_paddingmean_without_paddingvar_without_paddingr   r(   statsr   r   r   r   4   s    0zTSTP.forwardr   r   r   r   r   r)   *   s    r)   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	ASTPzsAttentive statistics pooling: Channel- and context-dependent
    statistics pooling, first used in ECAPA_TDNN.
       Fc                    sX   t t|   || _|rtj|d |dd| _n	tj||dd| _tj||dd| _d S )N   r   )kernel_size)r   r;   r   global_context_attnnConv1dlinear1linear2)r
   in_dimbottleneck_dimr?   r   r   r   r   N   s   

zASTP.__init__c           	      C   s  t |jdkr||jd |jd |jd  |jd }t |jdks%J | jrNtj|ddd|}ttj|dddd	 |}tj	|||fdd
}n|}t
| |}tj| |dd
}tj|| dd
}tj||d  dd
|d  }t|jd	d}tj	||gdd
S )z
        x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
            or a 4-dimensional tensor in resnet architecture (B,C,F,T)
            0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
           r   r   r,   r=   r   T)r   keepdimg|=r   )min)lenr-   reshaper?   r%   r   	expand_asr&   r'   r2   tanhrB   softmaxrC   r0   clamp)	r
   r   context_meancontext_stdx_inalphar   r'   stdr   r   r   r   `   s   * zASTP.forward)r<   Fr   r   r   r   r   r;   I   s    r;   )r    r%   torch.nnr@   *funasr.models.transformer.utils.nets_utilsr   Moduler   r#   r)   r;   r   r   r   r   <module>   s   