o
    ߥi.                     @   sd   d Z ddlZddlmZ G dd dejZG dd dejZG dd dejZG d	d
 d
ejZdS )zM This implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    Nc                       (   e Zd ZdZ fddZdd Z  ZS )TAPzG
    Temporal average pooling, only first-order mean is considered
    c                       t t|   d S N)superr   __init__selfkwargs	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/pooling_layers.pyr         zTAP.__init__c                 C   s   |j dd}|jdd}|S )Ndim   	start_dim)meanflatten)r	   xpooling_meanr   r   r   forward   s   zTAP.forward__name__
__module____qualname____doc__r   r   __classcell__r   r   r   r   r          r   c                       r   )TSDPzR
    Temporal standard deviation pooling, only second-order std is considered
    c                    r   r   )r   r"   r   r   r   r   r   r      r   zTSDP.__init__c                 C   s(   t t j|ddd }|jdd}|S Nr   r   g:0yE>r   r   )torchsqrtvarr   )r	   r   pooling_stdr   r   r   r      s   zTSDP.forwardr   r   r   r   r   r"      r!   r"   c                       r   )TSTPz
    Temporal statistics pooling, concatenate mean and std, which is used in
    x-vector
    Comment: simple concatenation can not make full use of both statistics
    c                    r   r   )r   r(   r   r   r   r   r   r   -   r   zTSTP.__init__c                 C   sP   |j dd}ttj|ddd }|jdd}|jdd}t||fd}|S r#   )r   r$   r%   r&   r   cat)r	   r   r   r'   statsr   r   r   r   0   s   zTSTP.forwardr   r   r   r   r   r(   &   s    r(   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	ASTPzx Attentive statistics pooling: Channel- and context-dependent
        statistics pooling, first used in ECAPA_TDNN.
       Fc                    sX   t t|   || _|rtj|d |dd| _n	tj||dd| _tj||dd| _d S )N   r   )kernel_size)r   r+   r   global_context_attnnConv1dlinear1linear2)r	   in_dimbottleneck_dimr/   r   r   r   r   @   s   
zASTP.__init__c           	      C   s  t |jdkr||jd |jd |jd  |jd }t |jdks%J | jrNtj|ddd|}ttj|dddd	 |}tj	|||fdd
}n|}t
| |}tj| |dd
}tj|| dd
}tj||d  dd
|d  }t|jd	d}tj	||gdd
S )z
        x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
            or a 4-dimensional tensor in resnet architecture (B,C,F,T)
            0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
           r   r      r-   r   T)r   keepdimg|=r   )min)lenshapereshaper/   r$   r   	expand_asr%   r&   r)   tanhr2   softmaxr3   sumclamp)	r	   r   context_meancontext_stdx_inalphar   r&   stdr   r   r   r   R   s(   *zASTP.forward)r,   Fr   r   r   r   r   r+   ;   s    r+   )	r   r$   torch.nnr0   Moduler   r"   r(   r+   r   r   r   r   <module>   s   