o
    iL                     @   s
  d dl Z d dlZd dlmZ d dlm  mZ G dd dejZG dd dejZ	de
de
de
d	e
fd
dZG dd de	ZG dd deZdddZG dd dejZG dd dejjZG dd dejZG dd dejZG dd dejZG dd dejjZdS )    Nc                       s6   e Zd Z								d
 fdd	Zdd	 Z  ZS )_BatchNorm1dNh㈵>皙?TFc	           	         sV   t    || _|| _|d u r|r|d }n|d u r|d }tj|||||d| _d S )N   )epsmomentumaffinetrack_running_stats)super__init__combine_batch_timeskip_transposennBatchNorm1dnorm)	selfinput_shape
input_sizer   r   r	   r
   r   r   	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sond/encoder/ecapa_tdnn_encoder.pyr      s   

z_BatchNorm1d.__init__c                 C   s   |j }| jr,|jdkr||d |d  |d }n||d |d  |d |d }n	| js5|dd}| |}| jrD||}|S | jsM|dd}|S )N   r   r      r   )shaper   ndimreshaper   	transposer   )r   xshape_orx_nr   r   r   forward$   s   
$

z_BatchNorm1d.forward)NNr   r   TTFF)__name__
__module____qualname__r   r"   __classcell__r   r   r   r   r      s    r   c                       sV   e Zd Z									d fdd	Zd	d
 ZdededefddZdd Z  ZS )_Conv1dNr   sameTreflectFc              
      s   t    || _|| _|| _|| _|
| _d| _|| _|d u r&|d u r&t	d|d u r/| 
|}tj||| j| j| jd||	d| _d S )NFz.Must provide one of input_shape or in_channelsr   )stridedilationpaddinggroupsbias)r   r   kernel_sizer*   r+   r,   padding_mode	unsqueezer   
ValueError_check_input_shaper   Conv1dconv)r   out_channelsr/   r   in_channelsr*   r+   r,   r-   r.   r0   r   r   r   r   r   :   s,   

z_Conv1d.__init__c                 C   s   | j s	|dd}| jr|d}| jdkr"| || j| j| j}n#| jdkr8| jd | j }t	||df}n| jdkr>nt
d| j | |}| jrR|d}| j s[|dd}|S )Nr   r   r(   causalr   validz1Padding must be 'same', 'valid' or 'causal'. Got )r   r   r1   r,   _manage_paddingr/   r+   r*   Fpadr2   r5   squeeze)r   r   num_padwxr   r   r   r"   b   s$   





z_Conv1d.forwardr/   r+   r*   c                 C   s.   |j d }t||||}tj||| jd}|S )Nr   )mode)r   get_padding_elemr;   r<   r0   )r   r   r/   r+   r*   L_inr,   r   r   r   r:      s   
z_Conv1d._manage_paddingc                 C   sr   t |dkrd| _d}n| jr|d }nt |dkr|d }n
tdtt | | jd dkr7td| j |S )z@Checks the input shape and returns the number of input channels.r   Tr   r   z"conv1d expects 2d, 3d inputs. Got r   z4The field kernel size must be an odd number. Got %s.)lenr1   r   r2   strr/   )r   r   r7   r   r   r   r3      s   

z_Conv1d._check_input_shape)	NNr   r   r(   r   Tr)   F)	r#   r$   r%   r   r"   intr:   r3   r&   r   r   r   r   r'   9   s(    (
r'   rB   r*   r/   r+   c                 C   s   |dkr%t | ||  | d }||d  ||  }|d |d g}|S | ||d   d | d }| | d | | d g}|S )Nr   r   )mathceil)rB   r*   r/   r+   n_stepsL_outr,   r   r   r   rA      s   rA   c                          e Zd Z fddZ  ZS )r4   c                       t  j|ddi| d S Nr   Tr   r   r   argskwargsr   r   r   r         zConv1d.__init__r#   r$   r%   r   r&   r   r   r   r   r4          r4   c                       rJ   )r   c                    rK   rL   rM   rN   r   r   r   r      rQ   zBatchNorm1d.__init__rR   r   r   r   r   r      rS   r   c                 C   s   t | jdks	J |d u r|    }tj|| j| jd	t | || 
dk }|d u r1| j}|d u r8| j}tj|||d}|S )Nr   )devicedtype)rU   rT   )rC   r   maxlongitemtorcharangerT   rU   expandr1   	as_tensor)lengthmax_lenrU   rT   maskr   r   r   length_to_mask   s   r`   c                       s,   e Zd Zejdf fdd	Zdd Z  ZS )	TDNNBlockr   c                    s:   t t|   t|||||d| _| | _t|d| _d S )N)r7   r6   r/   r+   r-   r   )r   ra   r   r4   r5   
activationr   r   )r   r7   r6   r/   r+   rc   r-   r   r   r   r      s   	zTDNNBlock.__init__c                 C   s   |  | | |S N)r   rc   r5   )r   r   r   r   r   r"      s   zTDNNBlock.forward)r#   r$   r%   r   ReLUr   r"   r&   r   r   r   r   ra      s
    ra   c                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
Res2NetBlocka  An implementation of Res2NetBlock w/ dilation.

    Arguments
    ---------
    in_channels : int
        The number of channels expected in the input.
    out_channels : int
        The number of output channels.
    scale : int
        The scale of the Res2Net block.
    kernel_size: int
        The kernel size of the Res2Net block.
    dilation : int
        The dilation of the Res2Net block.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
       r   r   c                    sp   t t|   || dksJ || dksJ || || t fddt|d D | _|| _d S )Nr   c                    s   g | ]
}t  d qS ))r/   r+   )ra   ).0ir+   hidden_channel
in_channelr/   r   r   
<listcomp>  s    z)Res2NetBlock.__init__.<locals>.<listcomp>r   )r   rf   r   r   
ModuleListrangeblocksscale)r   r7   r6   rq   r/   r+   r   rj   r   r     s   

zRes2NetBlock.__init__c                 C   s   g }t tj|| jddD ])\}}|dkr|}n|dkr&| j|d  |}n| j|d  || }|| qtj|dd}|S )Nr   dimr   )	enumeraterY   chunkrq   rp   appendcat)r   r   yri   x_iy_ir   r   r   r"     s   zRes2NetBlock.forward)rg   r   r   r#   r$   r%   __doc__r   r"   r&   r   r   r   r   rf      s    rf   c                       s*   e Zd ZdZ fddZdddZ  ZS )SEBlocka3  An implementation of squeeze-and-excitation block.

    Arguments
    ---------
    in_channels : int
        The number of input channels.
    se_channels : int
        The number of output channels after squeeze.
    out_channels : int
        The number of output channels.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> se_layer = SEBlock(64, 16, 64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    c                    sN   t t|   t||dd| _tjjdd| _t||dd| _	tj
 | _d S )Nr   r7   r6   r/   T)inplace)r   r}   r   r4   conv1rY   r   re   reluconv2Sigmoidsigmoid)r   r7   se_channelsr6   r   r   r   r   =  s
   zSEBlock.__init__Nc                 C   s   |j d }|d ur+t|| ||jd}|d}|jddd}|| jddd| }n|jddd}| | |}| | 	|}|| S )Nr   r^   rT   r   r   Trs   keepdim)
r   r`   rT   r1   summeanr   r   r   r   )r   r   lengthsLr_   totalsr   r   r   r"   E  s   

zSEBlock.forwardrd   r{   r   r   r   r   r}   '  s    r}   c                       s,   e Zd ZdZd	 fdd	Zd
ddZ  ZS )AttentiveStatisticsPoolingaT  This class implements an attentive statistic pooling layer for each channel.
    It returns the concatenated mean and std of the input tensor.

    Arguments
    ---------
    channels: int
        The number of input channels.
    attention_channels: int
        The number of attention channels.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> asp_layer = AttentiveStatisticsPooling(64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 1, 128])
       Tc                    s^   t    d| _|| _|rt|d |dd| _nt||dd| _t | _t	||dd| _
d S )Ng-q=r   r   r~   )r   r   r   global_contextra   tdnnr   Tanhtanhr4   r5   )r   channelsattention_channelsr   r   r   r   r   j  s   

z#AttentiveStatisticsPooling.__init__Nc                 C   s(  |j d }d| jfdd}|du rtj|j d |jd}t|| ||jd}|d	}| jr_|jdd
d	 }|||| \}}|d
d	d	|}|d
d	d	|}tj|||gd	d}	n|}	| | | |	}	|	|dkt	d}	tj|	dd}	|||	\}}tj||fd	d}
|
d}
|
S )zCalculates mean and std for a batch (input tensor).

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape [N, C, L].
        r   r   c                 S   s@   ||   |}t|| || d  ||}||fS )Nr   )r   rY   sqrtr1   powclamp)r   mrs   r   r   stdr   r   r   _compute_statistics  s   *z?AttentiveStatisticsPooling.forward.<locals>._compute_statisticsNr   )rT   r   r   Tr   rr   z-inf)r   r   rY   onesrT   r`   r1   r   r   floatrepeatrw   r5   r   r   masked_fillr;   softmax)r   r   r   r   r   r_   r   r   r   attnpooled_statsr   r   r   r"   v  s(   


z"AttentiveStatisticsPooling.forward)r   Trd   r{   r   r   r   r   r   U  s    r   c                       s<   e Zd ZdZddddejjdf fdd	Zd
dd	Z  Z	S )SERes2NetBlocka  An implementation of building block in ECAPA-TDNN, i.e.,
    TDNN-Res2Net-TDNN-SEBlock.

    Arguments
    ----------
    out_channels: int
        The number of output channels.
    res2net_scale: int
        The scale of the Res2Net block.
    kernel_size: int
        The kernel size of the TDNN blocks.
    dilation: int
        The dilation of the Res2Net block.
    activation : torch class
        A class for constructing the activation layers.
    groups: int
    Number of blocked connections from input channels to output channels.

    Example
    -------
    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
    >>> out = conv(x).transpose(1, 2)
    >>> out.shape
    torch.Size([8, 120, 64])
    rg   r   r   c	           	         s   t    || _t||dd||d| _t|||||| _t||dd||d| _t|||| _	d | _
||kr?t||dd| _
d S d S )Nr   )r/   r+   rc   r-   r~   )r   r   r6   ra   tdnn1rf   res2net_blocktdnn2r}   se_blockshortcutr4   )	r   r7   r6   res2net_scaler   r/   r+   rc   r-   r   r   r   r     s<   

zSERes2NetBlock.__init__Nc                 C   sF   |}| j r
|  |}| |}| |}| |}| ||}|| S rd   )r   r   r   r   r   )r   r   r   residualr   r   r   r"     s   



zSERes2NetBlock.forwardrd   )
r#   r$   r%   r|   rY   r   re   r   r"   r&   r   r   r   r   r     s    *r   c                       sb   e Zd ZdZdejjg dg dg dddddg d	d
df fdd	ZdddZdddZ	  Z
S )
ECAPA_TDNNa  An implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).

    Arguments
    ---------
    activation : torch class
        A class for constructing the activation layers.
    channels : list of ints
        Output channels for TDNN/SERes2Net layer.
    kernel_sizes : list of ints
        List of kernel sizes for each layer.
    dilations : list of ints
        List of dilations for kernels in each layer.
    lin_neurons : int
        Number of neurons in linear layers.
    groups : list of ints
        List of groups for kernels in each layer.

    Example
    -------
    >>> input_feats = torch.rand([5, 120, 80])
    >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
    >>> outputs = compute_embedding(input_feats)
    >>> outputs.shape
    torch.Size([5, 1, 192])
       )   r   r   r   i   )   r   r   r   r   )r   r   r      r   r   rg   T)r   r   r   r   r      r   c                    s6  t    t|t|ksJ t|t|ksJ || _t | _|| _|| _| j	t
||d |d |d ||d  tdt|d D ]}| j	t||d  || ||	|| || ||| d qEt
|d |d |d |d ||d d| _t|d ||
d| _t|d d d| _t|d d |dd	| _d S )
Nr   r   )r   r   r/   r+   rc   r-   r   )r-   )r   r   r   rb   r~   )r   r   rC   r   r   rn   rp   window_sizewindow_shiftrv   ra   ro   r   mfar   aspr   asp_bnr4   fc)r   r   lin_neuronsrc   r   kernel_sizes	dilationsr   r   r   r   r-   r   r   ri   r   r   r   r     sb   




zECAPA_TDNN.__init__Nc           
      C   s   |j d }tt|| j }| jd }t|||ddfd}g }t|D ]?}|| j || j | j }}	| j	|d d d d ||	f |d urQt
|| d| jnd d}| |}| |}|| q%t
j|ddS )Nr   r   r)   r   rr   )r   rE   rF   rG   r   r   r;   r<   ro   r   rY   r   r   r   rv   rw   )
r   r   r   tt	num_chunkr<   	stat_listri   stedr   r   r   windowed_poolinge  s   



zECAPA_TDNN.windowed_poolingc              	   C   s   | dd}g }| jD ]}z|||d}W n ty"   ||}Y nw || qtj|dd dd}| |}| jdu rV| j||d}| 	|}| 
|}|d}|S | ||}| dd}|S )zReturns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        lengths: torch.Tensor
            Tensor of shape (batch, )
        r   r   r   Nrr   )r   rp   	TypeErrorrv   rY   rw   r   r   r   r   r   r=   r   )r   r   r   xllayerr   r   r   r"   |  s(   





zECAPA_TDNN.forwardrd   )r#   r$   r%   r|   rY   r   re   r   r   r"   r&   r   r   r   r   r     s"    
Mr   )NNN)rF   rY   torch.nnr   torch.nn.functional
functionalr;   Moduler   r'   rE   rA   r4   r   r`   ra   rf   r}   r   r   r   r   r   r   r   <module>   s     2n
<.SS