o
    }oi.                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZ G dd dejZejj	
d'dejdejdededejf
ddZd(dee dedefddZd)dejdejdedefddZG dd  d ejZG d!d" d"ejZG d#d$ d$ejZG d%d& d&ejZdS )*    )ListN)inf)nn)
functional)get_same_paddinginit_weightsc                	       s>   e Zd ZdZddedededef fd	d
ZdddZ	  Z
S )StatsPoolLayera  Statistics and time average pooling (TAP) layer

    This computes mean and, optionally, standard deviation statistics across the time dimension.

    Args:
        feat_in: Input features with shape [B, D, T]
        pool_mode: Type of pool mode. Supported modes are 'xvector' (mean and standard deviation) and 'tap' (time
            average pooling, i.e., mean)
        eps: Epsilon, minimum value before taking the square root, when using 'xvector' mode.
        unbiased: Whether to use the biased estimator for the standard deviation when using 'xvector' mode. The default
            for torch.Tensor.std() is True.

    Returns:
        Pooled statistics with shape [B, D].

    Raises:
        ValueError if an unsupported pooling mode is specified.
    xvector绽|=Tfeat_in	pool_modeepsunbiasedc                    sh   t    ddh}||vrtd| d| d|| _|| _|| _|| _| jdkr2|  jd9  _d S d S )Nr	   tapzPool mode must be one of z; got ''   )super__init__
ValueErrorr   r   r   r   )selfr   r   r   r   supported_modes	__class__ h/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/tdnn_attention.pyr   -   s   

zStatsPoolLayer.__init__Nc           
      C   s  |d u r1|j dd}| jdkr-| jrdnd}|jd|dj| jd}tj||gdd}|S |}|S t||dd	}|	|d
}|j dd}||j
d | d }| jdkr| jrZdnd}||d	|d
dd|dd|j| jd }	tj||	fdd}|S |}|S )Ndimr	      r   )r   
correction)minF)likelengths
valid_onesg        g       @)meanr   r   stdclampr   torchcatmake_seq_mask_likemasked_fillshape	unsqueezesubpowsumdivviewsqrt)
r   encoder_outputlengthr$   r   r%   pooledmaskmeansstdsr   r   r   forward:   s8   

	zStatsPoolLayer.forward)r	   r
   TN)__name__
__module____qualname____doc__intstrfloatboolr   r9   __classcell__r   r   r   r   r      s     r   Tr   r!   r"   r#   time_dimreturnc                 C   sv   t j| j| | jd|jd d|d}t|  |  D ]}|d}q"|dkr4|	|d}|s9| }|S )Ndevicer   r   r   )
r'   aranger+   rG   repeatltr,   ranger   	transpose)r!   r"   r#   rD   r6   _r   r   r   r)   [   s   0r)   lensmax_lenrG   c                 C   sN   t ||}|d| d| dk }|d}t j|ddd}||fS )aQ  
    outputs masking labels for list of lengths of audio features, with max length of any
    mask as max_len
    input:
        lens: list of lens
        max_len: max length of any audio feature
    output:
        mask: masked labels
        num_values: sum of mask values for each feature (useful for computing statistics later)
    Nr   r   r   Tr   keepdim)r'   rH   tor,   r/   )rN   rO   rG   lens_matr6   
num_valuesr   r   r   lens_to_maskk   s
   
rU   r   r
   xmr   r   c                 C   sD   t j||  |d}t || || d ||}||fS )a  
    compute mean and standard deviation of input(x) provided with its masking labels (m)
    input:
        x: feature input
        m: averaged mask labels
    output:
        mean: mean of input features
        std: stadard deviation of input features
    r   r   )r'   r/   r2   r,   r.   r&   )rV   rW   r   r   r$   r%   r   r   r   get_statistics_with_mask}   s   
*rX   c                       sN   e Zd ZdZ				ddededededed	ef fd
dZdddZ  ZS )
TDNNModulea  
    Time Delayed Neural Module (TDNN) - 1D
    input:
        inp_filters: input filter channels for conv layer
        out_filters: output filter channels for conv layer
        kernel_size: kernel weight size for conv layer
        dilation: dilation for conv layer
        stride: stride for conv layer
        padding: padding for conv layer (default None: chooses padding value such that input and output feature shape matches)
    output:
        tdnn layer output
    r   Ninp_filtersout_filterskernel_sizedilationstridepaddingc                    sP   t    |d u rt|||d}tj|||||d| _t | _t|| _	d S )N)r^   r]   )in_channelsout_channelsr\   r]   r_   )
r   r   r   r   Conv1d
conv_layerReLU
activationBatchNorm1dbn)r   rZ   r[   r\   r]   r^   r_   r   r   r   r      s   
	
zTDNNModule.__init__c                 C   s   |  |}| |}| |S r:   )rc   re   rg   )r   rV   r4   r   r   r   r9      s   


zTDNNModule.forward)r   r   r   Nr:   r;   r<   r=   r>   r?   r   r9   rC   r   r   r   r   rY      s(    rY   c                       sB   e Zd ZdZddededededef
 fdd	ZdddZ  ZS )MaskedSEModulea  
    Squeeze and Excite module implementation with conv1d layers
    input:
        inp_filters: input filter channel size
        se_filters: intermediate squeeze and excite channel output and input size
        out_filters: output filter channel size
        kernel_size: kernel_size for both conv1d layers
        dilation: dilation size for both conv1d layers

    output:
        squeeze and excite layer output
    r   rZ   
se_filtersr[   r\   r]   c                    sL   t    ttj||||dt t|tj||||dt | _d S )Nr\   r]   )	r   r   r   
Sequentialrb   rd   rf   Sigmoidse_layer)r   rZ   rj   r[   r\   r]   r   r   r   r      s$   

zMaskedSEModule.__init__Nc                 C   sb   |d u rt j|ddd}n|d}t|||jd\}}t j|| ddd| }| |}|| S )Nr   T)r   keep_dimrO   rG   rP   )r'   r$   sizerU   rG   r/   rn   )r   inputr4   rV   rO   r6   rT   outr   r   r   r9      s   

zMaskedSEModule.forward)r   r   r:   rh   r   r   r   r   ri      s    $ri   c                       sT   e Zd ZdZ					ddededed	ed
ededef fddZdddZ  ZS )TDNNSEModuleaj  
    Modified building SE_TDNN group module block from ECAPA implementation for faster training and inference
    Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf)
    inputs:
        inp_filters: input filter channel size
        out_filters: output filter channel size
        group_scale: scale value to group wider conv channels (deafult:8)
        se_channels: squeeze and excite output channel size (deafult: 1024/8= 128)
        kernel_size: kernel_size for group conv1d layers (default: 1)
        dilation: dilation size for group conv1d layers  (default: 1)
          r   xavier_uniformrZ   r[   group_scalese_channelsr\   r]   	init_modec           
         s   t    || _t||dd}tj||||||d}	tt||ddd|	t t	|t||ddd| _
t|||| _|  fdd d S )Nr   )r\   r]   r^   )r\   r]   r_   groupsrk   c                    s   t |  dS )N)mode)r   )rV   rz   r   r   <lambda>  s    z'TDNNSEModule.__init__.<locals>.<lambda>)r   r   r[   r   r   rb   rl   rY   rd   rf   group_tdnn_blockri   rn   apply)
r   rZ   r[   rx   ry   r\   r]   rz   padding_val
group_convr   r}   r   r      s(   

zTDNNSEModule.__init__Nc                 C   s   |  |}| ||}|| S r:   )r   rn   )r   rr   r4   rV   r   r   r   r9     s   
zTDNNSEModule.forward)ru   rv   r   r   rw   r:   )	r;   r<   r=   r>   r?   r@   r   r9   rC   r   r   r   r   rt      s.    "rt   c                       sJ   e Zd ZdZ				ddedededed	ef
 fd
dZdddZ  ZS )AttentivePoolLayera  
    Attention pooling layer for pooling speaker embeddings
    Reference: ECAPA-TDNN Embeddings for Speaker Diarization (https://arxiv.org/pdf/2104.01466.pdf)
    inputs:
        inp_filters: input feature channel length from encoder
        attention_channels: intermediate attention channel size
        kernel_size: kernel_size for TDNN and attention conv1d layers (default: 1)
        dilation: dilation size for TDNN and attention conv1d layers  (default: 1)
    rv   r   r
   rZ   attention_channelsr\   r]   r   c              
      sP   t    d| | _tt|d |||dt tj||||d| _|| _	d S )Nr      rk   )r`   ra   r\   r]   )
r   r   r   r   rl   rY   Tanhrb   attention_layerr   )r   rZ   r   r\   r]   r   r   r   r   r   &  s   



zAttentivePoolLayer.__init__Nc                 C   s   | d}|d u rtj|jd |jd}t|||jd\}}t||| \}}|ddd|}|ddd|}tj	|||gdd}| 
|}||dkt }tj|dd}	t||	\}
}tj	|
|fdddS )Nr   r   rF   rp   r   r   )rq   r'   onesr+   rG   rU   rX   r,   rI   r(   r   r*   r   Fsoftmax)r   rV   r4   rO   r6   rT   r$   r%   attnalphamusgr   r   r   r9   >  s   

zAttentivePoolLayer.forward)rv   r   r   r
   r:   )	r;   r<   r=   r>   r?   rA   r   r9   rC   r   r   r   r   r     s$    r   )Tr   r:   )r   r
   )typingr   r'   numpyr   r   torch.nnr   r   ,nemo.collections.asr.parts.submodules.jasperr   r   Moduler   jitscript_if_tracingTensorrB   r?   r)   r@   rU   rA   rX   rY   ri   rt   r   r   r   r   r   <module>   s4   B ,.5