o
    ߥi:                     @   sx  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ ddlmZ d#dd	Zd
edededefddZG dd dejZ G dd dejZ!G dd dejZ"G dd de	jjZ#G dd dejZ$G dd dejZ%G dd dejZ&G dd dejZ'ej(ej)ej*d G d!d" d"eZ+dS )$z\ This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                 C   s   t | jdks	J |d u r|    }tj|| j| jd	t | || 
dk }|d u r1| j}|d u r8| j}tj|||d}|S )N   )devicedtype)r   r   )lenshapemaxlongitemtorcharanger   r   expand	unsqueeze	as_tensor)lengthmax_lenr   r   mask r   Y/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/ecapa_tdnn.pylength_to_mask   s"   
r   L_instridekernel_sizedilationc                 C   s   |dkr%t | ||  | d }||d  ||  }|d |d g}|S | ||d   d | d }| | d | | d g}|S )Nr
      )mathceil)r   r   r   r    n_stepsL_outpaddingr   r   r   get_padding_elem'   s   r'   c                       sH   e Zd Z						d fdd	Zdd Zd	ed
edefddZ  ZS )Conv1dr
   sameTreflectc
           
   
      sN   t    || _|| _|| _|| _|	| _tj||| j| j| jd||d| _	d S )Nr   )r   r    r&   groupsbias)
super__init__r   r   r    r&   padding_modennr(   conv)
selfout_channelsr   in_channelsr   r    r&   r+   r,   r/   	__class__r   r   r.   6   s    
zConv1d.__init__c                 C   sv   | j dkr| || j| j| j}n#| j dkr'| jd | j }t||df}n| j dkr-ntd| j  | |}|S )Nr)   causalr
   r   validz1Padding must be 'same', 'valid' or 'causal'. Got )	r&   _manage_paddingr   r    r   Fpad
ValueErrorr1   )r2   xnum_padwxr   r   r   forwardT   s    



zConv1d.forwardr   r    r   c                 C   s.   |j d }t||||}tj||| jd}|S )N)mode)r   r'   r:   r;   r/   )r2   r=   r   r    r   r   r&   r   r   r   r9   i   s   
zConv1d._manage_padding)r
   r
   r)   r
   Tr*   )__name__
__module____qualname__r.   r@   intr9   __classcell__r   r   r5   r   r(   4   s     r(   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )BatchNorm1dh㈵>皙?c                    s    t    tj|||d| _d S )N)epsmomentum)r-   r.   r0   rH   norm)r2   
input_sizerK   rL   r5   r   r   r.   y   s   
zBatchNorm1d.__init__c                 C   s
   |  |S N)rM   r2   r=   r   r   r   r@      s   
zBatchNorm1d.forward)rI   rJ   rC   rD   rE   r.   r@   rG   r   r   r5   r   rH   w   s
    rH   c                       s,   e Zd Zejdf fdd	Zdd Z  ZS )	TDNNBlockr
   c                    s:   t t|   t|||||d| _| | _t|d| _d S )N)r4   r3   r   r    r+   rN   )r-   rR   r.   r(   r1   
activationrH   rM   )r2   r4   r3   r   r    rT   r+   r5   r   r   r.      s   	zTDNNBlock.__init__c                 C   s   |  | | |S rO   )rM   rT   r1   rP   r   r   r   r@      s   zTDNNBlock.forward)rC   rD   rE   r0   ReLUr.   r@   rG   r   r   r5   r   rR      s
    rR   c                       s,   e Zd Z			d fdd	Zdd Z  ZS )	Res2NetBlock      r
   c                    sp   t t|   || dksJ || dksJ || || t fddt|d D | _|| _d S )Nr   c                    s   g | ]
}t  d qS ))r   r    )rR   ).0ir    hidden_channel
in_channelr   r   r   
<listcomp>   s    z)Res2NetBlock.__init__.<locals>.<listcomp>r
   )r-   rV   r.   r0   
ModuleListrangeblocksscale)r2   r4   r3   rb   r   r    r5   r[   r   r.      s   


zRes2NetBlock.__init__c                 C   s   g }t tj|| jddD ])\}}|dkr|}n|dkr&| j|d  |}n| j|d  || }|| qtj|dd}|S )Nr
   dimr   )	enumerater   chunkrb   ra   appendcat)r2   r=   yrZ   x_iy_ir   r   r   r@      s   zRes2NetBlock.forward)rW   rX   r
   rQ   r   r   r5   r   rV      s    rV   c                       s&   e Zd Z fddZdddZ  ZS )SEBlockc                    sN   t t|   t||dd| _tjjdd| _t||dd| _	tj
 | _d S )Nr
   r4   r3   r   T)inplace)r-   rl   r.   r(   conv1r   r0   rU   reluconv2Sigmoidsigmoid)r2   r4   se_channelsr3   r5   r   r   r.      s   zSEBlock.__init__Nc                 C   s   |j d }|d ur+t|| ||jd}|d}|jddd}|| jddd| }n|jddd}| | |}| | 	|}|| S )NrA   r   r   r
   r!   Trd   keepdim)
r   r   r   r   summeanrp   ro   rs   rq   )r2   r=   lengthsLr   totalsr   r   r   r@      s   

zSEBlock.forwardrO   rQ   r   r   r5   r   rl      s    
rl   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
AttentiveStatisticsPooling   Tc                    s^   t    d| _|| _|rt|d |dd| _nt||dd| _t | _t	||dd| _
d S )Ng-q=rX   r
   rm   )r-   r.   rK   global_contextrR   tdnnr0   Tanhtanhr(   r1   )r2   channelsattention_channelsr   r5   r   r   r.      s   

z#AttentiveStatisticsPooling.__init__Nc                 C   s(  |j d }d| jfdd}|d u rtj|j d |jd}t|| ||jd}|d}| jr_|jdd	d
	 }|||| \}}|d
dd|}|d
dd|}tj|||gdd}	n|}	| | | |	}	|	|dkt	d}	tj|	dd}	|||	\}}tj||fdd}
|
d}
|
S )NrA   r!   c                 S   s@   ||   |}t|| || d  ||}||fS )Nr!   )rx   r   sqrtr   powclamp)r=   mrd   rK   ry   stdr   r   r   _compute_statistics   s
   "z?AttentiveStatisticsPooling.forward.<locals>._compute_statisticsr   )r   ru   r
   Trv   rc   z-inf)r   rK   r   onesr   r   r   r   rx   floatrepeatrh   r1   r   r   masked_fillr:   softmax)r2   r=   rz   r{   r   r   r|   ry   r   attnpooled_statsr   r   r   r@      s(   


z"AttentiveStatisticsPooling.forward)r   TrO   rQ   r   r   r5   r   r~      s    r~   c                       s8   e Zd Zddddejjdf fdd	Zd	ddZ  ZS )
SERes2NetBlockrW   r   r
   c	           	         s   t    || _t||dd||d| _t|||||| _t||dd||d| _t|||| _	d | _
||kr?t||dd| _
d S d S )Nr
   )r   r    rT   r+   rm   )r-   r.   r3   rR   tdnn1rV   res2net_blocktdnn2rl   se_blockshortcutr(   )	r2   r4   r3   res2net_scalert   r   r    rT   r+   r5   r   r   r.   &  s<   
zSERes2NetBlock.__init__Nc                 C   sF   |}| j r
|  |}| |}| |}| |}| ||}|| S rO   )r   r   r   r   r   )r2   r=   rz   residualr   r   r   r@   O  s   



zSERes2NetBlock.forwardrO   )	rC   rD   rE   r   r0   rU   r.   r@   rG   r   r   r5   r   r   $  s    )r   c                       sV   e Zd ZdZddejjg dg dg ddddd	g d
f fdd	ZdddZ  Z	S )
ECAPA_TDNNzAn implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    cpu   )   r   r   r   i   )   rX   rX   rX   r
   )r
   r!   rX      r
   r   rW   T)r
   r
   r
   r
   r
   c                    s*  t    t|t|ksJ t|t|ksJ || _t | _| jt||d |d |d ||d  t	dt|d D ]}| jt
||d  || |	|
|| || ||| d q?t|d |d |d |d ||d d| _t|d ||d| _t|d d d| _t|d d |dd	| _d S )
Nr   r
   )r   rt   r   r    rT   r+   rA   )r+   )r   r   r!   rS   rm   )r-   r.   r   r   r0   r_   ra   rg   rR   r`   r   mfar~   asprH   asp_bnr(   fc)r2   rN   r   lin_neuronsrT   r   kernel_sizes	dilationsr   r   rt   r   r+   rZ   r5   r   r   r.   b  s^   




zECAPA_TDNN.__init__Nc              	   C   s   | dd}g }| jD ]}z|||d}W n ty"   ||}Y nw || qtj|dd dd}| |}| j||d}| |}| 	|}| dd
d}|S )zReturns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        r
   r!   )rz   Nrc   )	transposera   	TypeErrorrg   r   rh   r   r   r   r   squeeze)r2   r=   rz   xllayerr   r   r   r@     s    



zECAPA_TDNN.forwardrO   )
rC   rD   rE   __doc__r   r0   rU   r.   r@   rG   r   r   r5   r   r   \  s    Hr   )module_namec                       sB   e Zd Zdeeef f fddZdd Zdd Zdd	 Z	  Z
S )
SpeakerVerificationECAPATDNNmodel_configc                    s   t  j||g|R i | || _|| _| jd dkrtdd| _g d}t| jd | _t| j t	| j|d| _
|d }| | | j
| j | j
  d S )	Nchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r   r   r   r   i   r   )r   pretrained_model)r-   r.   r   other_configr<   feature_dimr	   r   printr   embedding_model/_SpeakerVerificationECAPATDNN__load_check_pointtoeval)r2   	model_dirr   argskwargschannels_configpretrained_model_namer5   r   r   r.     s$   

z%SpeakerVerificationECAPATDNN.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr
   r   r!   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarrayr   
from_numpyr   r   r   ._SpeakerVerificationECAPATDNN__extract_featurer   r   r   detachr   )r2   audiofeature	embeddingr   r   r   r@     s   


z$SpeakerVerificationECAPATDNN.forwardc                 C   sT   g }|D ]}t j|d| jd}||jddd }||d qt|}|S )Nr   )num_mel_binsTrv   )Kaldifbankr   r   ry   rg   r   rh   )r2   r   featuresaur   r   r   r   __extract_feature  s   
z.SpeakerVerificationECAPATDNN.__extract_featurec                 C   s0   | j jtjtj| j|tdddd d S )Nr   )map_locationT)strict)	r   load_state_dictr   loadospathjoinr   r   )r2   r   r   r   r   __load_check_point  s   
z/SpeakerVerificationECAPATDNN.__load_check_point)rC   rD   rE   r   strr   r.   r@   r   r   rG   r   r   r5   r   r     s
    
r   )NNN),r   r"   r   typingr   r   r   numpyr   r   torch.nnr0   torch.nn.functional
functionalr:   torchaudio.compliance.kaldi
compliancekaldir   modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr   modelscope.utils.devicer	   r   rF   r'   Moduler(   rH   rR   rV   rl   r~   r   r   register_modulespeaker_verificationecapa_tdnn_svr   r   r   r   r   <module>   s6   
C'=8o