o
    ߥiIH                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
m	  mZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ d.ddZd	ed
ededefddZG dd de	jZG dd de	jZG dd de	jZG dd dej	jZG dd de	jZ G dd de	jZ!G dd de	jZ"G dd de	jZ#dd  Z$d/d%d&Z%G d'd( d(e	jZ&G d)d* d*ej	jZ'ej(ej)ej*d+G d,d- d-eZ+dS )0aB   This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
    Self-Distillation Prototypes Network(SDPN) is a self-supervised learning framwork in SV.
    It comprises a teacher and a student network with identical architecture
    but different parameters. Teacher/student network consists of three main modules:
    the encoder for extracting speaker embeddings, multi-layer perceptron for
    feature transformation, and prototypes for computing soft-distributions between
    global and local views. EMA denotes Exponential Moving Average.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasksc                 C   s   t | jdks	J |d u r|    }tj|| j| jd	t | || 
dk }|d u r1| j}|d u r8| j}tj|||d}|S )N   )devicedtype)r   r
   )lenshapemaxlongitemtorcharanger
   r   expand	unsqueeze	as_tensor)lengthmax_lenr   r
   mask r   S/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/sdpn.pylength_to_mask   s"   
r   L_instridekernel_sizedilationc                 C   s   |dkr%t | ||  | d }||d  ||  }|d |d g}|S | ||d   d | d }| | d | | d g}|S Nr	      )mathceil)r   r   r   r   n_stepsL_outpaddingr   r   r   get_padding_elem+   s   r'   c                       sH   e Zd Z						d fdd	Zdd Zd	ed
edefddZ  ZS )Conv1dr	   sameTreflectc
           
   
      sN   t    || _|| _|| _|| _|	| _tj||| j| j| jd||d| _	d S )Nr   )r   r   r&   groupsbias)
super__init__r   r   r   r&   padding_modennr(   conv)
selfout_channelsr   in_channelsr   r   r&   r+   r,   r/   	__class__r   r   r.   :   s    
zConv1d.__init__c                 C   sv   | j dkr| || j| j| j}n#| j dkr'| jd | j }t||df}n| j dkr-ntd| j  | |}|S )Nr)   causalr	   r   validz1Padding must be 'same', 'valid' or 'causal'. Got )	r&   _manage_paddingr   r   r   Fpad
ValueErrorr1   )r2   xnum_padwxr   r   r   forwardX   s    



zConv1d.forwardr   r   r   c                 C   s.   |j d }t||||}tj||| jd}|S )N)mode)r   r'   r:   r;   r/   )r2   r=   r   r   r   r   r&   r   r   r   r9   m   s   
zConv1d._manage_padding)r	   r	   r)   r	   Tr*   )__name__
__module____qualname__r.   r@   intr9   __classcell__r   r   r5   r   r(   8   s     r(   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )BatchNorm1dh㈵>皙?c                    s    t    tj|||d| _d S )N)epsmomentum)r-   r.   r0   rH   norm)r2   
input_sizerK   rL   r5   r   r   r.   }   s   
zBatchNorm1d.__init__c                 C   s
   |  |S N)rM   r2   r=   r   r   r   r@      s   
zBatchNorm1d.forward)rI   rJ   rC   rD   rE   r.   r@   rG   r   r   r5   r   rH   {   s
    rH   c                       s,   e Zd Zejdf fdd	Zdd Z  ZS )	TDNNBlockr	   c                    s:   t t|   t|||||d| _| | _t|d| _d S )N)r4   r3   r   r   r+   rN   )r-   rR   r.   r(   r1   
activationrH   rM   )r2   r4   r3   r   r   rT   r+   r5   r   r   r.      s   	zTDNNBlock.__init__c                 C   s   |  | | |S rO   )rM   rT   r1   rP   r   r   r   r@      s   zTDNNBlock.forward)rC   rD   rE   r0   ReLUr.   r@   rG   r   r   r5   r   rR      s
    rR   c                       s,   e Zd Z			d fdd	Zdd Z  ZS )	Res2NetBlock      r	   c                    sp   t t|   || dksJ || dksJ || || t fddt|d D | _|| _d S )Nr   c                    s   g | ]
}t  d qS ))r   r   )rR   ).0ir   hidden_channel
in_channelr   r   r   
<listcomp>   s    z)Res2NetBlock.__init__.<locals>.<listcomp>r	   )r-   rV   r.   r0   
ModuleListrangeblocksscale)r2   r4   r3   rb   r   r   r5   r[   r   r.      s   


zRes2NetBlock.__init__c                 C   s   g }t tj|| jddD ])\}}|dkr|}n|dkr&| j|d  |}n| j|d  || }|| qtj|dd}|S )Nr	   dimr   )	enumerater   chunkrb   ra   appendcat)r2   r=   yrZ   x_iy_ir   r   r   r@      s   zRes2NetBlock.forward)rW   rX   r	   rQ   r   r   r5   r   rV      s    rV   c                       s&   e Zd Z fddZdddZ  ZS )SEBlockc                    sN   t t|   t||dd| _tjjdd| _t||dd| _	tj
 | _d S )Nr	   r4   r3   r   T)inplace)r-   rl   r.   r(   conv1r   r0   rU   reluconv2Sigmoidsigmoid)r2   r4   se_channelsr3   r5   r   r   r.      s   zSEBlock.__init__Nc                 C   s   |j d }|d ur+t|| ||jd}|d}|jddd}|| jddd| }n|jddd}| | |}| | 	|}|| S )NrA   r   r
   r	   r!   Trd   keepdim)
r   r   r
   r   summeanrp   ro   rs   rq   )r2   r=   lengthsLr   totalsr   r   r   r@      s   

zSEBlock.forwardrO   rQ   r   r   r5   r   rl      s    
rl   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
AttentiveStatisticsPooling   Tc                    s^   t    d| _|| _|rt|d |dd| _nt||dd| _t | _t	||dd| _
d S )Ng-q=rX   r	   rm   )r-   r.   rK   global_contextrR   tdnnr0   Tanhtanhr(   r1   )r2   channelsattention_channelsr   r5   r   r   r.      s   

z#AttentiveStatisticsPooling.__init__Nc                 C   s(  |j d }d| jfdd}|d u rtj|j d |jd}t|| ||jd}|d}| jr_|jdd	d
	 }|||| \}}|d
dd|}|d
dd|}tj|||gdd}	n|}	| | | |	}	|	|dkt	d}	tj|	dd}	|||	\}}tj||fdd}
|
d}
|
S )NrA   r!   c                 S   s@   ||   |}t|| || d  ||}||fS )Nr!   )rx   r   sqrtr   powclamp)r=   mrd   rK   ry   stdr   r   r   _compute_statistics   s
   "z?AttentiveStatisticsPooling.forward.<locals>._compute_statisticsr   )r
   ru   r	   Trv   rc   z-inf)r   rK   r   onesr
   r   r   r   rx   floatrepeatrh   r1   r   r   masked_fillr:   softmax)r2   r=   rz   r{   r   r   r|   ry   r   attnpooled_statsr   r   r   r@      s(   


z"AttentiveStatisticsPooling.forward)r   TrO   rQ   r   r   r5   r   r~      s    r~   c                       s8   e Zd Zddddejjdf fdd	Zd	ddZ  ZS )
SERes2NetBlockrW   r   r	   c	           	         s   t    || _t||dd||d| _t|||||| _t||dd||d| _t|||| _	d | _
||kr?t||dd| _
d S d S )Nr	   )r   r   rT   r+   rm   )r-   r.   r3   rR   tdnn1rV   res2net_blocktdnn2rl   se_blockshortcutr(   )	r2   r4   r3   res2net_scalert   r   r   rT   r+   r5   r   r   r.   *  s<   
zSERes2NetBlock.__init__Nc                 C   sF   |}| j r
|  |}| |}| |}| |}| ||}|| S rO   )r   r   r   r   r   )r2   r=   rz   residualr   r   r   r@   S  s   



zSERes2NetBlock.forwardrO   )	rC   rD   rE   r   r0   rU   r.   r@   rG   r   r   r5   r   r   (  s    )r   c                       sV   e Zd ZdZddejjg dg dg ddddd	g d
f fdd	ZdddZ  Z	S )
ECAPA_TDNNzAn implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    cpu   )r   r   r   r   i   )   rX   rX   rX   r	   )r	   r!   rX      r	   r   rW   T)r	   r	   r	   r	   r	   c                    s*  t    t|t|ksJ t|t|ksJ || _t | _| jt||d |d |d ||d  t	dt|d D ]}| jt
||d  || |	|
|| || ||| d q?t|d |d |d |d ||d d| _t|d ||d| _t|d d d| _t|d d |dd	| _d S )
Nr   r	   )r   rt   r   r   rT   r+   rA   )r+   )r   r   r!   rS   rm   )r-   r.   r   r   r0   r_   ra   rg   rR   r`   r   mfar~   asprH   asp_bnr(   fc)r2   rN   r
   lin_neuronsrT   r   kernel_sizes	dilationsr   r   rt   r   r+   rZ   r5   r   r   r.   f  s^   




zECAPA_TDNN.__init__Nc              	   C   s   | dd}g }| jD ]}z|||d}W n ty"   ||}Y nw || qtj|dd dd}| |}| j||d}| |}| 	|}| dd
d}|S )zReturns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        r	   r!   )rz   Nrc   )	transposera   	TypeErrorrg   r   rh   r   r   r   r   squeeze)r2   r=   rz   xllayerr   r   r   r@     s    



zECAPA_TDNN.forwardrO   )
rC   rD   rE   __doc__r   r0   rU   r.   r@   rG   r   r   r5   r   r   `  s    Hr   c                 C   s   dd }||d|  k s||d|  krt jddd t B ||| | }||| | }| d| d d| d  |   | |td  | 	| | j
||d | W  d    S 1 sdw   Y  d S )	Nc                 S   s   dt | t d  d S )N      ?       @)r"   erfr   )r=   r   r   r   norm_cdf  s   z(_no_grad_trunc_normal_.<locals>.norm_cdfr!   zimean is more than 2 std from [a, b] in nn.init.trunc_normal_.The distribution of values may be incorrect.)
stacklevelr	   r   )minr   )warningswarnr   no_graduniform_erfinv_mul_r"   r   add_clamp_)tensorry   r   abr   l_ur   r   r   _no_grad_trunc_normal_  s     

$r           r          r   c                 C   s   t | ||||S rO   )r   )r   ry   r   r   r   r   r   r   trunc_normal_  s   r   c                       s6   e Zd Z				d fdd	Zdd Zd	d
 Z  ZS )SDPNHeadFrX         c                    s   t    t|d}|dkrt||| _nJt||g}|r'|t| |t  t	|d D ]}|t|| |rI|t| |t  q4|t|| tj
| | _| | j d S r    )r-   r.   r   r0   Linearmlprg   rH   GELUr`   
Sequentialapply_init_weights)r2   in_dimuse_bnnlayers
hidden_dimbottleneck_dimlayers_r5   r   r   r.     s    

zSDPNHead.__init__c                 C   sP   t |tjr"t|jdd t |tjr$|jd ur&tj|jd d S d S d S d S )Ng{Gz?)r   r   )
isinstancer0   r   r   weightr,   init	constant_)r2   r   r   r   r   r     s   zSDPNHead._init_weightsc                 C   s    |  |}tjj|ddd}|S )NrA   r!   )rd   p)r   r0   
functional	normalizerP   r   r   r   r@     s   
zSDPNHead.forward)FrX   r   r   )rC   rD   rE   r.   r   r@   rG   r   r   r5   r   r     s    r   c                       s(   e Zd ZdZ fddZdd Z  ZS )Combinerz1
    Combine backbone (ECAPA) and head (MLP)
    c                    s   t t|   || _|| _d S rO   )r-   r   r.   backbonehead)r2   r   r   r5   r   r   r.   #  s   
zCombiner.__init__c                 C   s   |  |}| |}||fS rO   )r   r   )r2   r=   outputr   r   r   r@   (  s   

zCombiner.forward)rC   rD   rE   r   r.   r@   rG   r   r   r5   r   r     s    r   )module_namec                       sH   e Zd ZdZdeeef f fddZdd Zdd Z	dd
dZ
  ZS )SpeakerVerificationSDPNz
    Self-Distillation Prototypes Network (SDPN) effectively facilitates
    self-supervised speaker representation learning. The specific structure can be
    referred to in https://arxiv.org/pdf/2308.02774.
    model_configc                    s   t  j||g|R i | || _|| _| jd dkrtdd| _g d}t| j|d| _t| jt	dd| _|d	 }| 
| | j  d S )
Nchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r   r   r   r   i   )r   r   Tpretrained_model)r-   r.   r   other_configr<   feature_dimr   embedding_modelr   r   *_SpeakerVerificationSDPN__load_check_pointeval)r2   	model_dirr   argskwargschannels_configpretrained_model_namer5   r   r   r.   6  s$   
z SpeakerVerificationSDPN.__init__c                 C   s>   t |jdkr|jd dksJ d| |}| j|}|S )Nr!   r   r	   zFmodelscope error: the shape of input audio to model needs to be [1, T])r   r   )_SpeakerVerificationSDPN__extract_featurer   r   )r2   audiofeature	embeddingr   r   r   r@   M  s   
zSpeakerVerificationSDPN.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)num_mel_binsr   Trv   )Kaldifbankr   ry   r   )r2   r   r   r   r   r   __extract_featureV  s   
z)SpeakerVerificationSDPN.__extract_featureNc                 C   sR   |st d}t jtj| j||d}dd |d  D }| jj	|dd d S )Nr   )map_locationc                 S   s   i | ]\}}| d d|qS )zmodule. )replace)rY   kvr   r   r   
<dictcomp>b  s    z>SpeakerVerificationSDPN.__load_check_point.<locals>.<dictcomp>teacherT)strict)
r   r
   loadospathjoinr   itemsr   load_state_dict)r2   r   r
   
state_dictstate_dict_tear   r   r   __load_check_point\  s   

z*SpeakerVerificationSDPN.__load_check_pointrO   )rC   rD   rE   r   r   strr   r.   r@   r   r   rG   r   r   r5   r   r   .  s    	r   )NNN)r   r   r   r   ),r   r"   r  typingr   r   r   r   torch.nnr0   torch.nn.functionalr   r:   torchaudio.compliance.kaldi
compliancekaldir   modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr   r   rF   r'   Moduler(   rH   rR   rV   rl   r~   r   r   r   r   r   r   register_modulespeaker_verificationsdpn_svr   r   r   r   r   <module>   s6   
C'=8o
$&