o
    ߥiA                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	 ddl
m	  mZ ddlm  mZ ddlmZ ddlmZmZ ddlmZ d&ddZd	ed
ededefddZG dd de	jZG dd de	jZG dd de	jZG dd dej	jZG dd de	jZ G dd de	jZ!G dd de	jZ"G dd de	jZ#G dd  d e	jZ$G d!d" d"e	jZ%ej&ej'ej(d#G d$d% d%eZ)dS )'z This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
    RDINOHead implementation is adapted from DINO framework.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasksc                 C   s   t | jdks	J |d u r|    }tj|| j| jd	t | || 
dk }|d u r1| j}|d u r8| j}tj|||d}|S )N   )devicedtype)r   r
   )lenshapemaxlongitemtorcharanger
   r   expand	unsqueeze	as_tensor)lengthmax_lenr   r
   mask r   T/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/rdino.pylength_to_mask   s"   
r   L_instridekernel_sizedilationc                 C   s   |dkr%t | ||  | d }||d  ||  }|d |d g}|S | ||d   d | d }| | d | | d g}|S )Nr	      )mathceil)r   r   r   r   n_stepsL_outpaddingr   r   r   get_padding_elem&   s   r&   c                       sH   e Zd Z						d fdd	Zdd Zd	ed
edefddZ  ZS )Conv1dr	   sameTreflectc
           
   
      sN   t    || _|| _|| _|| _|	| _tj||| j| j| jd||d| _	d S )Nr   )r   r   r%   groupsbias)
super__init__r   r   r   r%   padding_modennr'   conv)
selfout_channelsr   in_channelsr   r   r%   r*   r+   r.   	__class__r   r   r-   5   s    
zConv1d.__init__c                 C   sv   | j dkr| || j| j| j}n#| j dkr'| jd | j }t||df}n| j dkr-ntd| j  | |}|S )Nr(   causalr	   r   validz1Padding must be 'same', 'valid' or 'causal'. Got )	r%   _manage_paddingr   r   r   Fpad
ValueErrorr0   )r1   xnum_padwxr   r   r   forwardS   s    



zConv1d.forwardr   r   r   c                 C   s.   |j d }t||||}tj||| jd}|S )N)mode)r   r&   r9   r:   r.   )r1   r<   r   r   r   r   r%   r   r   r   r8   h   s   
zConv1d._manage_padding)r	   r	   r(   r	   Tr)   )__name__
__module____qualname__r-   r?   intr8   __classcell__r   r   r4   r   r'   3   s     r'   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )BatchNorm1dh㈵>皙?c                    s    t    tj|||d| _d S )N)epsmomentum)r,   r-   r/   rG   norm)r1   
input_sizerJ   rK   r4   r   r   r-   x   s   
zBatchNorm1d.__init__c                 C   s
   |  |S N)rL   r1   r<   r   r   r   r?      s   
zBatchNorm1d.forward)rH   rI   rB   rC   rD   r-   r?   rF   r   r   r4   r   rG   v   s
    rG   c                       s,   e Zd Zejdf fdd	Zdd Z  ZS )	TDNNBlockr	   c                    s:   t t|   t|||||d| _| | _t|d| _d S )N)r3   r2   r   r   r*   rM   )r,   rQ   r-   r'   r0   
activationrG   rL   )r1   r3   r2   r   r   rS   r*   r4   r   r   r-      s   	zTDNNBlock.__init__c                 C   s   |  | | |S rN   )rL   rS   r0   rO   r   r   r   r?      s   zTDNNBlock.forward)rB   rC   rD   r/   ReLUr-   r?   rF   r   r   r4   r   rQ      s
    rQ   c                       s,   e Zd Z			d fdd	Zdd Z  ZS )	Res2NetBlock      r	   c                    sp   t t|   || dksJ || dksJ || || t fddt|d D | _|| _d S )Nr   c                    s   g | ]
}t  d qS ))r   r   )rQ   ).0ir   hidden_channel
in_channelr   r   r   
<listcomp>   s    z)Res2NetBlock.__init__.<locals>.<listcomp>r	   )r,   rU   r-   r/   
ModuleListrangeblocksscale)r1   r3   r2   ra   r   r   r4   rZ   r   r-      s   


zRes2NetBlock.__init__c                 C   s   g }t tj|| jddD ])\}}|dkr|}n|dkr&| j|d  |}n| j|d  || }|| qtj|dd}|S )Nr	   dimr   )	enumerater   chunkra   r`   appendcat)r1   r<   yrY   x_iy_ir   r   r   r?      s   zRes2NetBlock.forward)rV   rW   r	   rP   r   r   r4   r   rU      s    rU   c                       s&   e Zd Z fddZdddZ  ZS )SEBlockc                    sN   t t|   t||dd| _tjjdd| _t||dd| _	tj
 | _d S )Nr	   r3   r2   r   T)inplace)r,   rk   r-   r'   conv1r   r/   rT   reluconv2Sigmoidsigmoid)r1   r3   se_channelsr2   r4   r   r   r-      s   zSEBlock.__init__Nc                 C   s   |j d }|d ur+t|| ||jd}|d}|jddd}|| jddd| }n|jddd}| | |}| | 	|}|| S )Nr@   r   r
   r	   r    Trc   keepdim)
r   r   r
   r   summeanro   rn   rr   rp   )r1   r<   lengthsLr   totalsr   r   r   r?      s   

zSEBlock.forwardrN   rP   r   r   r4   r   rk      s    
rk   c                       s(   e Zd Zd fdd	Zd	ddZ  ZS )
AttentiveStatisticsPooling   Tc                    s^   t    d| _|| _|rt|d |dd| _nt||dd| _t | _t	||dd| _
d S )Ng-q=rW   r	   rl   )r,   r-   rJ   global_contextrQ   tdnnr/   Tanhtanhr'   r0   )r1   channelsattention_channelsr   r4   r   r   r-      s   

z#AttentiveStatisticsPooling.__init__Nc                 C   s(  |j d }d| jfdd}|d u rtj|j d |jd}t|| ||jd}|d}| jr_|jdd	d
	 }|||| \}}|d
dd|}|d
dd|}tj|||gdd}	n|}	| | | |	}	|	|dkt	d}	tj|	dd}	|||	\}}tj||fdd}
|
d}
|
S )Nr@   r    c                 S   s@   ||   |}t|| || d  ||}||fS )Nr    )rw   r   sqrtr   powclamp)r<   mrc   rJ   rx   stdr   r   r   _compute_statistics   s
   "z?AttentiveStatisticsPooling.forward.<locals>._compute_statisticsr   )r
   rt   r	   Tru   rb   z-inf)r   rJ   r   onesr
   r   r   r   rw   floatrepeatrg   r0   r   r   masked_fillr9   softmax)r1   r<   ry   rz   r   r   r{   rx   r   attnpooled_statsr   r   r   r?      s(   


z"AttentiveStatisticsPooling.forward)r~   TrN   rP   r   r   r4   r   r}      s    r}   c                       s8   e Zd Zddddejjdf fdd	Zd	ddZ  ZS )
SERes2NetBlockrV   r~   r	   c	           	         s   t    || _t||dd||d| _t|||||| _t||dd||d| _t|||| _	d | _
||kr?t||dd| _
d S d S )Nr	   )r   r   rS   r*   rl   )r,   r-   r2   rQ   tdnn1rU   res2net_blocktdnn2rk   se_blockshortcutr'   )	r1   r3   r2   res2net_scalers   r   r   rS   r*   r4   r   r   r-   %  s<   
zSERes2NetBlock.__init__Nc                 C   sF   |}| j r
|  |}| |}| |}| |}| ||}|| S rN   )r   r   r   r   r   )r1   r<   ry   residualr   r   r   r?   N  s   



zSERes2NetBlock.forwardrN   )	rB   rC   rD   r   r/   rT   r-   r?   rF   r   r   r4   r   r   #  s    )r   c                       sV   e Zd ZdZddejjg dg dg ddddd	g d
f fdd	ZdddZ  Z	S )
ECAPA_TDNNzAn implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    cpu   )r   r   r   r   i   )   rW   rW   rW   r	   )r	   r    rW      r	   r~   rV   T)r	   r	   r	   r	   r	   c                    s*  t    t|t|ksJ t|t|ksJ || _t | _| jt||d |d |d ||d  t	dt|d D ]}| jt
||d  || |	|
|| || ||| d q?t|d |d |d |d ||d d| _t|d ||d| _t|d d d| _t|d d |dd	| _d S )
Nr   r	   )r   rs   r   r   rS   r*   r@   )r*   )r   r   r    rR   rl   )r,   r-   r   r   r/   r^   r`   rf   rQ   r_   r   mfar}   asprG   asp_bnr'   fc)r1   rM   r
   lin_neuronsrS   r   kernel_sizes	dilationsr   r   rs   r   r*   rY   r4   r   r   r-   a  s^   




zECAPA_TDNN.__init__Nc              	   C   s   | dd}g }| jD ]}z|||d}W n ty"   ||}Y nw || qtj|dd dd}| |}| j||d}| |}| 	|}| dd
d}|S )zReturns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        r	   r    )ry   Nrb   )	transposer`   	TypeErrorrf   r   rg   r   r   r   r   squeeze)r1   r<   ry   xllayerr   r   r   r?     s    



zECAPA_TDNN.forwardrN   )
rB   rC   rD   __doc__r   r/   rT   r-   r?   rF   r   r   r4   r   r   [  s    Hr   c                       s:   e Zd Z						d fdd	Zd	d
 Zdd Z  ZS )	RDINOHeadFTrW             c	                    s  t    t|d}|dkrt||| _nJt||g}	|r'|	t| |	t  t	|d D ]}
|	t|| |rI|	t| |	t  q4|	t|| tj
|	 | _t||| _| | j tjtj||dd| _| jjjd |rd| jj_d S d S )Nr	   r    F)r+   )r,   r-   r   r/   Linearmlprf   rG   GELUr_   
Sequential	add_layerapply_init_weightsutilsweight_norm
last_layerweight_gdatafill_requires_grad)r1   in_dimout_dimuse_bnnorm_last_layernlayers
hidden_dimbottleneck_dimadd_dimlayers_r4   r   r   r-     s0   
	
zRDINOHead.__init__c                 C   sV   t |tjr%tjjj|jdd t |tjr'|jd ur)tj|jd d S d S d S d S )Ng{Gz?)r   r   )	
isinstancer/   r   r   inittrunc_normal_weightr+   	constant_)r1   r   r   r   r   r     s   zRDINOHead._init_weightsc                 C   s8   |  |}| |}tjj|ddd}| |}||fS )Nr@   r    )rc   p)r   r   r/   
functional	normalizer   )r1   r<   vicr_outr   r   r   r?     s
   


zRDINOHead.forward)FTrW   r   r   r   )rB   rC   rD   r-   r   r?   rF   r   r   r4   r   r     s    "r   c                       s$   e Zd Z fddZdd Z  ZS )Combinec                    s   t t|   || _|| _d S rN   )r,   r   r-   backbonehead)r1   r   r   r4   r   r   r-     s   
zCombine.__init__c                 C   s   |  |}| |}|S rN   )r   r   )r1   r<   outputr   r   r   r?     s   

zCombine.forwardrP   r   r   r4   r   r     s    r   )module_namec                       sD   e Zd Zdeeef f fddZdd Zdd Zdd	d
Z	  Z
S )SpeakerVerification_RDINOmodel_configc                    s   t  j||g|R i | || _|| _| jd dkrtdd| _g d}t| j|d| _t| jt	ddd	| _|d
 }| 
| | j  d S )Nchannel   zFmodelscope error: Currently only 1024-channel ecapa tdnn is supported.P   )r   r   r   r   i   )r   r   i   Tpretrained_model)r,   r-   r   other_configr;   feature_dimr   embedding_modelr   r   ,_SpeakerVerification_RDINO__load_check_pointeval)r1   	model_dirr   argskwargschannels_configpretrained_model_namer4   r   r   r-     s$   

z"SpeakerVerification_RDINO.__init__c                 C   s>   t |jdkr|jd dksJ d| |}| j|}|S )Nr    r   r	   zFmodelscope error: the shape of input audio to model needs to be [1, T])r   r   +_SpeakerVerification_RDINO__extract_featurer   r   )r1   audiofeature	embeddingr   r   r   r?   $  s   
z!SpeakerVerification_RDINO.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)num_mel_binsr   Tru   )Kaldifbankr   rx   r   )r1   r   r   r   r   r   __extract_feature-  s   
z+SpeakerVerification_RDINO.__extract_featureNc                 C   sR   |st d}t jtj| j||d}dd |d  D }| jj	|dd d S )Nr   )map_locationc                 S   s   i | ]\}}| d d|qS )zmodule. )replace)rX   kvr   r   r   
<dictcomp>9  s    z@SpeakerVerification_RDINO.__load_check_point.<locals>.<dictcomp>teacherT)strict)
r   r
   loadospathjoinr   itemsr   load_state_dict)r1   r   r
   
state_dictstate_dict_tear   r   r   __load_check_point3  s   

z,SpeakerVerification_RDINO.__load_check_pointrN   )rB   rC   rD   r   strr   r-   r?   r   r   rF   r   r   r4   r   r   	  s
    	r   )NNN)*r   r!   r   typingr   r   r   r   torch.nnr/   torch.nn.functionalr   r9   torchaudio.compliance.kaldi
compliancekaldir   modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr   r   rE   r&   Moduler'   rG   rQ   rU   rk   r}   r   r   r   r   register_modulespeaker_verificationrdino_tdnn_svr   r   r   r   r   <module>   s6   
C'=8o2