o
    ߥi)                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  G dd	 d	ej!Z"G d
d dej!Z#ej$ej%ej&dG dd deZ'dS )aV  
    This TDNN implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    TDNN replaces i-vectors for text-independent speaker verification with embeddings
    extracted from a feedforward deep neural network. The specific structure can be
    referred to in https://www.danielpovey.com/files/2017_interspeech_embeddings.pdf.
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                       s&   e Zd Zd fdd	Zdd Z  ZS )	TdnnLayer   r   c                    s`   t t|   || _|| _|| _|| _|| _tj	| j| j| j| j| jd| _
tj|dd| _dS )aT  Define the TDNN layer, essentially 1-D convolution

        Args:
            in_dim (int): input dimension
            out_dim (int): output channels
            context_size (int): context size, essentially the filter size
            dilation (int, optional):  Defaults to 1.
            padding (int, optional):  Defaults to 0.
        )dilationpaddingF)affineN)superr
   __init__in_dimout_dimcontext_sizer   r   nnConv1dconv_1dBatchNorm1dbn)selfr   r   r   r   r   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/xvector.pyr      s   
zTdnnLayer.__init__c                 C   s"   |  |}t|}| |}|S )N)r   Frelur   )r   xoutr   r   r   forward5   s   


zTdnnLayer.forward)r   r   __name__
__module____qualname__r   r"   __classcell__r   r   r   r   r
      s    r
   c                       s0   e Zd Z					d	 fdd	Zdd Z  ZS )
XVEC(        TSTPc                    s   t t|   || _|| _|| _t||ddd| _t||ddd| _t||ddd| _	t||ddd| _
t||ddd| _|dksE|dkrGdnd| _tt|| jd| _t| j| j || _d	S )
z
        Implementation of Kaldi style xvec, as described in
        X-VECTORS: ROBUST DNN EMBEDDINGS FOR SPEAKER RECOGNITION
           r   )r   r         TAPTSDP)r   N)r   r(   r   feat_dim	stats_dim	embed_dimr
   frame_1frame_2frame_3frame_4frame_5n_statsgetattrpooling_layerspoolr   Linearseg_1)r   r2   hid_dimr3   r4   pooling_funcr   r   r   r   >   s    
zXVEC.__init__c                 C   sX   | ddd}| |}| |}| |}| |}| |}| |}| |}|S )Nr   r/   r   )permuter5   r6   r7   r8   r9   r=   r?   )r   r    r!   statsembed_ar   r   r   r"   X   s   






zXVEC.forward)r)   r*   r+   r*   r,   r#   r   r   r   r   r(   <   s    r(   )module_namec                       sB   e Zd Zdeeef f fddZdd Zdd Zdd	 Z	  Z
S )
SpeakerVerificationTDNNmodel_configc                    s   t  j||g|R i | || _|| _d| _d| _t| jd | _t| j t	| j| jd| _
|d }| | | j
| j | j
  d S )NP   r*   device)r2   r4   pretrained_model)r   r   rG   other_configfeature_dimr4   r	   rI   printr(   embedding_model*_SpeakerVerificationTDNN__load_check_pointtoeval)r   	model_dirrG   argskwargspretrained_model_namer   r   r   r   i   s   

z SpeakerVerificationTDNN.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r/   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarraytorch
from_numpylenshape	unsqueeze)_SpeakerVerificationTDNN__extract_featurerN   rP   rI   detachcpu)r   audiofeature	embeddingr   r   r   r"   |   s   


zSpeakerVerificationTDNN.forwardc                 C   sT   g }|D ]}t j|d| jd}||jddd }||d qt|}|S )Nr   )num_mel_binsT)dimkeepdim)Kaldifbankr]   rL   meanappendrY   cat)r   ra   featuresaurb   r   r   r   __extract_feature   s   
z)SpeakerVerificationTDNN.__extract_featurec                 C   s0   | j jtjtj| j|tdddd d S )Nr`   )map_locationT)strict)	rN   load_state_dictrY   loadospathjoinrR   rI   )r   rU   r   r   r   __load_check_point   s   
z*SpeakerVerificationTDNN.__load_check_point)r$   r%   r&   r   strr   r   r"   r^   rO   r'   r   r   r   r   rF   f   s
    
rF   )(__doc__mathrs   typingr   r   r   numpyrW   rY   torch.nnr   torch.nn.functional
functionalr   torchaudio.compliance.kaldi
compliancekaldirg   )modelscope.models.audio.sv.pooling_layersmodelsra   svr<   modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr   modelscope.utils.devicer	   Moduler
   r(   register_modulespeaker_verificationtdnn_svrF   r   r   r   r   <module>   s$   #*