o
    ߥi                     @   s
  d dl Z d dlmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ d dlm  mZ d dlm  mZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZ d dl m!Z! d dl"m#Z# G d	d
 d
ej$Z%G dd dej$Z&ej'e!j(ej)dG dd deZ*dS )    N)OrderedDict)AnyDictUnion)Models)MODELS
TorchModel)BasicResBlockCAMDenseTDNNBlock
DenseLayer	StatsPool	TDNNLayerTransitLayerget_nonlinear)Tasks)create_devicec                       s:   e Zd Zeddgddf fdd	Zdd Zdd	 Z  ZS )
FCM       P   c                    s   t t|   || _tjd|ddddd| _t|| _| j	|||d dd| _
| j	|||d dd| _tj||ddddd| _t|| _||d	  | _d S )
N      F)kernel_sizestridepaddingbiasr   r   )r   )r   r      )superr   __init__	in_planesnnConv2dconv1BatchNorm2dbn1_make_layerlayer1layer2conv2bn2out_channels)selfblock
num_blocks
m_channelsfeat_dim	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/DTDNN.pyr      s,   zFCM.__init__c                 C   sL   |gdg|d   }g }|D ]}| || j|| ||j | _qtj| S )Nr   )appendr   	expansionr    
Sequential)r+   r,   planesr-   r   strideslayersr2   r2   r3   r%   4   s   
zFCM._make_layerc                 C   sv   | d}t| | |}| |}| |}t| | |}|j	}|
|d |d |d  |d }|S )Nr   r   r   r   )	unsqueezeFrelur$   r"   r&   r'   r)   r(   shapereshape)r+   xoutr=   r2   r2   r3   forward<   s   


"zFCM.forward)__name__
__module____qualname__r	   r   r%   rA   __classcell__r2   r2   r0   r3   r      s    r   c                       s6   e Zd Z								d fd	d
	Zdd Z  ZS )CAMPPlusr      r         batchnorm-reluTsegmentc	                    s  t t|   t|d| _| jj}	|| _tt	dt
|	|dddd|dfg| _|}	ttdd	d
D ]=\}
\}}}t||	||| ||||d}| jd|
d  | |	||  }	| jd|
d  t|	|	d d|d |	d }	q2| jdt||	 | jdkr| jdt  | jdt|	d |dd n	| jdksJ d|  D ]}t|tjtjfrtj|jj |jd urtj|j qd S )N)r/   tdnn   r   r   )r   dilationr   
config_str)         )r   r   r   )r   r   r   )
num_layersin_channelsr*   bn_channelsr   rO   rP   memory_efficientzblock%dz	transit%dF)r   rP   out_nonlinearrK   statsdense
batchnorm_)rP   framez6`output_level` should be set to 'segment' or 'frame'. )r   rF   r   r   headr*   output_levelr    r6   r   r   xvector	enumeratezipr
   
add_moduler   r   r   r   modules
isinstanceConv1dLinearinitkaiming_normal_weightdatar   zeros_)r+   r/   embedding_sizegrowth_ratebn_sizeinit_channelsrP   rW   r^   channelsirT   r   rO   r,   mr0   r2   r3   r   J   s|   	
	





zCAMPPlus.__init__c                 C   s<   | ddd}| |}| |}| jdkr|dd}|S )Nr   r   r   r\   )permuter]   r_   r^   	transpose)r+   r?   r2   r2   r3   rA      s   


zCAMPPlus.forward)r   rG   r   rH   rI   rJ   TrK   )rB   rC   rD   r   rA   rE   r2   r2   r0   r3   rF   H   s    BrF   )module_namec                       sF   e Zd ZdZdeeef f fddZdd Zdd Z	d	d
 Z
  ZS )SpeakerVerificationCAMPPlusa
  A fast and efficient speaker embedding model, using a 2-dimensional convolution residual network as the head
    and a densely connected time delay neural network as the backbone.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _|| _| jd | _| jd | _t| jd | _t| j| j| _	|d }| 
| | j	| j | j	  d S )N	fbank_dimemb_sizedevicepretrained_model)r   r   rw   other_configfeature_dimry   r   rz   rF   embedding_model._SpeakerVerificationCAMPPlus__load_check_pointtoeval)r+   	model_dirrw   argskwargspretrained_model_namer0   r2   r3   r      s   
z$SpeakerVerificationCAMPPlus.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r   zFmodelscope error: the shape of input audio to model needs to be [N, T])rd   npndarraytorch
from_numpylenr=   r:   -_SpeakerVerificationCAMPPlus__extract_featurer~   r   rz   detachcpu)r+   audiofeature	embeddingr2   r2   r3   rA      s   


z#SpeakerVerificationCAMPPlus.forwardc                 C   sT   g }|D ]}t j|d| jd}||jddd }||d qt|}|S )Nr   )num_mel_binsT)dimkeepdim)Kaldifbankr:   r}   meanr4   r   cat)r+   r   featuresaur   r2   r2   r3   __extract_feature   s   
z-SpeakerVerificationCAMPPlus.__extract_featurec                 C   s0   | j jtjtj| j|tdddd d S )Nr   )map_locationT)strict)	r~   load_state_dictr   loadospathjoinr   rz   )r+   r   r2   r2   r3   __load_check_point   s   
z.SpeakerVerificationCAMPPlus.__load_check_point)rB   rC   rD   __doc__r   strr   r   rA   r   r   rE   r2   r2   r0   r3   rv      s    
rv   )+r   collectionsr   typingr   r   r   numpyr   r   torch.nnr    torch.nn.functional
functionalr;   torchaudio.compliance.kaldi
compliancekaldir   modelscope.metainfor   modelscope.modelsr   r   'modelscope.models.audio.sv.DTDNN_layersr	   r
   r   r   r   r   r   modelscope.utils.constantr   modelscope.utils.devicer   Moduler   rF   register_modulespeaker_verificationcampplus_svrv   r2   r2   r2   r3   <module>   s&   $0M