o
    ߥi'                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  G dd	 d	ej!Z"G d
d dej!Z#ej$ej%ej&dG dd deZ'dS )a   ResNet implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    ResNet, or Residual Neural Network, is notable for its optimization ease
    and depth-induced accuracy gains. It utilizes skip connections within its residual
    blocks to counteract the vanishing gradient problem in deep networks.
    Reference: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                       s*   e Zd ZdZd fdd	Zdd Z  ZS )
BasicBlock   c              	      s   t t|   tj||d|ddd| _t|| _tj||ddddd| _t|| _	t
 | _|dks;|| j| krUt
tj|| j| d|ddt| j| | _d S d S )N   r   Fkernel_sizestridepaddingbias)r   r   r   )superr
   __init__nnConv2dconv1BatchNorm2dbn1conv2bn2
Sequentialshortcut	expansion)self	in_planesplanesr   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/ResNet.pyr      s6   

zBasicBlock.__init__c                 C   sB   t | | |}| | |}|| |7 }t |}|S N)Frelur   r   r   r   r   )r   xoutr#   r#   r$   forward5   s
   
zBasicBlock.forward)r   )__name__
__module____qualname__r   r   r*   __classcell__r#   r#   r!   r$   r
      s    r
   c                       s@   e Zd Zeg ddddddf fdd	Zd	d
 Zdd Z  ZS )ResNet)r         r       P      TSTPTc                    s\  t t|   || _|| _|| _t|d | d | _|| _t	j
d|ddddd| _t	|| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _|d
ksn|dkrpdnd| _tt|| j|j d| _t	| j|j | j || _| jrt	j|dd| _t	||| _d S t	 | _t	 | _d S )N   r   r   Fr   r   )r      r0   TAPTSDP)in_dim)affine)r   r/   r   r   feat_dimembedding_sizeint	stats_dimtwo_emb_layerr   r   r   r   r   _make_layerlayer1layer2layer3layer4n_statsgetattrpooling_layersr   poolLinearseg_1BatchNorm1dseg_bn_1seg_2Identity)r   block
num_blocks
m_channelsr<   r=   pooling_funcr@   r!   r#   r$   r   ?   sD   

zResNet.__init__c                 C   sL   |gdg|d   }g }|D ]}| || j|| ||j | _qtj| S )Nr   )appendr   r   r   r   )r   rP   r    rQ   r   strideslayersr#   r#   r$   rA   g   s   
zResNet._make_layerc           	      C   s   | ddd}|d}t| | |}| |}| |}| |}| 	|}| 
|}| |}| jrIt|}| |}| |}|S |S )Nr   r7   r   )permute
unsqueeze_r&   r'   r   r   rB   rC   rD   rE   rI   rK   r@   rM   rN   )	r   r(   r)   out1out2out3statsembed_aembed_br#   r#   r$   r*   o   s   









zResNet.forward)r+   r,   r-   r
   r   rA   r*   r.   r#   r#   r!   r$   r/   =   s    (r/   )module_namec                       sH   e Zd ZdZdeeef f fddZdd Zdd Z	dd
dZ
  ZS )SpeakerVerificationResNetzW
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _|| _d| _t| jd | _t	| j| jd| _
|d }| | | j
| j | j
  d S )N	embed_dimchannelsr3   device)r=   rR   pretrained_model)r   r   ra   rb   rR   other_configfeature_dimr	   rd   r/   embedding_model,_SpeakerVerificationResNet__load_check_pointtoeval)r   	model_dirra   argskwargspretrained_model_namer!   r#   r$   r      s   
z"SpeakerVerificationResNet.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r7   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarraytorch
from_numpylenshape	unsqueeze+_SpeakerVerificationResNet__extract_featurerh   rj   rd   detachcpu)r   audiofeature	embeddingr#   r#   r$   r*      s   


z!SpeakerVerificationResNet.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)num_mel_binsr   T)dimkeepdim)Kaldifbankrg   meanrw   )r   r{   r|   r#   r#   r$   __extract_feature   s   
z+SpeakerVerificationResNet.__extract_featureNc                 C   s8   |st d}| jjt jtj| j||ddd d S )Nrz   )map_locationT)strict)	rs   rd   rh   load_state_dictloadospathjoinrl   )r   ro   rd   r#   r#   r$   __load_check_point   s   

z,SpeakerVerificationResNet.__load_check_pointr%   )r+   r,   r-   __doc__r   strr   r   r*   rx   ri   r.   r#   r#   r!   r$   r`      s    r`   )(r   mathr   typingr   r   r   numpyrq   rs   torch.nnr   torch.nn.functional
functionalr&   torchaudio.compliance.kaldi
compliancekaldir   )modelscope.models.audio.sv.pooling_layersmodelsr{   svrH   modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr   modelscope.utils.devicer	   Moduler
   r/   register_modulespeaker_verification	resnet_svr`   r#   r#   r#   r$   <module>   s(   #F