o
    ߥi*                      @   s  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  G dd	 d	ej!Z"G d
d dej#Z$G dd dej#Z%ej&ej'ej(dG dd deZ)dS )a   Res2Net implementation is adapted from https://github.com/Res2Net/Res2Net-PretrainedModels.
    Res2Net is an advanced neural network architecture that enhances the capabilities of standard ResNets
    by incorporating hierarchical residual-like connections. This innovative structure improves
    performance across various computer vision tasks, such as image classification and object
    detection, without significant computational overhead.
    Reference: https://arxiv.org/pdf/1904.01169.pdf
    Some modifications from the original architecture:
    1. Smaller kernel size for the input layer
    2. Smaller expansion in BasicBlockRes2Net
    N)AnyDictUnion)Models)MODELS
TorchModel)Tasks)create_devicec                       s&   e Zd Zd fdd	Zdd Z  ZS )ReLUFc                    s   t t| dd| d S )Nr      )superr
   __init__)selfinplace	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/Res2Net.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr    z ())r   r   __name__)r   inplace_strr   r   r   __repr__"   s   
zReLU.__repr__)F)r   
__module____qualname__r   r   __classcell__r   r   r   r   r
      s    r
   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	BasicBlockRes2Net          c           
   
      sT  t t|   tt||d  }tj||| d|dd| _t	|| | _
|d | _g }g }t| jD ]}	|tj||dddd |t	| q5t|| _t|| _tdd| _tj|| || j ddd	| _t	|| j | _t | _|dks|| j| krttj|| j| d|ddt	| j| | _|| _|| _|| _d S )
Ng      P@r   F)kernel_sizestridebias   )r    paddingr"   T)r   )r    r"   )r   r   r   intmathfloornnConv2dconv1BatchNorm2dbn1numsrangeappend
ModuleListconvsbnsr
   relu	expansionconv3bn3
Sequentialshortcutr!   widthscale)
r   	in_planesplanesr!   	baseWidthr:   r9   r1   r2   ir   r   r   r   +   sH   


zBasicBlockRes2Net.__init__c                 C   s   |}|  |}| |}| |}t|| jd}t| jD ]1}|dkr)|| }n|||  }| j| |}| | j	| |}|dkrG|}qt
||fd}qt
||| j fd}| |}| |}| |}||7 }| |}|S )Nr   r   )r*   r,   r3   torchsplitr9   r.   r-   r1   r2   catr5   r6   r8   )r   xresidualoutspxr>   spr   r   r   forwardL   s*   







zBasicBlockRes2Net.forward)r   r   r   )r   r   r   r4   r   rG   r   r   r   r   r   r   (   s    !r   c                       s@   e Zd Zeg ddddddf fdd	Zd	d
 Zdd Z  ZS )Res2Net)r#         r#   r   P      TSTPFc                    s\  t t|   || _|| _|| _t|d | d | _|| _t	j
d|ddddd| _t	|| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _|d
ksn|dkrpdnd| _tt|| j|j d| _t	| j|j | j || _| jrt	j|dd| _t	||| _d S t	 | _t	 | _d S )N   r   r#   F)r    r!   r$   r"   r   )r!   r   rI   TAPTSDP)in_dim)affine)r   rH   r   r;   feat_dimembedding_sizer%   	stats_dimtwo_emb_layerr(   r)   r*   r+   r,   _make_layerlayer1layer2layer3layer4n_statsgetattrpooling_layersr4   poolLinearseg_1BatchNorm1dseg_bn_1seg_2Identity)r   block
num_blocks
m_channelsrS   rT   pooling_funcrV   r   r   r   r   m   sD   

zRes2Net.__init__c                 C   sL   |gdg|d   }g }|D ]}| || j|| ||j | _qtj| S )Nr   )r/   r;   r4   r(   r7   )r   rf   r<   rg   r!   strideslayersr   r   r   rW      s   
zRes2Net._make_layerc                 C   s   | ddd}|d}t| | |}| |}| |}| |}| 	|}| 
|}| |}| jrIt|}| |}| |}|S |S )Nr   r   r   )permute
unsqueeze_Fr3   r,   r*   rX   rY   rZ   r[   r_   ra   rV   rc   rd   )r   rB   rD   statsembed_aembed_br   r   r   rG      s   









zRes2Net.forward)r   r   r   r   r   rW   rG   r   r   r   r   r   rH   k   s    (rH   )module_namec                       sH   e Zd ZdZdeeef f fddZdd Zdd Z	dd
dZ
  ZS )SpeakerVerificationResNetzW
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _|| _d| _t| jd | _t	| j| jd| _
|d }| | | j
| j | j
  d S )N	embed_dimchannelsrK   device)rT   rh   pretrained_model)r   r   rt   ru   rh   other_configfeature_dimr	   rw   rH   embedding_model,_SpeakerVerificationResNet__load_check_pointtoeval)r   	model_dirrt   argskwargspretrained_model_namer   r   r   r      s   
z"SpeakerVerificationResNet.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarrayr?   
from_numpylenshape	unsqueeze+_SpeakerVerificationResNet__extract_featurer{   r}   rw   detachcpu)r   audiofeature	embeddingr   r   r   rG      s   


z!SpeakerVerificationResNet.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)num_mel_binsr   T)dimkeepdim)Kaldifbankrz   meanr   )r   r   r   r   r   r   __extract_feature   s   
z+SpeakerVerificationResNet.__extract_featureNc                 C   s8   |st d}| jjt jtj| j||ddd d S )Nr   )map_locationT)strict)	r?   rw   r{   load_state_dictloadospathjoinr   )r   r   rw   r   r   r   __load_check_point   s   

z,SpeakerVerificationResNet.__load_check_point)N)r   r   r   __doc__r   strr   r   rG   r   r|   r   r   r   r   r   rs      s    rs   )*r   r&   r   typingr   r   r   numpyr   r?   torch.nnr(   torch.nn.functional
functionalrn   torchaudio.compliance.kaldi
compliancekaldir   )modelscope.models.audio.sv.pooling_layersmodelsr   svr^   modelscope.metainfor   modelscope.modelsr   r   modelscope.utils.constantr   modelscope.utils.devicer	   Hardtanhr
   Moduler   rH   register_modulespeaker_verification
res2net_svrs   r   r   r   r   <module>   s*   
CH