o
    ߥi-                     @   sH  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" G d	d
 d
ej#Z$dddZ%dddZ&G dd dej'Z(G dd dej'Z)G dd dej'Z*ej+e j,ej-dG dd deZ.dS )a7   Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    ERes2Net_aug incorporates both local and global feature fusion techniques
    to improve the performance. The training code is located on the following
    GitHub repository: https://github.com/alibaba-damo-academy/3D-Speaker.
    N)AnyDictUnion)Models)MODELS
TorchModel)AFF)Tasks)create_devicec                       s&   e Zd Zd fdd	Zdd Z  ZS )ReLUFc                    s   t t| dd| d S )Nr      )superr   __init__)selfinplace	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/ERes2Net_aug.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr    z ())r   r   __name__)r   inplace_strr   r   r   __repr__   s   
zReLU.__repr__)F)r   
__module____qualname__r   r   __classcell__r   r   r   r   r      s    r      c                 C      t j| |d|dddS )z1x1 convolution without paddingr   r   Fkernel_sizestridepaddingbiasnnConv2d	in_planes
out_planesr!   r   r   r   conv1x1$      r*   c                 C   r   )z3x3 convolution with padding   r   Fr   r$   r'   r   r   r   conv3x3/   r+   r-   c                       *   e Zd ZdZd	 fdd	Zdd Z  ZS )
BasicBlockERes2Net   r      r,   c           
   	      s6  t t|   tt||d  }t||| || _t	|| | _
|| _g }g }t| jD ]}	|t|| |t	| q/t|| _t|| _tdd| _t|| || j | _t	|| j | _t | _|dksx|| j| krttj|| j| d|ddt	| j| | _|| _|| _|| _d S )N      P@Tr   r   Fr    r!   r#   )r   r/   r   intmathfloorr*   conv1r%   BatchNorm2dbn1numsrangeappendr-   
ModuleListconvsbnsr   relu	expansionconv3bn3
Sequentialshortcutr&   r!   widthscale)
r   r(   planesr!   	baseWidthrH   rG   r?   r@   ir   r   r   r   =   s<   

zBasicBlockERes2Net.__init__c                 C   s   |}|  |}| |}| |}t|| jd}t| jD ]1}|dkr)|| }n|||  }| j| |}| | j	| |}|dkrG|}qt
||fd}q| |}| |}| |}||7 }| |}|S Nr   r   )r8   r:   rA   torchsplitrG   r<   r;   r?   r@   catrC   rD   rF   r   xresidualoutspxrK   spr   r   r   forward\   s(   







zBasicBlockERes2Net.forwardr   r1   r,   r   r   r   rB   r   rV   r   r   r   r   r   r/   :   s    r/   c                       r.   )
BasicBlockERes2Net_diff_AFFr0   r   r1   r,   c              	      sj  t t|   tt||d  }t||| || _t	|| | _
|| _g }g }g }	t| jD ]}
|t|| |	t	| q1t| jd D ]
}|t|d qKt|| _t|	| _t|| _tdd| _t|| || j | _t	|| j | _t | _|dks|| j| krttj|| j| d|ddt	| j| | _|| _|| _|| _d S )Nr2   r   channelsTr3   Fr4   )r   rY   r   r5   r6   r7   r*   r8   r%   r9   r:   r;   r<   r=   r-   r   r>   r?   r@   fuse_modelsr   rA   rB   rC   rD   rE   rF   r&   r!   rG   rH   )r   r(   rI   r!   rJ   rH   rG   r?   r\   r@   rK   jr   r   r   r   |   sD   

z$BasicBlockERes2Net_diff_AFF.__init__c                 C   s   |}|  |}| |}| |}t|| jd}t| jD ]7}|dkr)|| }n| j|d  ||| }| j	| |}| | j
| |}|dkrM|}qt||fd}q| |}| |}| |}||7 }| |}|S rL   )r8   r:   rA   rM   rN   rG   r<   r;   r\   r?   r@   rO   rC   rD   rF   rP   r   r   r   rV      s(   







z#BasicBlockERes2Net_diff_AFF.forwardrW   rX   r   r   r   r   rY   y   s    &rY   c                       sB   e Zd Zeeg ddddddf fdd	Zd	d
 Zdd Z  ZS )ERes2Net_aug)r,   r0      r,   @   P      TSTPFc	           	         s  t t|   || _|| _|| _t|d | d | _|| _t	j
d|ddddd| _t	|| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _t	j
|d	 |d ddddd
| _t	j
|d |d ddddd
| _t	j
|d |d ddddd
| _t|d d| _t|d d| _t|d d| _|dks|dkrdnd| _tt|| j|j d| _t	| j|j | j || _ | jrt	j!|dd| _"t	||| _#d S t	$ | _"t	$ | _#d S )N   r   r,   Fr   r   )r!      r0   )r    r"   r!   r#          rZ   TAPTSDP)in_dim)affine)%r   r^   r   r(   feat_dimembedding_sizer5   	stats_dimtwo_emb_layerr%   r&   r8   r9   r:   _make_layerlayer1layer2layer3layer4layer1_downsamplelayer2_downsamplelayer3_downsampler   fuse_mode12fuse_mode123fuse_mode1234n_statsgetattrpooling_layersrB   poolLinearseg_1BatchNorm1dseg_bn_1seg_2Identity)	r   block
block_fuse
num_blocks
m_channelsrl   rm   pooling_funcro   r   r   r   r      sz   	

zERes2Net_aug.__init__c                 C   sL   |gdg|d   }g }|D ]}| || j|| ||j | _qtj| S )Nr   )r=   r(   rB   r%   rE   )r   r   rI   r   r!   strideslayersr   r   r   rp     s   
zERes2Net_aug._make_layerc                 C   s   | ddd}|d}t| | |}| |}| |}| |}| 	||}| 
|}| |}| ||}	| |}
| |	}| |
|}| |}| |}| jrjt|}| |}| |}|S |S )Nr   re   r   )permute
unsqueeze_FrA   r:   r8   rq   rr   ru   rx   rs   rv   ry   rt   rw   rz   r~   r   ro   r   r   )r   rQ   rS   out1out2out1_downsample
fuse_out12out3fuse_out12_downsamplefuse_out123out4fuse_out123_downsamplefuse_out1234statsembed_aembed_br   r   r   rV     s*   












zERes2Net_aug.forward)	r   r   r   r/   rY   r   rp   rV   r   r   r   r   r   r^      s    Br^   )module_namec                       sH   e Zd ZdZdeeef f fddZdd Zdd Z	dd
dZ
  ZS )SpeakerVerificationERes2Neta)  Enhanced Res2Net_aug architecture with local and global feature fusion.
    ERes2Net_aug is an upgraded version of ERes2Net that uses a larger number of
    parameters to achieve better recognition performance.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    st   t  j||g|R i | || _|| _d| _t| jd | _t | _|d }| 	| | j
| j | j  d S )Nra   devicepretrained_model)r   r   r   other_configfeature_dimr
   r   r^   embedding_model._SpeakerVerificationERes2Net__load_check_pointtoeval)r   	model_dirr   argskwargspretrained_model_namer   r   r   r   2  s   
z$SpeakerVerificationERes2Net.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   re   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarrayrM   
from_numpylenshape	unsqueeze-_SpeakerVerificationERes2Net__extract_featurer   r   r   detachcpu)r   audiofeature	embeddingr   r   r   rV   A  s   


z#SpeakerVerificationERes2Net.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)num_mel_binsr   T)dimkeepdim)Kaldifbankr   meanr   )r   r   r   r   r   r   __extract_featureO  s   
z-SpeakerVerificationERes2Net.__extract_featureNc                 C   s8   |st d}| jjt jtj| j||ddd d S )Nr   )map_locationT)strict)	rM   r   r   load_state_dictloadospathjoinr   )r   r   r   r   r   r   __load_check_pointU  s   

z.SpeakerVerificationERes2Net.__load_check_point)N)r   r   r   __doc__r   strr   r   rV   r   r   r   r   r   r   r   r   '  s    r   )r   )/r   r6   r   typingr   r   r   numpyr   rM   torch.nnr%   torch.nn.functional
functionalr   torchaudio.compliance.kaldi
compliancekaldir   )modelscope.models.audio.sv.pooling_layersmodelsr   svr}   modelscope.metainfor   modelscope.modelsr   r   !modelscope.models.audio.sv.fusionr   modelscope.utils.constantr	   modelscope.utils.devicer
   Hardtanhr   r*   r-   Moduler/   rY   r^   register_modulespeaker_verificationeres2net_aug_svr   r   r   r   r   <module>   s2   

?Gg