o
    ߥi/                     @   sH  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" G d	d
 d
ej#Z$dddZ%dddZ&G dd dej'Z(G dd dej'Z)G dd dej'Z*ej+e j,ej-dG dd deZ.dS )a   Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    ERes2Net incorporates both local and global feature fusion techniques to improve the performance. The local feature
    fusion (LFF) fuses the features within one single residual block to extract the local signal.
    The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
    N)AnyDictUnion)Models)MODELS
TorchModel)AFF)Tasks)create_devicec                       s&   e Zd Zd fdd	Zdd Z  ZS )ReLUFc                    s   t t| dd| d S )Nr      )superr   __init__)selfinplace	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/ERes2Net.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr    z ())r   r   __name__)r   inplace_strr   r   r   __repr__   s   
zReLU.__repr__)F)r   
__module____qualname__r   r   __classcell__r   r   r   r   r      s    r      c                 C      t j| |d|dddS )z1x1 convolution without paddingr   r   Fkernel_sizestridepaddingbiasnnConv2d	in_planes
out_planesr!   r   r   r   conv1x1$      r*   c                 C   r   )z3x3 convolution with padding   r   Fr   r$   r'   r   r   r   conv3x3/   r+   r-   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )	BasicBlockERes2Net   r       c           
   	      s6  t t|   tt||d  }t||| || _t	|| | _
|| _g }g }t| jD ]}	|t|| |t	| q/t|| _t|| _tdd| _t|| || j | _t	|| j | _t | _|dksx|| j| krttj|| j| d|ddt	| j| | _|| _|| _|| _d S )N      P@Tr   r   Fr    r!   r#   )r   r/   r   intmathfloorr*   conv1r%   BatchNorm2dbn1numsrangeappendr-   
ModuleListconvsbnsr   relu	expansionconv3bn3
Sequentialshortcutr&   r!   widthscale)
r   r(   planesr!   	baseWidthrH   rG   r?   r@   ir   r   r   r   =   s<   

zBasicBlockERes2Net.__init__c                 C   s   |}|  |}| |}| |}t|| jd}t| jD ]1}|dkr)|| }n|||  }| j| |}| | j	| |}|dkrG|}qt
||fd}q| |}| |}| |}||7 }| |}|S Nr   r   )r8   r:   rA   torchsplitrG   r<   r;   r?   r@   catrC   rD   rF   r   xresidualoutspxrK   spr   r   r   forward\   s(   







zBasicBlockERes2Net.forwardr   r1   r0   r   r   r   rB   r   rV   r   r   r   r   r   r/   :   s    r/   c                       r.   )	BasicBlockERes2Net_AFFr0   r   r1   c              	      sj  t t|   tt||d  }t||| || _t	|| | _
|| _g }g }g }	t| jD ]}
|t|| |	t	| q1t| jd D ]
}|t|d qKt|| _t|	| _t|| _tdd| _t|| || j | _t	|| j | _t | _|dks|| j| krttj|| j| d|ddt	| j| | _|| _|| _|| _d S )Nr2   r   channelsTr3   Fr4   )r   rY   r   r5   r6   r7   r*   r8   r%   r9   r:   r;   r<   r=   r-   r   r>   r?   r@   fuse_modelsr   rA   rB   rC   rD   rE   rF   r&   r!   rG   rH   )r   r(   rI   r!   rJ   rH   rG   r?   r\   r@   rK   jr   r   r   r   |   sD   

zBasicBlockERes2Net_AFF.__init__c                 C   s   |}|  |}| |}| |}t|| jd}t| jD ]7}|dkr)|| }n| j|d  ||| }| j	| |}| | j
| |}|dkrM|}qt||fd}q| |}| |}| |}||7 }| |}|S rL   )r8   r:   rA   rM   rN   rG   r<   r;   r\   r?   r@   rO   rC   rD   rF   rP   r   r   r   rV      s(   







zBasicBlockERes2Net_AFF.forwardrW   rX   r   r   r   r   rY   y   s    $rY   c                       sB   e Zd Zeeg ddddddf fdd	Zd	d
 Zdd Z  ZS )ERes2Net)r,         r,   r1   P      TSTPFc	           	         s  t t|   || _|| _|| _t|d | d | _|| _t	j
d|ddddd| _t	|| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _t	j
|d |d	 ddddd| _t	j
|d	 |d ddddd
| _t	j
|d |d ddddd
| _t|d	 d| _t|d d| _t|d d| _|dks|dkrdnd| _tt|| j|j d| _t	| j|j | j || _ | jrt	j!|dd| _"t	||| _#d S t	$ | _"t	$ | _#d S )N   r   r,   Fr   r   )r!   r0   r_   )r    r"   r!   r#      rZ   TAPTSDP)in_dim)affine)%r   r^   r   r(   feat_dim	embed_dimr5   	stats_dimtwo_emb_layerr%   r&   r8   r9   r:   _make_layerlayer1layer2layer3layer4layer1_downsamplelayer2_downsamplelayer3_downsampler   fuse_mode12fuse_mode123fuse_mode1234n_statsgetattrpooling_layersrB   poolLinearseg_1BatchNorm1dseg_bn_1seg_2Identity)	r   block
block_fuse
num_blocks
m_channelsrj   rk   pooling_funcrm   r   r   r   r      sz   		

zERes2Net.__init__c                 C   sL   |gdg|d   }g }|D ]}| || j|| ||j | _qtj| S )Nr   )r=   r(   rB   r%   rE   )r   r   rI   r   r!   strideslayersr   r   r   rn     s   
zERes2Net._make_layerc                 C   s   | ddd}|d}t| | |}| |}| |}| |}| 	||}| 
|}| |}| ||}	| |}
| |	}| |
|}| |}| |}| jrjt|}| |}| |}|S |S )Nr   r0   r   )permute
unsqueeze_FrA   r:   r8   ro   rp   rs   rv   rq   rt   rw   rr   ru   rx   r|   r~   rm   r   r   )r   rQ   rS   out1out2out1_downsample
fuse_out12out3fuse_out12_downsamplefuse_out123out4fuse_out123_downsamplefuse_out1234statsembed_aembed_br   r   r   rV     s*   












zERes2Net.forward)	r   r   r   r/   rY   r   rn   rV   r   r   r   r   r   r^      s    Dr^   )module_namec                       sH   e Zd ZdZdeeef f fddZdd Zdd Z	dd
dZ
  ZS )SpeakerVerificationERes2Neta  Enhanced Res2Net architecture with local and global feature fusion. ERes2Net is mainly composed
    of LFF and GFF. The LFF extracts localization-preserved speaker features and strengthen the local information
    interaction. GFF fuses multi-scale feature maps in bottom-up pathway to obtain global information.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _|| _d| _t| jd | _t	| j| jd| _
|d }| | | j
| j | j
  d S )Nrk   r[   ra   device)rk   r   pretrained_model)r   r   r   rk   r   other_configfeature_dimr
   r   r^   embedding_model._SpeakerVerificationERes2Net__load_check_pointtoeval)r   	model_dirr   argskwargspretrained_model_namer   r   r   r   6  s   
z$SpeakerVerificationERes2Net.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r0   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarrayrM   
from_numpylenshape	unsqueeze-_SpeakerVerificationERes2Net__extract_featurer   r   r   detachcpu)r   audiofeature	embeddingr   r   r   rV   I  s   


z#SpeakerVerificationERes2Net.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)num_mel_binsr   T)dimkeepdim)Kaldifbankr   meanr   )r   r   r   r   r   r   __extract_featureW  s   
z-SpeakerVerificationERes2Net.__extract_featureNc                 C   s8   |st d}| jjt jtj| j||ddd d S )Nr   )map_locationT)strict)	rM   r   r   load_state_dictloadospathjoinr   )r   r   r   r   r   r   __load_check_point]  s   

z.SpeakerVerificationERes2Net.__load_check_point)N)r   r   r   __doc__r   strr   r   rV   r   r   r   r   r   r   r   r   +  s    r   )r   )/r   r6   r   typingr   r   r   numpyr   rM   torch.nnr%   torch.nn.functional
functionalr   torchaudio.compliance.kaldi
compliancekaldir   )modelscope.models.audio.sv.pooling_layersmodelsr   svr{   modelscope.metainfor   modelscope.modelsr   r   !modelscope.models.audio.sv.fusionr   modelscope.utils.constantr	   modelscope.utils.devicer
   Hardtanhr   r*   r-   Moduler/   rY   r^   register_modulespeaker_verificationeres2net_svr   r   r   r   r   <module>   s2   

?Em