o
    ߥi.                     @   s4  d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlm  mZ ddlm  m  m  mZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" G d	d
 d
ej#Z$G dd dej%Z&G dd dej%Z'G dd dej%Z(ej)e j*ej+dG dd deZ,dS )a  
    To further improve the short-duration feature extraction capability of ERes2Net,
    we expand the channel dimension within each stage. However, this modification also
    increases the number of model parameters and computational complexity.
    To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures,
    ultimately reducing both the model parameters and its computational cost.
    N)AnyDictUnion)Models)MODELS
TorchModel)AFF)Tasks)create_devicec                       s&   e Zd Zd fdd	Zdd Z  ZS )ReLUFc                    s   t t| dd| d S )Nr      )superr   __init__)selfinplace	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/ERes2NetV2.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr    z ())r   r   __name__)r   inplace_strr   r   r   __repr__!   s   
zReLU.__repr__)F)r   
__module____qualname__r   r   __classcell__r   r   r   r   r      s    r   c                       .   e Zd Z				d fdd	Zdd Z  ZS )	BasicBlockERes2NetV2         c              
      sN  t t|   tt||d  }|| _tj||| d|dd| _	t
|| | _|| _|| _g }g }	t| jD ]}
|tj||dddd |	t
| q9t|| _t|	| _tdd| _tj|| || j ddd	| _t
|| j | _t | _|dks|| j| krttj|| j| d|ddt
| j| | _d S d S )
N      P@r   Fkernel_sizestridebias   r$   paddingr&   Tr   r$   r&   )r   r   r   intmathfloorwidthnnConv2dconv1BatchNorm2dbn1nums	expansionrangeappend
ModuleListconvsbnsr   reluconv3bn3
Sequentialshortcut)r   	in_planesplanesr%   	baseWidthscaler6   r/   r:   r;   ir   r   r   r   )   sH   

zBasicBlockERes2NetV2.__init__c                 C   s   |}|  |}| |}| |}t|| jd}t| jD ]1}|dkr)|| }n|||  }| j| |}| | j	| |}|dkrG|}qt
||fd}q| |}| |}| |}||7 }| |}|S Nr   r   )r2   r4   r<   torchsplitr/   r7   r5   r:   r;   catr=   r>   r@   r   xresidualoutspxrE   spr   r   r   forwardP   s(   







zBasicBlockERes2NetV2.forwardr   r    r!   r!   r   r   r   r   rP   r   r   r   r   r   r   '   s    'r   c                       r   )	BasicBlockERes2NetV2AFFr   r    r!   c              
      s  t t|   tt||d  }|| _tj||| d|dd| _	t
|| | _|| _|| _g }g }	g }
t| jD ]}|tj||dddd |
t
| q;t| jd D ]}|	t|dd qZt|| _t|
| _t|	| _td	d
| _tj|| || j ddd| _t
|| j | _t | _|dks|| j| krttj|| j| d|ddt
| j| | _d S d S )Nr"   r   Fr#   r'   r(      channelsrTr*   r+   )r   rS   r   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r   r9   r:   r;   fuse_modelsr   r<   r=   r>   r?   r@   )r   rA   rB   r%   rC   rD   r6   r/   r:   rX   r;   rE   jr   r   r   r   o   sP   

z BasicBlockERes2NetV2AFF.__init__c                 C   s   |}|  |}| |}| |}t|| jd}t| jD ]7}|dkr)|| }n| j|d  ||| }| j	| |}| | j
| |}|dkrM|}qt||fd}q| |}| |}| |}||7 }| |}|S rF   )r2   r4   r<   rG   rH   r/   r7   r5   rX   r:   r;   rI   r=   r>   r@   rJ   r   r   r   rP      s(   







zBasicBlockERes2NetV2AFF.forwardrQ   rR   r   r   r   r   rS   m   s    ,rS   c                       sH   e Zd Zeeg dddddddddf fd	d
	Zdd Zdd Z  ZS )
ERes2NetV2)r'   rT      r'   @   P      r    r!   TSTPFc                    s  t t|   || _|| _|| _t|d | d | _|| _|| _	|| _
|	| _tjd|ddddd| _t|| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _tj|d	 | j |d | j ddddd
| _t|d | j d	d| _|
dks|
dkrdnd| _tt|
| j| j d| _t| j| j | j || _| jrtj|dd| _ t||| _!d S t" | _ t" | _!d S )N   r   r'   F)r$   r%   r)   r&   r   )r%   r!   rT   )r$   r)   r%   r&   rU   TAPTSDP)in_dim)affine)#r   rZ   r   rA   feat_dim	embed_dimr,   	stats_dimtwo_emb_layerrC   rD   r6   r0   r1   r2   r3   r4   _make_layerlayer1layer2layer3layer4	layer3_dsr   fuse34n_statsgetattrpooling_layerspoolLinearseg_1BatchNorm1dseg_bn_1seg_2Identity)r   block
block_fuse
num_blocks
m_channelsre   rf   rC   rD   r6   pooling_funcrh   r   r   r   r      s\   	

zERes2NetV2.__init__c                 C   sZ   |gdg|d   }g }|D ]}| || j||| j| j| jd || j | _qtj| S )Nr   )rC   rD   r6   )r8   rA   rC   rD   r6   r0   r?   )r   rz   rB   r|   r%   strideslayersr   r   r   ri      s   
zERes2NetV2._make_layerc                 C   s   | ddd}|d}t| | |}| |}| |}| |}| 	|}| 
|}| ||}| |}	| |	}
| jrTt|
}| |}| |}|S |
S )Nr   r!   r   )permute
unsqueeze_Fr<   r4   r2   rj   rk   rl   rm   rn   ro   rs   ru   rh   rw   rx   )r   rK   rM   out1out2out3out4out3_ds
fuse_out34statsembed_aembed_br   r   r   rP     s"   










zERes2NetV2.forward)	r   r   r   r   rS   r   ri   rP   r   r   r   r   r   rZ      s    :rZ   )module_namec                       sH   e Zd ZdZdeeef f fddZdd Zdd Z	dd
dZ
  ZS )SpeakerVerificationERes2NetV2a  ERes2NetV2 architecture with local and global feature fusion. ERes2NetV2 is mainly composed
    of Bottom-up Dual-stage Feature Fusion (BDFF) and Bottleneck-like Local Feature Fusion (BLFF).
    BDFF fuses multi-scale feature maps in bottom-up pathway to obtain global information.
    The BLFF extracts localization-preserved speaker features and strengthen the local information interaction.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _| jd | _| jd | _|| _d| _t	| jd | _
t| j| j| j| jd| _|d }| | | j| j
 | j  d S )	Nrf   rC   rD   r6   r]   device)rf   rC   rD   r6   pretrained_model)r   r   r   rf   rC   rD   r6   other_configfeature_dimr
   r   rZ   embedding_model0_SpeakerVerificationERes2NetV2__load_check_pointtoeval)r   	model_dirr   argskwargspretrained_model_namer   r   r   r   &  s&   
z&SpeakerVerificationERes2NetV2.__init__c                 C   sl   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}|  S )Nr   r   r!   zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarrayrG   
from_numpylenshape	unsqueeze/_SpeakerVerificationERes2NetV2__extract_featurer   r   r   detachcpu)r   audiofeature	embeddingr   r   r   rP   >  s   


z%SpeakerVerificationERes2NetV2.forwardc                 C   s0   t j|| jd}||jddd }|d}|S )N)num_mel_binsr   T)dimkeepdim)Kaldifbankr   meanr   )r   r   r   r   r   r   __extract_featureL  s   
z/SpeakerVerificationERes2NetV2.__extract_featureNc                 C   s8   |st d}| jjt jtj| j||ddd d S )Nr   )map_locationT)strict)	rG   r   r   load_state_dictloadospathjoinr   )r   r   r   r   r   r   __load_check_pointR  s   

z0SpeakerVerificationERes2NetV2.__load_check_point)N)r   r   r   __doc__r   strr   r   rP   r   r   r   r   r   r   r   r     s    	r   )-r   r-   r   typingr   r   r   numpyr   rG   torch.nnr0   torch.nn.functional
functionalr   torchaudio.compliance.kaldi
compliancekaldir   )modelscope.models.audio.sv.pooling_layersmodelsr   svrr   modelscope.metainfor   modelscope.modelsr   r   !modelscope.models.audio.sv.fusionr   modelscope.utils.constantr	   modelscope.utils.devicer
   Hardtanhr   Moduler   rS   rZ   register_modulespeaker_verificationeres2netv2_svr   r   r   r   r   <module>   s.   FLa