o
    ´©i$  ã                   @   s¼   d Z ddlZddlZddlmZ ddlm  mZ ddlm	  m
  m  mZ ddlmZ G dd„ dejƒZddd„Zdd	d
„ZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZdS )aˆ   Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    ERes2Net incorporates both local and global feature fusion techniques to improve the performance. 
    The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
    The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
    ERes2Net-Large is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better 
    recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
é    N)ÚAFFc                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚReLUFc                    s   t t| ƒ dd|¡ d S )Nr   é   )Úsuperr   Ú__init__)ÚselfÚinplace©Ú	__class__© úW/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/eres2net/eres2net_aug.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr   Ú z (ú))r   r
   Ú__name__)r   Úinplace_strr   r   r   Ú__repr__   s   zReLU.__repr__)F)r   Ú
__module__Ú__qualname__r   r   Ú__classcell__r   r   r	   r   r      s    r   é   c                 C   ó   t j| |d|dddS )z1x1 convolution without paddingr   r   F©Úkernel_sizeÚstrideÚpaddingÚbias©ÚnnÚConv2d©Ú	in_planesÚ
out_planesr   r   r   r   Úconv1x1!   ó   r"   c                 C   r   )z3x3 convolution with paddingé   r   Fr   r   r   r   r   r   Úconv3x3&   r#   r%   c                       ó*   e Zd ZdZd	‡ fdd„	Zdd„ Z‡  ZS )
ÚBasicBlockERes2Neté   r   é   r$   c           
   	      s6  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|| _g }g }t| jƒD ]}	| t||ƒ¡ | t 	|¡¡ q/t |¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dksx|| j| krt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nç      P@T©r   r   F©r   r   r   )r   r'   r   ÚintÚmathÚfloorr"   Úconv1r   ÚBatchNorm2dÚbn1ÚnumsÚrangeÚappendr%   Ú
ModuleListÚconvsÚbnsr   ÚreluÚ	expansionÚconv3Úbn3Ú
SequentialÚshortcutr   r   ÚwidthÚscale)
r   r    Úplanesr   Ú	baseWidthr@   r?   r7   r8   Úir	   r   r   r   .   s4   
ÿü
zBasicBlockERes2Net.__init__c                 C   sÔ   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]1}|dkr)|| }n|||  }| j| |ƒ}|  | j	| |ƒ¡}|dkrG|}qt 
||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S ©Nr   r   )r0   r2   r9   ÚtorchÚsplitr?   r4   r3   r7   r8   Úcatr;   r<   r>   ©r   ÚxÚresidualÚoutÚspxrC   Úspr   r   r   ÚforwardL   s(   







zBasicBlockERes2Net.forward©r   r)   r$   ©r   r   r   r:   r   rN   r   r   r   r	   r   r'   +   s    r'   c                       r&   )
ÚBasicBlockERes2Net_diff_AFFr(   r   r)   r$   c              	      sj  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|| _g }g }g }	t| jƒD ]}
| t||ƒ¡ |	 t 	|¡¡ q1t| jd ƒD ]
}| t|d¡ qKt |¡| _t |	¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dks’|| j| krªt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nr*   r   ©ÚchannelsTr+   Fr,   )r   rQ   r   r-   r.   r/   r"   r0   r   r1   r2   r3   r4   r5   r%   r   r6   r7   r8   Úfuse_modelsr   r9   r:   r;   r<   r=   r>   r   r   r?   r@   )r   r    rA   r   rB   r@   r?   r7   rT   r8   rC   Újr	   r   r   r   l   s<   
ÿü
z$BasicBlockERes2Net_diff_AFF.__init__c                 C   sà   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]7}|dkr)|| }n| j|d  ||| ƒ}| j	| |ƒ}|  | j
| |ƒ¡}|dkrM|}qt ||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S rD   )r0   r2   r9   rE   rF   r?   r4   r3   rT   r7   r8   rG   r;   r<   r>   rH   r   r   r   rN      s(   







z#BasicBlockERes2Net_diff_AFF.forwardrO   rP   r   r   r	   r   rQ   i   s    $rQ   c                       sB   e Zd Zeeg d¢dddddf‡ fdd„	Zd	d
„ Zdd„ Z‡  ZS )ÚERes2NetAug)r$   r(   é   r$   é@   éP   éÀ   ÚTSTPFc	           	         sì  t t| ƒ ¡  || _|| _|| _t|d ƒ| d | _|| _t	j
d|ddddd| _t	 |¡| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _t	j
|d	 |d ddddd
| _t	j
|d |d ddddd
| _t	j
|d |d ddddd
| _t|d d| _t|d d| _t|d d| _|dks¶|dkr¸dnd| _tt|ƒ| j|j d| _t	 | j|j | j |¡| _ | jrêt	j!|dd| _"t	 ||¡| _#d S t	 $¡ | _"t	 $¡ | _#d S )Né   r   r$   Fr   r   )r   é   r(   )r   r   r   r   é   é    rR   ÚTAPÚTSDP)Úin_dim)Úaffine)%r   rV   r   r    Úfeat_dimÚembedding_sizer-   Ú	stats_dimÚtwo_emb_layerr   r   r0   r1   r2   Ú_make_layerÚlayer1Úlayer2Úlayer3Úlayer4Úlayer1_downsampleÚlayer2_downsampleÚlayer3_downsampler   Úfuse_mode12Úfuse_mode123Úfuse_mode1234Ún_statsÚgetattrÚpooling_layersr:   ÚpoolÚLinearÚseg_1ÚBatchNorm1dÚseg_bn_1Úseg_2ÚIdentity)	r   ÚblockÚ
block_fuseÚ
num_blocksÚ
m_channelsrd   re   Úpooling_funcrg   r	   r   r   r   ¯   s@   ÿÿÿ
zERes2NetAug.__init__c                 C   sL   |gdg|d   }g }|D ]}|  || j||ƒ¡ ||j | _qtj|Ž S )Nr   )r5   r    r:   r   r=   )r   r}   rA   r   r   ÚstridesÚlayersr   r   r   rh   ß   s   
zERes2NetAug._make_layerc                 C   sØ   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	||¡}|  
|¡}|  |¡}|  ||¡}	|  |¡}
|  |	¡}|  |
|¡}|  |¡}|  |¡}| jrjt |¡}|  |¡}|  |¡}|S |S )Nr   r]   r   )ÚpermuteÚ
unsqueeze_ÚFr9   r2   r0   ri   rj   rm   rp   rk   rn   rq   rl   ro   rr   rv   rx   rg   rz   r{   )r   rI   rK   Úout1Úout2Úout1_downsampleÚ
fuse_out12Úout3Úfuse_out12_downsampleÚfuse_out123Úout4Úfuse_out123_downsampleÚfuse_out1234ÚstatsÚembed_aÚembed_br   r   r   rN   ç   s*   












zERes2NetAug.forward)	r   r   r   r'   rQ   r   rh   rN   r   r   r   r	   r   rV   ®   s    ÷0rV   )r   )Ú__doc__r.   rE   Útorch.nnr   Útorch.nn.functionalÚ
functionalr†   Ú)funasr.models.sond.pooling.pooling_layersÚmodelsÚsondÚpoolingru   Úfunasr.models.eres2net.fusionr   ÚHardtanhr   r"   r%   ÚModuler'   rQ   rV   r   r   r   r   Ú<module>   s   


>E