o
    ´©iŽ5  ã                   @   sà   d Z ddlZddlZddlmZ ddlm  mZ ddlm	  m
  m  mZ ddlmZ G dd„ dejƒZddd„Zdd	d
„ZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZdS )aˆ   Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
    ERes2Net incorporates both local and global feature fusion techniques to improve the performance. 
    The local feature fusion (LFF) fuses the features within one single residual block to extract the local signal.
    The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
    ERes2Net-Large is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better 
    recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
é    N)ÚAFFc                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )ÚReLUFc                    s   t t| ƒ dd|¡ d S )Nr   é   )Úsuperr   Ú__init__)ÚselfÚinplace©Ú	__class__© úS/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/eres2net/eres2net.pyr      s   zReLU.__init__c                 C   s"   | j rdnd}| jjd | d S )Nr   Ú z (ú))r   r
   Ú__name__)r   Úinplace_strr   r   r   Ú__repr__   s   zReLU.__repr__)F)r   Ú
__module__Ú__qualname__r   r   Ú__classcell__r   r   r	   r   r      s    r   é   c                 C   ó   t j| |d|dddS )z1x1 convolution without paddingr   r   F©Úkernel_sizeÚstrideÚpaddingÚbias©ÚnnÚConv2d©Ú	in_planesÚ
out_planesr   r   r   r   Úconv1x1!   ó   r"   c                 C   r   )z3x3 convolution with paddingé   r   Fr   r   r   r   r   r   Úconv3x3&   r#   r%   c                       ó*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )	ÚBasicBlockERes2Neté   r   é    c           
   	      s6  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|| _g }g }t| jƒD ]}	| t||ƒ¡ | t 	|¡¡ q/t |¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dksx|| j| krt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nç      P@T©r   r   F©r   r   r   )r   r'   r   ÚintÚmathÚfloorr"   Úconv1r   ÚBatchNorm2dÚbn1ÚnumsÚrangeÚappendr%   Ú
ModuleListÚconvsÚbnsr   ÚreluÚ	expansionÚconv3Úbn3Ú
SequentialÚshortcutr   r   ÚwidthÚscale©
r   r    Úplanesr   Ú	baseWidthr@   r?   r7   r8   Úir	   r   r   r   .   s4   
ÿü
zBasicBlockERes2Net.__init__c                 C   sÔ   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]1}|dkr)|| }n|||  }| j| |ƒ}|  | j	| |ƒ¡}|dkrG|}qt 
||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S ©Nr   r   ©r0   r2   r9   ÚtorchÚsplitr?   r4   r3   r7   r8   Úcatr;   r<   r>   ©r   ÚxÚresidualÚoutÚspxrD   Úspr   r   r   ÚforwardL   s(   







zBasicBlockERes2Net.forward©r   r)   r(   ©r   r   r   r:   r   rP   r   r   r   r	   r   r'   +   s    r'   c                       r&   )	ÚBasicBlockERes2Net_diff_AFFr(   r   r)   c              	      sj  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|| _g }g }g }	t| jƒD ]}
| t||ƒ¡ |	 t 	|¡¡ q1t| jd ƒD ]
}| t|d¡ qKt |¡| _t |	¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dks’|| j| krªt tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nr*   r   ©ÚchannelsTr+   Fr,   )r   rS   r   r-   r.   r/   r"   r0   r   r1   r2   r3   r4   r5   r%   r   r6   r7   r8   Úfuse_modelsr   r9   r:   r;   r<   r=   r>   r   r   r?   r@   )r   r    rB   r   rC   r@   r?   r7   rV   r8   rD   Újr	   r   r   r   l   s<   
ÿü
z$BasicBlockERes2Net_diff_AFF.__init__c                 C   sà   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]7}|dkr)|| }n| j|d  ||| ƒ}| j	| |ƒ}|  | j
| |ƒ¡}|dkrM|}qt ||fd¡}q|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S rE   )r0   r2   r9   rG   rH   r?   r4   r3   rV   r7   r8   rI   r;   r<   r>   rJ   r   r   r   rP      s(   







z#BasicBlockERes2Net_diff_AFF.forwardrQ   rR   r   r   r	   r   rS   i   s    #rS   c                       sB   e Zd Zeeg d¢dddddf‡ fdd„	Zd	d
„ Zdd„ Z‡  ZS )ÚERes2Net©r$   é   é   r$   r)   éP   éÀ   ÚTSTPFc	           	         sì  t t| ƒ ¡  || _|| _|| _t|d ƒ| d | _|| _t	j
d|ddddd| _t	 |¡| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _t	j
|d |d	 ddddd| _t	j
|d	 |d ddddd
| _t	j
|d |d ddddd
| _t|d	 d| _t|d d| _t|d d| _|dks¶|dkr¸dnd| _tt|ƒ| j|j d| _t	 | j|j | j |¡| _ | jrêt	j!|dd| _"t	 ||¡| _#d S t	 $¡ | _"t	 $¡ | _#d S )Né   r   r$   Fr   r   ©r   r(   rZ   )r   r   r   r   é   rT   ÚTAPÚTSDP©Úin_dim©Úaffine)%r   rX   r   r    Úfeat_dimÚembedding_sizer-   Ú	stats_dimÚtwo_emb_layerr   r   r0   r1   r2   Ú_make_layerÚlayer1Úlayer2Úlayer3Úlayer4Úlayer1_downsampleÚlayer2_downsampleÚlayer3_downsampler   Úfuse_mode12Úfuse_mode123Úfuse_mode1234Ún_statsÚgetattrÚpooling_layersr:   ÚpoolÚLinearÚseg_1ÚBatchNorm1dÚseg_bn_1Úseg_2ÚIdentity)	r   ÚblockÚ
block_fuseÚ
num_blocksÚ
m_channelsrh   ri   Úpooling_funcrk   r	   r   r   r   ®   s@   ÿÿÿ
zERes2Net.__init__c                 C   óL   |gdg|d   }g }|D ]}|  || j||ƒ¡ ||j | _qtj|Ž S ©Nr   ©r5   r    r:   r   r=   ©r   r   rB   rƒ   r   ÚstridesÚlayersr   r   r   rl   á   ó   
zERes2Net._make_layerc                 C   sØ   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	||¡}|  
|¡}|  |¡}|  ||¡}	|  |¡}
|  |	¡}|  |
|¡}|  |¡}|  |¡}| jrjt |¡}|  |¡}|  |¡}|S |S ©Nr   r(   r   )ÚpermuteÚ
unsqueeze_ÚFr9   r2   r0   rm   rn   rq   rt   ro   rr   ru   rp   rs   rv   rz   r|   rk   r~   r   )r   rK   rM   Úout1Úout2Úout1_downsampleÚ
fuse_out12Úout3Úfuse_out12_downsampleÚfuse_out123Úout4Úfuse_out123_downsampleÚfuse_out1234ÚstatsÚembed_aÚembed_br   r   r   rP   é   s*   












zERes2Net.forward)	r   r   r   r'   rS   r   rl   rP   r   r   r   r	   r   rX   ­   s    ÷3rX   c                       r&   )	ÚBasicBlockRes2Netr(   r   r)   c           
   	      s:  t t| ƒ ¡  tt ||d  ¡ƒ}t||| |ƒ| _t 	|| ¡| _
|d | _g }g }t| jƒD ]}	| t||ƒ¡ | t 	|¡¡ q1t |¡| _t |¡| _tdd| _t|| || j ƒ| _t 	|| j ¡| _t ¡ | _|dksz|| j| kr’t tj|| j| d|ddt 	| j| ¡¡| _|| _|| _|| _d S )Nr*   r   Tr+   Fr,   )r   rž   r   r-   r.   r/   r"   r0   r   r1   r2   r3   r4   r5   r%   r6   r7   r8   r   r9   r:   r;   r<   r=   r>   r   r   r?   r@   rA   r	   r   r   r     s4   

ÿü
zBasicBlockRes2Net.__init__c                 C   sê   |}|   |¡}|  |¡}|  |¡}t || jd¡}t| jƒD ]1}|dkr)|| }n|||  }| j| |ƒ}|  | j	| |ƒ¡}|dkrG|}qt 
||fd¡}qt 
||| j fd¡}|  |¡}|  |¡}|  |¡}||7 }|  |¡}|S rE   rF   rJ   r   r   r   rP   #  s*   







zBasicBlockRes2Net.forwardrQ   rR   r   r   r	   r   rž     s    rž   c                       s@   e Zd Zeg d¢dddddf‡ fdd„	Zd	d
„ Zdd„ Z‡  ZS )ÚRes2NetrY   r)   r\   r]   r^   Fc                    s\  t t| ƒ ¡  || _|| _|| _t|d ƒ| d | _|| _t	j
d|ddddd| _t	 |¡| _| j|||d dd| _| j||d |d dd| _| j||d	 |d dd| _| j||d |d dd| _|d
ksn|dkrpdnd| _tt|ƒ| j|j d| _t	 | j|j | j |¡| _| jr¢t	j|dd| _t	 ||¡| _d S t	 ¡ | _t	 ¡ | _d S )Nr_   r   r$   Fr   r   r`   r(   rZ   rb   rc   rd   rf   )r   rŸ   r   r    rh   ri   r-   rj   rk   r   r   r0   r1   r2   rl   rm   rn   ro   rp   rw   rx   ry   r:   rz   r{   r|   r}   r~   r   r€   )r   r   rƒ   r„   rh   ri   r…   rk   r	   r   r   r   C  s(   

zRes2Net.__init__c                 C   r†   r‡   rˆ   r‰   r   r   r   rl   e  rŒ   zRes2Net._make_layerc                 C   s–   |  ddd¡}| d¡}t |  |  |¡¡¡}|  |¡}|  |¡}|  |¡}|  	|¡}|  
|¡}|  |¡}| jrIt |¡}|  |¡}|  |¡}|S |S r   )rŽ   r   r   r9   r2   r0   rm   rn   ro   rp   rz   r|   rk   r~   r   )r   rK   rM   r›   rœ   r   r   r   r   rP   m  s   









zRes2Net.forward)r   r   r   rž   r   rl   rP   r   r   r   r	   r   rŸ   B  s    ø"rŸ   )r   )Ú__doc__r.   rG   Útorch.nnr   Útorch.nn.functionalÚ
functionalr   Ú)funasr.models.sond.pooling.pooling_layersÚmodelsÚsondÚpoolingry   Úfunasr.models.eres2net.fusionr   ÚHardtanhr   r"   r%   ÚModuler'   rS   rX   rž   rŸ   r   r   r   r   Ú<module>   s   


>DV?