o
    i0B                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZ d dlmZ d dlZd dlZG dd de jjZG d	d
 d
e jjZG dd deZG dd deZG dd deZG dd deZdS )    N)
functional)
AbsEncoder)TupleOptional)statistic_poolingwindowed_statistic_pooling)OrderedDictc                	       s@   e Zd Zddedededef fddZdd	 Zd
d Z  ZS )
BasicLayer      ?
in_filtersfiltersstridebn_momentumc                    s   t    || _|| _|| _tjj|d|dd| _tj	 | _
tjj||d|dd| _tjj|d|dd| _tj	 | _tjj||dddd| _||ksP|dkritjj||d|dd| _tjj|d|dd| _d S d S )NMbP?T)epsmomentumaffine   F)bias   )super__init__r   r   r   torchnnBatchNorm2dbn1ReLUrelu1Conv2dconv1bn2relu2conv2conv_scbn_sc)selfr   r   r   r   	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sond/encoder/resnet34_encoder.pyr      s   
zBasicLayer.__init__c                 C   sZ   |dkrt |dddS |dkr+|d|d}}t ||d d|d dfddS d S )Nr   )r   r   r   r   constantr      r   )Fpadsize)r%   xr   hwr(   r(   r)   proper_padding    s    zBasicLayer.proper_paddingc                 C   s   |}| j | jks| jdkr| |}| |}| | |}| || j}| |}| 	| 
|}| |d}| |}| jdkrJ|d | j }|| |fS )Nr   r+   )r   r   r   r#   r$   r   r   r2   r   r!   r    r"   )r%   xs_padilensidentityr(   r(   r)   forward)   s   




zBasicLayer.forwardr
   )	__name__
__module____qualname__intfloatr   r2   r6   __classcell__r(   r(   r&   r)   r	      s     	r	   c                       s&   e Zd Zd fdd	Zdd Z  ZS )
BasicBlockr
   c                    sZ   t    || _t|D ]}t|dkr|n|||dkr|nd|}| d|| qd S )Nr   r   layer_{})r   r   	num_layerranger	   
add_moduleformat)r%   r   r   r@   r   r   ilayerr&   r(   r)   r   >   s   
 zBasicBlock.__init__c                 C   s2   t | jD ]}| jd| ||\}}q||fS )Nr?   )rA   r@   _modulesrC   )r%   r3   r4   rD   r(   r(   r)   r6   H   s   zBasicBlock.forwardr7   )r8   r9   r:   r   r6   r=   r(   r(   r&   r)   r>   =   s    
r>   c                       sh   e Zd Z						d fdd	Zd	efd
dZ	ddejdejdejd	eejejf fddZ	  Z
S )ResNet34Tr
   F   r         r       @      rH   c              	      s(  t t|   || _|| _|| _|| _|| _|| _|d }|r7t	j
jd|dddddd| _t	j
j|d|d| _|rCt	j
jdddd	| _tt|D ]0}	|	dkrW| jrT|nd}
n||	d  }
t|
||	 ||	 |	dkrkdnd
|d}| d|	| qIt	j
|d |d| _t	j
j|d|d| _d| _d S Nr   r   r   Fzeros)r   padding_moder   r   r   )paddingr+   )r   r@   r   r   block_{}   )r   rG   r   use_head_convuse_head_maxpoolnum_nodes_pooling_layerlayers_in_blockfilters_in_block
input_sizer   r   r   pre_convr   pre_conv_bn	MaxPool2dhead_maxpoolrA   lenr>   rB   rC   resnet0_dense
resnet0_bntime_ds_ratio)r%   r]   rX   batchnorm_momentumrY   rZ   r[   r\   pre_filtersrD   r   blockr&   r(   r)   r   Q   sD   

zResNet34.__init__returnc                 C      | j S NrZ   r%   r(   r(   r)   output_size      zResNet34.output_sizeNr3   r4   prev_statesc           	      C   s   |}| d| jksJ d| d| jtj|dd}| jr/| |}| |}t	|}| j
r7| |}||}}tt| jD ]}| jd| }|||\}}qC| |}t	|}| |}||fS )NrV   9Dimension of features {} doesn't match the input_size {}.r   dimrU   )r.   r]   rC   r   	unsqueezerX   r^   r_   r,   relurY   ra   rA   rb   r[   rF   rc   rd   )	r%   r3   r4   rp   featuresresnet_outsresnet_out_lensrD   rh   r(   r(   r)   r6      s*   







zResNet34.forward)Tr
   FrH   rI   rL   rk   r8   r9   r:   r   r;   rn   r   Tensorr   r6   r=   r(   r(   r&   r)   rG   P   s&    5rG   c                       sn   e Zd Z										d fd
d	ZdefddZ	ddejdejdejdeejejf fddZ	  Z
S )ResNet34_SP_L2RegTr
   FrH   rI   rL   encoderEAND/speech_encoder
 c              	      sB  t t|   || _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|d }|r@tjjd|dddddd| _tjj|d|d| _|rLtjjdddd	| _tt|D ]0}|dkr`| jr]|nd}n||d  }t||| || |dkrtdnd
|d}| d|| qRtj|d | d |d| _tjj|d|d| _d| _d S rP   )r   r{   r   rX   rY   rZ   r[   r\   r]   !tf2torch_tensor_name_prefix_torchtf2torch_tensor_name_prefix_tftf_train_stepsr   r   r   r^   r   r_   r`   ra   rA   rb   r>   rB   rC   Conv1drc   BatchNorm1drd   re   )r%   r]   rX   rf   rY   rZ   r[   r\   r   r   r   rg   rD   r   rh   r&   r(   r)   r      sN   
zResNet34_SP_L2Reg.__init__ri   c                 C   rj   rk   rl   rm   r(   r(   r)   rn      ro   zResNet34_SP_L2Reg.output_sizeNr3   r4   rp   c                 C   s   |}| d| jksJ d| d| jtj|dd}| jr/| |}| |}t	|}| j
r7| |}||}}tt| jD ]}| jd| }|||\}}qC|j\}	}
}}t|dddd|	||
 |g}| |}t	|}| |}||fS )	NrV   rq   r   rr   rU   r   r   r+   )r.   r]   rC   r   rt   rX   r^   r_   r,   ru   rY   ra   rA   rb   r[   rF   shapereshapepermuterc   rd   )r%   r3   r4   rp   rv   rw   rx   rD   rh   bbccttffr(   r(   r)   r6      s.   




"


zResNet34_SP_L2Reg.forward)	Tr
   FrH   rI   rL   r|   r}   r~   rk   ry   r(   r(   r&   r)   r{      s,    =r{   c                          e Zd Z													
		d fdd	ZdefddZ	ddejdejdejdeejeje	ej f f fddZ
  ZS )ResNet34Diarresnet1_denseTr
   FrH   rI   rL   window_shift   r   r|   seq2seq/speech_encoderc              	         t t| j|||||||d || _|	| _|
| _|| _|| _|| _|| _	|| _
tj|d |	| _tjj|	d|d| _tj|	|
| _tjj|
d|d| _dS )z
        Author: Speech Lab, Alibaba Group, China
        SOND: Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis
        https://arxiv.org/abs/2211.10243
        rX   rf   rY   rZ   r[   r\   r+   r   rS   N)r   r   r   embedding_nodenum_nodes_resnet1num_nodes_last_layerpooling_type	pool_sizer   r   r   r   r   Linearr   r   
resnet1_bnresnet2_dense
resnet2_bnr%   r]   r   rX   rf   rY   rZ   r[   r\   r   r   r   r   r   r   r   r&   r(   r)   r     2   

zResNet34Diar.__init__ri   c                 C   *   | j dr	| jS | j dr| jS | jS Nresnet1resnet2r   
startswithr   r   rZ   rm   r(   r(   r)   rn   C  
   zResNet34Diar.output_sizeNr3   r4   rp   c                    s   t  }t ||\}}||d< | jdkrt||d}nt||d| j| j\}}|dd}||d< | 	|}||d< t
|}||d	< | |dddd}||d
< | |}||d< t
|}||d< | |dddd}||d< || j |d fS )Nrd   	frame_gsp)r   )r+   r   r   r+   poolingr   resnet1_relur   r   resnet2_relur   r   r   r6   r   r   r   r   r   	transposer   r,   ru   r   r   r   r   r%   r3   r4   rp   	endpointsres_outrv   r&   r(   r)   r6   K  .   




zResNet34Diar.forwardr   Tr
   FrH   rI   rL   rH   rH   r   r   r   r|   r   rk   r8   r9   r:   r   r;   rn   r   rz   r   r   r6   r=   r(   r(   r&   r)   r     6    5r   c                       r   )ResNet34SpL2RegDiarr   Tr
   FrH   rI   rL   r   r   r   r|   r   c              	      r   )z
        Author: Speech Lab, Alibaba Group, China
        TOLD: A Novel Two-Stage Overlap-Aware Framework for Speaker Diarization
        https://arxiv.org/abs/2303.05397
        r   r+   r   rS   N)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r&   r(   r)   r   p  r   zResNet34SpL2RegDiar.__init__ri   c                 C   r   r   r   rm   r(   r(   r)   rn     r   zResNet34SpL2RegDiar.output_sizeNr3   r4   rp   c                    s   t  }t ||\}}||d< | jdkrt||d}nt||d| j| j\}}|dd}||d< | 	|}||d< t
|}||d< | |dddd}||d	< | |}||d
< t
|}||d< | |dddd}||d< || j |d fS )Nrd   r   )r+   r   r+   r   r   r   r   r   r   r   r   r   r&   r(   r)   r6     r   zResNet34SpL2RegDiar.forwardr   rk   r   r(   r(   r&   r)   r   o  r   r   )r   torch.nnr   r,   !funasr.models.encoder.abs_encoderr   typingr   r   'funasr.models.pooling.statistic_poolingr   r   collectionsr   loggingnumpynpr   Moduler	   r>   rG   r{   r   r   r(   r(   r(   r)   <module>   s    2Zcb