o
    i8                     @   s   d dl Z d dlmZ d dlmZ d dlmZ ee jedkZ	G dd de j
jZG dd	 d	e j
jZG d
d de j
jZG dd de j
jZG dd de j
jZdS )    N)parse)ComplexTensor)	get_layerz1.9.0c                       s8   e Zd ZdZddddejjf fdd	Zdd	 Z  Z	S )
Conv2DActNormz9Basic Conv2D + activation + instance norm building block.   r         r	   r   Fc                    sr   t t|   |rtj|||||}ntjj|||||dd}t| }	tjj||dd}
tj	||	|
| _
d S )Nreflectpadding_mode:0yE>eps)superr   __init__torchnnConvTranspose2dConv2dr   	GroupNorm
Sequentiallayer)selfin_channelsout_channelskszstridepaddingupsample
activationconvactnorm	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/enh/layers/tcndenseunet.pyr      s   


zConv2DActNorm.__init__c                 C   s
   |  |S Nr   r   inpr(   r(   r)   forward%   s   
zConv2DActNorm.forward
__name__
__module____qualname____doc__r   r   ELUr   r.   __classcell__r(   r(   r&   r)   r   
   s    r   c                       s0   e Zd ZdZejjf fdd	Zdd Z  Z	S )FreqWiseBlockzFreqWiseBlock, see iNeuBe paper.

    Block that applies pointwise 2D convolution over
    STFT-like image tensor on frequency axis.
    The input is assumed to be [batch, image_channels, frames, freq].

    c                    s>   t t|   t||ddd|d| _t||ddd|d| _d S )Nr	   r	   )r   r   r"   )r   r6   r   r   
bottleneck	freq_proc)r   r   	num_freqsr   r"   r&   r(   r)   r   2   s   zFreqWiseBlock.__init__c                 C   s,   |  | |dddddddd}|S )Nr   r   r
   r	   )r:   r9   permute)r   r-   outr(   r(   r)   r.   <   s   zFreqWiseBlock.forwardr/   r(   r(   r&   r)   r6   )   s    
r6   c                       s:   e Zd ZdZddddejjdf fdd	Zdd	 Z  Z	S )

DenseBlockan  single DenseNet block as used in iNeuBe model.

    Args:
        in_channels: number of input channels (image axis).
        out_channels: number of output channels (image axis).
        num_freqs: number of complex frequencies in the
            input STFT complex image-like tensor.
            The input is batch, image_channels, frames, freqs.
        pre_blocks: dense block before point-wise convolution block over frequency axis.
        freq_proc_blocks: number of frequency axis processing blocks.
        post_blocks: dense block after point-wise convolution block over frequency axis.
        ksz: kernel size used in densenet Conv2D layers.
        activation: activation function to use in the whole iNeuBe model,
                you can use any torch supported activation e.g. 'relu' or 'elu'.
        hid_chans: number of hidden channels in densenet Conv2D.
    r
   r	   r       c
              	      s:  t t|   |dksJ |dksJ tjg | _d}
t|D ]}t||	|
  |	|dd|d}| j	| |
d7 }
q tjg | _
t|D ]}t||	|
  ||	|d}| j
	| |
d7 }
qFtjg | _t|d D ]}t||	|
  |	|dd|d}| j	| |
d7 }
qlt||	|
  ||dd|d}| j	| d S )Nr	   r   r7   r8   )r   r>   r   r   r   
ModuleList
pre_blocksranger   appendfreq_proc_blocksr6   post_blocks)r   r   r   r;   rA   rD   rE   r   r"   	hid_chans
tot_layersindxc_layerlastr&   r(   r)   r   W   s^   






zDenseBlock.__init__c                 C   s|   |g}| j D ]}|t|d}|| q| jD ]}|t|d}|| q| jD ]}|t|d}|| q,|S )Nr	   )rA   r   catrC   rD   rE   )r   inputr=   	pre_blockc_out
freq_block
post_blockr(   r(   r)   r.      s   


zDenseBlock.forwardr/   r(   r(   r&   r)   r>   E   s    Br>   c                       s6   e Zd ZdZdddejjf fdd	Zdd Z  Z	S )TCNResBlocka  single depth-wise separable TCN block as used in iNeuBe TCN.

    Args:
        in_chan: number of input feature channels.
        out_chan: number of output feature channels.
        ksz: kernel size.
        stride: stride in depth-wise convolution.
        dilation: dilation in depth-wise convolution.
        activation: activation function to use in the whole iNeuBe model,
            you can use any torch supported activation e.g. 'relu' or 'elu'.
    r   r	   c           
   
      sj   t t|   |}tjj||||||d|d}tj||d}	tjtjj||ddt| ||	| _	d S )Nr   )r    dilationr   groupsr	   r   r   )
r   rQ   r   r   r   Conv1dr   r   r   r   )
r   in_chanout_chanr   r   rR   r"   r    dconv
point_convr&   r(   r)   r      s&   

zTCNResBlock.__init__c                 C   s   |  || S r*   r+   r,   r(   r(   r)   r.      s   zTCNResBlock.forwardr/   r(   r(   r&   r)   rQ      s
    rQ   c                       sL   e Zd ZdZdddddddddd	ejjf fd
d	Zdd Zdd Z	  Z
S )TCNDenseUNeta  TCNDenseNet block from iNeuBe

    Reference:
    Lu, Y. J., Cornell, S., Chang, X., Zhang, W., Li, C., Ni, Z., ... & Watanabe, S.
    Towards Low-Distortion Multi-Channel Speech Enhancement:
    The ESPNET-Se Submission to the L3DAS22 Challenge. ICASSP 2022 p. 9201-9205.

    Args:
        n_spk: number of output sources/speakers.
        in_freqs: number of complex STFT frequencies.
        mic_channels: number of microphones channels
            (only fixed-array geometry supported).
        hid_chans: number of channels in the subsampling/upsampling conv layers.
        hid_chans_dense: number of channels in the densenet layers
            (reduce this to reduce VRAM requirements).
        ksz_dense: kernel size in the densenet layers thorough iNeuBe.
        ksz_tcn: kernel size in the TCN submodule.
        tcn_repeats: number of repetitions of blocks in the TCN submodule.
        tcn_blocks: number of blocks in the TCN submodule.
        tcn_channels: number of channels in the TCN submodule.
        activation: activation function to use in the whole iNeuBe model,
            you can use any torch supported activation e.g. 'relu' or 'elu'.
    r	   i  r?   r   r         i  c                    s  t t|   || _|| _|| _|d }tjtjj	| jd |dddddt
||||||d}| |}tjg | _| j| tt|D ]%}t||ddd|d	}t
|||| |||d}tj||}| j| qH| jt||d ddd|d	 | jt|d |d
 ddd|d	 | jt|d
 |
ddd|d	 g | _t|D ]}t|	D ]}| jt|
|
|d| |d qqtjj| j | _tjg | _| jt|
d |d
 ddd|dd | jt|d |d ddd|dd | jt|d
 |ddd|dd tt|D ]5}|t|| d  }t
|d |d ||||d}t|d |ddd|dd}tj||}| j| qtjt
|d |d | jd |||dtj|d d| j ddd}| j| d S )Nr
   r   r7   r   r   r   )r   r"   rF   r   r8   rZ   )rR   r"   T)r"   r!      r	   )r   rY   r   n_spkr   mic_channelsr   r   r   r   r>   
_get_depthr@   encoderrC   rB   lenr   tcnrQ   decoderr   )r   r]   in_freqsr^   rF   hid_chans_dense	ksz_denseksz_tcntcn_repeats
tcn_blockstcn_channelsr"   r;   firstfreq_axis_dims
layer_indx
downsampledenseblocksrI   rxdec_indxc_num_freqsr!   rJ   r&   r(   r)   r      s(  

	zTCNDenseUNet.__init__c                 C   s:   d}g }|dkrt |d }|| |d7 }|dks|S )Nr      r
   r	   )intrC   )r   num_freqn_layersfreqsr(   r(   r)   r_     s   
zTCNDenseUNet._get_depthc                 C   sT  | dddd}|j\}}}}|| jksJ t|j|jfd}|dd}||| jd || j	}g }|}| j
D ]}	|	|}|| q9|jd dksNJ | |dd}
|
}t| jD ]\}}t|||d   fd}||}q`||d| jd| j	}trt|dddf |dddf }|S t|dddf |dddf }|S )am  forward.

        Args:
            tf_rep (torch.Tensor): 4D tensor (multi-channel complex STFT of mixture)
                        of shape [B, T, C, F] batch, frames, microphones, frequencies.

        Returns:
            out (torch.Tensor): complex 3D tensor monaural STFT of the targets
                shape is [B, T, F] batch, frames, frequencies.

        r   r
   r   r	   N)r<   shaper^   r   rK   realimag	transposereshaper   r`   rC   rb   squeeze	unsqueeze	enumeraterc   r]   is_torch_1_9_pluscomplexr   )r   tf_repbszmics_frames	inp_featsenc_outbuffer	enc_layertcn_outrH   	dec_layerc_inputr=   r(   r(   r)   r.     s2   

$"zTCNDenseUNet.forward)r0   r1   r2   r3   r   r   r4   r   r_   r.   r5   r(   r(   r&   r)   rY      s"     +
rY   )r   packaging.versionr   Vtorch_complex.tensorr   )espnet2.torch_utils.get_layer_from_stringr   __version__r   r   Moduler   r6   r>   rQ   rY   r(   r(   r(   r)   <module>   s    g*