o
    ipH                     @   s   d dl mZ d dlZd dlmZ d dlmZmZ G dd dejZ	G dd dejZ
G dd	 d	ejZG d
d dejZG dd dejZdS )    )ListN)conv2d_output_shapeconvtransp2d_output_shapec                       s(   e Zd Z	d fdd	Zdd Z  ZS )	GLSTM      Fc                    s   t    | dksJ |f|  r!d dks!J || _|| _|| _t | _tfddt|D | _	t|D ]}| j
t fddt|D  qBdS )at  Grouped LSTM.

        Reference:
            Efficient Sequence Learning with Group Recurrent Networks; Gao et al., 2018

        Args:
            hidden_size (int): total hidden size of all LSTMs in each grouped LSTM layer
                i.e., hidden size of each LSTM is `hidden_size // groups`
            groups (int): number of LSTMs in each grouped LSTM layer
            layers (int): number of grouped LSTM layers
            bidirectional (bool): whether to use BLSTM or unidirectional LSTM
            rearrange (bool): whether to apply the rearrange operation after each
                grouped LSTM layer
        r   r   c                    s   g | ]}t  qS  )nn	LayerNorm.0_)hidden_sizer   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/enh/layers/dc_crn.py
<listcomp>-   s    z"GLSTM.__init__.<locals>.<listcomp>c              	      s,   g | ]}t j rd  ndd dqS )r      T)batch_firstbidirectional)r	   LSTMr   )r   hidden_size_tr   r   r   1   s    N)super__init__groupslayers	rearranger	   
ModuleList	lstm_listrangelnappend)selfr   r   r   r   r   layer	__class__)r   r   r   r   r      s&   

zGLSTM.__init__c                    sP  | dd dd}}||d tjjddtjfddtjD ddtj	ddd	j
d tdjD ]< jrg||jd dd ||dtjjddtj fd
dtjD ddj
  qNdd|dd  dd S )zGrouped LSTM forward.

        Args:
            x (torch.Tensor): (B, C, T, D)
        Returns:
            out (torch.Tensor): (B, C, T, D)
        r   r   r   dimc                    s&   g | ]}j d  |  | d  qS r   r   r   i)outr    r   r   r   M      & z!GLSTM.forward.<locals>.<listcomp>)	start_dimend_dimc                    s&   g | ]}j   | | d  qS r'   r(   r)   r!   r+   r    r   r   r   \   r,   )	transpose
contiguoussizeviewtorchchunkr   stackr   flattenr   r   r   reshapecat)r    xBTr   r0   r   forward>   s4   
&zGLSTM.forward)r   r   r   FF__name__
__module____qualname__r   r>   __classcell__r   r   r"   r   r      s    .r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	GluConv2dr   c                    sD   t    tj|||||d| _tj|||||d| _t | _dS )a  Conv2d with Gated Linear Units (GLU).

        Input and output shapes are the same as regular Conv2d layers.

        Reference: Section III-B in [1]

        Args:
            in_channels (int): number of input channels
            out_channels (int): number of output channels
            kernel_size (int/tuple): kernel size in Conv2d
            stride (int/tuple): stride size in Conv2d
            padding (int/tuple): padding size in Conv2d
        )in_channelsout_channelskernel_sizestridepaddingN)r   r   r	   Conv2dconv1conv2Sigmoidsigmoid)r    rE   rF   rG   rH   rI   r"   r   r   r   h   s    
zGluConv2d.__init__c                 C   "   |  |}| | |}|| S )zConvGLU forward.

        Args:
            x (torch.Tensor): (B, C_in, H_in, W_in)
        Returns:
            out (torch.Tensor): (B, C_out, H_out, W_out)
        )rK   rN   rL   r    r;   r+   gater   r   r   r>         
zGluConv2d.forwardr'   r?   r   r   r"   r   rD   g   s     rD   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )GluConvTranspose2dr   r   r   c                    sH   t    tj||||||d| _tj||||||d| _t | _dS )av  ConvTranspose2d with Gated Linear Units (GLU).

        Input and output shapes are the same as regular ConvTranspose2d layers.

        Reference: Section III-B in [1]

        Args:
            in_channels (int): number of input channels
            out_channels (int): number of output channels
            kernel_size (int/tuple): kernel size in ConvTranspose2d
            stride (int/tuple): stride size in ConvTranspose2d
            padding (int/tuple): padding size in ConvTranspose2d
            output_padding (int/tuple): Additional size added to one side of each
                dimension in the output shape
        )rE   rF   rG   rH   rI   output_paddingN)r   r   r	   ConvTranspose2ddeconv1deconv2rM   rN   )r    rE   rF   rG   rH   rI   rU   r"   r   r   r      s$   
	zGluConvTranspose2d.__init__c                 C   rO   )zDeconvGLU forward.

        Args:
            x (torch.Tensor): (B, C_in, H_in, W_in)
        Returns:
            out (torch.Tensor): (B, C_out, H_out, W_out)
        )rW   rN   rX   rP   r   r   r   r>      rR   zGluConvTranspose2d.forward)r   rT   r?   r   r   r"   r   rS      s
    ,rS   c                       s8   e Zd Z									d fd	d
	Zdd Z  ZS )DenselyConnectedBlock   r      r   r   r      r   r   rT      Fc                    s  t    |
dksJ |
t | _|}d\}}dg}t|
d D ]A}| jttj|||d|dt	|tj
dd || }t||f|d|d\}}|| ||krZ||ksbJ ||||fq!|ru| jt||||||	d	 d
S | jt|||||d d
S )a  Densely-Connected Convolutional Block.

        Args:
            in_channels (int): number of input channels
            out_channels (int): number of output channels
            hid_channels (int): number of output channels in intermediate Conv layers
            kernel_size (tuple): kernel size for all but the last Conv layers
            padding (tuple): padding for all but the last Conv layers
            last_kernel_size (tuple): kernel size for the last GluConv layer
            last_stride (tuple): stride for the last GluConv layer
            last_padding (tuple): padding for the last GluConv layer
            last_output_padding (tuple): output padding for the last GluConvTranspose2d
                 (only used when `transposed=True`)
            layers (int): total number of Conv layers
            transposed (bool): True to use GluConvTranspose2d in the last layer
                               False to use GluConv2d in the last layer
        r   )*      rc   r   r   )rG   rH   rI   T)inplacerG   rH   pad)rG   rH   rI   rU   N)r   r   r	   r   convr   r   
SequentialrJ   BatchNorm2dELUr   rS   rD   )r    rE   rF   hid_channelsrG   rI   last_kernel_sizelast_stridelast_paddinglast_output_paddingr   
transposed
in_channelr=   Dhidden_sizesr   tdimhdimr"   r   r   r      sb   




"zDenselyConnectedBlock.__init__c                 C   sh   | j d |}||g}t| j }t| j dd D ]\}}|tj|dd}||d k r1|| q|S )zDenselyConnectedBlock forward.

        Args:
            input (torch.Tensor): (B, C, T_in, F_in)
        Returns:
            out (torch.Tensor): (B, C, T_out, F_out)
        r   r   Nr%   )rh   len	enumerater5   r:   r   )r    inputr+   outputs
num_layersidxr!   r   r   r   r>   &  s   

zDenselyConnectedBlock.forward)	rZ   r[   r]   r^   r`   r]   rT   ra   Fr?   r   r   r"   r   rY      s    VrY   c                       sP   e Zd Zg dddddddddddd	d	d
d
d	fdef fddZdd Z  ZS )DC_CRN)r          @         rZ   r[   r]   r^   r`   ra   rd   r   Finput_channelsc                    sF  t    |d dksJ |t | _d}|g}|}tdt|D ]5}| jt||d  || |||||||	dd
 t	||f|||d\}}|| ||ksWJ ||fq"||d  }||kshJ ||ft
|||||d	| _t | _t | _tt|d ddD ]}| jt|| || ||||
|||	dd
 || }t	||f|
||d\}}||kr||ksJ ||||f|dkr||d  }n|}t||f|||d\}}||ksJ ||f||d  | }|dksJ ||d  |f| jt|| d |||||||d|f|	d
d qtj||d| _tj||d| _dS )ab	  Densely-Connected Convolutional Recurrent Network (DC-CRN).

        Reference: Fig. 3 and Section III-B in [1]

        Args:
            input_dim (int): input feature dimension
            input_channels (list): number of input channels for the stacked
                DenselyConnectedBlock layers
                Its length should be (`number of DenselyConnectedBlock layers`).
                It is recommended to use even number of channels to avoid AssertError
                when `glstm_bidirectional=True`.
            enc_hid_channels (int): common number of intermediate channels for all
                DenselyConnectedBlock of the encoder
            enc_kernel_size (tuple): common kernel size for all DenselyConnectedBlock
                of the encoder
            enc_padding (tuple): common padding for all DenselyConnectedBlock
                of the encoder
            enc_last_kernel_size (tuple): common kernel size for the last Conv layer
                in all DenselyConnectedBlock of the encoder
            enc_last_stride (tuple): common stride for the last Conv layer in all
                DenselyConnectedBlock of the encoder
            enc_last_padding (tuple): common padding for the last Conv layer in all
                DenselyConnectedBlock of the encoder
            enc_layers (int): common total number of Conv layers for all
                DenselyConnectedBlock layers of the encoder
            skip_last_kernel_size (tuple): common kernel size for the last Conv layer
                in all DenselyConnectedBlock of the skip pathways
            skip_last_stride (tuple): common stride for the last Conv layer in all
                DenselyConnectedBlock of the skip pathways
            skip_last_padding (tuple): common padding for the last Conv layer in all
                DenselyConnectedBlock of the skip pathways
            glstm_groups (int): number of groups in each Grouped LSTM layer
            glstm_layers (int): number of Grouped LSTM layers
            glstm_bidirectional (bool): whether to use BLSTM or unidirectional LSTM
                in Grouped LSTM layers
            glstm_rearrange (bool): whether to apply the rearrange operation after each
                grouped LSTM layer
            output_channels (int): number of output channels (must be an even number to
                recover both real and imaginary parts)
        r   r   rb   r   F)
rE   rF   rl   rG   rI   rm   rn   ro   r   rq   rf   r$   )r   r   r   r   r   T)rE   rF   rl   rG   rI   rm   rn   ro   rp   r   rq   )in_featuresout_featuresN)r   r   r	   r   conv_encr   rw   r   rY   r   r   glstmskip_pathway
deconv_decr   Linearfc_realfc_imag)r    	input_dimr   enc_hid_channelsenc_kernel_sizeenc_paddingenc_last_kernel_sizeenc_last_strideenc_last_padding
enc_layersskip_last_kernel_sizeskip_last_strideskip_last_paddingglstm_groupsglstm_layersglstm_bidirectionalglstm_rearrangeoutput_channelsr=   rt   rv   r*   ru   hsenc_hdimout_chhpaddingr"   r   r   r   9  s   
<






 

zDC_CRN.__init__c                 C   s
  |}g }t | jD ]\}}||}|| q	t|}| |d }| jd |d }tj||fdd}tt| j	d D ]!}| j	| |}| j|d  ||| d  }tj||fdd}q=| j	d |}tj
|ddd\}	}
| |	}| |
}tj||gdd}|S )zDC-CRN forward.

        Args:
            x (torch.Tensor): Concatenated real and imaginary spectrum features
                (B, input_channels[0], T, F)
        Returns:
            out (torch.Tensor): (B, 2, output_channels, T, F)
        r$   r   r   r%   r   )rx   r   r   rw   r   r   r5   r:   r   r   r6   r   r   r7   )r    r;   r+   conv_outr|   r!   num_outres
deconv_out	dout_real	dout_imagout_realout_imagr   r   r   r>     s&   	

zDC_CRN.forward)r@   rA   rB   r   r   r>   rC   r   r   r"   r   r}   8  s,     #r}   )typingr   r5   torch.nnr	   espnet2.enh.layers.conv_utilsr   r   Moduler   rD   rS   rY   r}   r   r   r   r   <module>   s   X.:i