o
    siy                     @   s  d dl mZmZmZ d dlZd dlZd dlmZ d dlZddl	m
Z
 ddl	mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ G dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd deZ#edee$ fddZ%edee$ fdd Z&G d!d" d"ejZ'G d#d$ d$ejZ(G d%d& d&ejZ)G d'd( d(e)Z*G d)d* d*e)Z+dS )+    )ListTupleOptionalN)nn   )
complex_nn   )normsactivations)BaseDCUMaskNet)GlobLN)has_arg)DCUNET_ARCHITECTURES)_DilatedConvNorm_NormAct_ConvNormAct	_ConvNorm)script_if_tracing
pad_x_to_yc                       s(   e Zd ZdZ fddZdd Z  ZS )_Chop1dz5To ensure the output length is the same as the input.c                    s   t    || _d S N)super__init__	chop_size)selfr   	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/asteroid/masknn/convolutional.pyr      s   

z_Chop1d.__init__c                 C   s   |dd | j  f  S )N.)r   
contiguousr   xr   r   r   forward   s   z_Chop1d.forward__name__
__module____qualname____doc__r   r"   __classcell__r   r   r   r   r      s    r   c                       s.   e Zd ZdZ		d fdd	Zdd Z  ZS )	Conv1DBlocka  One dimensional convolutional block, as proposed in [1].

    Args:
        in_chan (int): Number of input channels.
        hid_chan (int): Number of hidden channels in the depth-wise
            convolution.
        skip_out_chan (int): Number of channels in the skip convolution.
            If 0 or None, `Conv1DBlock` won't have any skip connections.
            Corresponds to the the block in v1 or the paper. The `forward`
            return res instead of [res, skip] in this case.
        kernel_size (int): Size of the depth-wise convolutional kernel.
        padding (int): Padding of the depth-wise convolution.
        dilation (int): Dilation of the depth-wise convolution.
        norm_type (str, optional): Type of normalization to use. To choose from

            -  ``'gLN'``: global Layernorm.
            -  ``'cLN'``: channelwise Layernorm.
            -  ``'cgLN'``: cumulative global Layernorm.
            -  Any norm supported by :func:`~.norms.get`
        causal (bool, optional) : Whether or not the convolutions are causal


    References
        [1] : "Conv-TasNet: Surpassing ideal time-frequency magnitude masking
        for speech separation" TASLP 2019 Yi Luo, Nima Mesgarani
        https://arxiv.org/abs/1809.07454
    gLNFc	              	      s   t t|   || _t|}	t||d}
tj||||||d}|r+t|t	|}t|
t
 |	||t
 |	|| _t||d| _|rRt||d| _d S d S )Nr   )paddingdilationgroups)r   r)   r   skip_out_chanr	   getr   Conv1d
Sequentialr   PReLUshared_blockres_conv	skip_conv)r   in_chanhid_chanr.   kernel_sizer+   r,   	norm_typecausal	conv_norm	in_conv1ddepth_conv1dr   r   r   r   :   s*   
zConv1DBlock.__init__c                 C   s0   |  |}| |}| js|S | |}||fS )z"Input shape $(batch, feats, seq)$.)r3   r4   r.   r5   )r   r!   
shared_outres_outskip_outr   r   r   r"   Z   s   


zConv1DBlock.forward)r*   Fr#   r   r   r   r   r)      s    $ r)   c                       sF   e Zd ZdZ											d fd
d	Zdd Zdd Z  ZS )	TDConvNeta  Temporal Convolutional network used in ConvTasnet.

    Args:
        in_chan (int): Number of input filters.
        n_src (int): Number of masks to estimate.
        out_chan (int, optional): Number of bins in the estimated masks.
            If ``None``, `out_chan = in_chan`.
        n_blocks (int, optional): Number of convolutional blocks in each
            repeat. Defaults to 8.
        n_repeats (int, optional): Number of repeats. Defaults to 3.
        bn_chan (int, optional): Number of channels after the bottleneck.
        hid_chan (int, optional): Number of channels in the convolutional
            blocks.
        skip_chan (int, optional): Number of channels in the skip connections.
            If 0 or None, TDConvNet won't have any skip connections and the
            masks will be computed from the residual output.
            Corresponds to the ConvTasnet architecture in v1 or the paper.
        conv_kernel_size (int, optional): Kernel size in convolutional blocks.
        norm_type (str, optional): To choose from ``'BN'``, ``'gLN'``,
            ``'cLN'``.
        mask_act (str, optional): Which non-linear function to generate mask.
        causal (bool, optional) : Whether or not the convolutions are causal.

    References
        [1] : "Conv-TasNet: Surpassing ideal time-frequency magnitude masking
        for speech separation" TASLP 2019 Yi Luo, Nima Mesgarani
        https://arxiv.org/abs/1809.07454
    N            r*   reluFc                    s`  t t|   || _|| _|r|n|}|| _|| _|| _|| _|| _	|| _
|	| _|
| _|| _|| _t|
|}t||d}t||| _t | _t|D ]0}t|D ])}|sd|	d d|  d }n|	d d|  }| jt||||	|d| |
|d qUqO|r|n|}t||| d}tt || _t|}t|dr|dd| _d S | | _d S )Nr   r   )r+   r,   r9   r:   dimrG   )r   rA   r   r6   n_srcout_chann_blocks	n_repeatsbn_chanr7   	skip_chanconv_kernel_sizer9   mask_actr:   r	   r/   r   r0   r1   
bottleneck
ModuleListTCNrangeappendr)   r2   mask_netr
   r   
output_act)r   r6   rI   rJ   rK   rL   rM   r7   rN   rO   r9   rP   r:   
layer_normbottleneck_convrr!   r+   mask_conv_inp	mask_convmask_nl_classr   r   r   r      sV   


zTDConvNet.__init__c                 C   s   |  \}}}| |}tjdg|jd}| jD ]}||}| jr*|\}	}
||
 }n|}	||	 }q| jr6|n|}| |}||| j	| j
|}| |}|S )Forward.

        Args:
            mixture_w (:class:`torch.Tensor`): Tensor of shape $(batch, nfilters, nframes)$

        Returns:
            :class:`torch.Tensor`: estimated mask of shape $(batch, nsrc, nfilters, nframes)$
                )device)sizerQ   torchtensorr`   rS   rN   rV   viewrI   rJ   rW   )r   	mixture_wbatch_n_framesoutputskip_connectionlayertcn_outresidualskipmask_inpscoreest_maskr   r   r   r"      s   	





zTDConvNet.forwardc                 C   s:   | j | j| j| j| j| j| j| j| j| j	| j
| jd}|S )Nr6   rJ   rM   r7   rN   rO   rK   rL   rI   r9   rP   r:   rr   r   configr   r   r   
get_config   s   zTDConvNet.get_config)
NrB   rC   rD   rE   rD   rC   r*   rF   Fr$   r%   r&   r'   r   r"   ru   r(   r   r   r   r   rA   d   s    !@rA   c                       sD   e Zd ZdZ									d fd	d
	Zdd Zdd Z  ZS )TDConvNetppuo  Improved Temporal Convolutional network used in [1] (TDCN++)

    Args:
        in_chan (int): Number of input filters.
        n_src (int): Number of masks to estimate.
        out_chan (int, optional): Number of bins in the estimated masks.
            If ``None``, `out_chan = in_chan`.
        n_blocks (int, optional): Number of convolutional blocks in each
            repeat. Defaults to 8.
        n_repeats (int, optional): Number of repeats. Defaults to 3.
        bn_chan (int, optional): Number of channels after the bottleneck.
        hid_chan (int, optional): Number of channels in the convolutional
            blocks.
        skip_chan (int, optional): Number of channels in the skip connections.
            If 0 or None, TDConvNet won't have any skip connections and the
            masks will be computed from the residual output.
            Corresponds to the ConvTasnet architecture in v1 or the paper.
        kernel_size (int, optional): Kernel size in convolutional blocks.
        norm_type (str, optional): To choose from ``'BN'``, ``'gLN'``,
            ``'cLN'``.
        mask_act (str, optional): Which non-linear function to generate mask.

    References
        [1] : Kavalerov, Ilya et al. “Universal Sound Separation.” in WASPAA 2019

    .. note::
        The differences wrt to ConvTasnet's TCN are:

        1. Channel wise layer norm instead of global
        2. Longer-range skip-residual connections from earlier repeat inputs
           to later repeat inputs after passing them through dense layer.
        3. Learnable scaling parameter after each dense layer. The scaling
           parameter for the second dense  layer  in  each  convolutional
           block (which  is  applied  rightbefore the residual connection) is
           initialized to an exponentially decaying scalar equal to 0.9**L,
           where L is the layer or block index.

    NrB   rC   rD   rE   fgLNrF   c                    s  t    || _|| _|r|n|}|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _t|
|}t||d}t||| _t | _t|D ]$}t|D ]}|	d d|  d }| jt||||	|d| |
d qPqJt | _t|d D ]}| jt||d qztdd td|D }|d||d  }tj|dd| _|r|n|}t||| d}tt  || _!t"|}t#|d	r|dd
| _$n| | _$|r|n|}t%||| _&d S )Nr   r   )r+   r,   r9   c                 S   s   g | ]}d | qS )g?r   ).0lr   r   r   
<listcomp>L      z(TDConvNetpp.__init__.<locals>.<listcomp>r   T)requires_gradrG   rH   )'r   r   r6   rI   rJ   rK   rL   rM   r7   rN   rO   r9   rP   r	   r/   r   r0   r1   rQ   rR   rS   rT   rU   r)   
dense_skiprb   Tensor	unsqueezeexpandclone	Parameterscaling_paramr2   rV   r
   r   rW   Linearconsistency)r   r6   rI   rJ   rK   rL   rM   r7   rN   rO   r9   rP   rX   rY   rZ   r!   r+   r   r[   r\   r]   out_sizer   r   r   r     s^   




zTDConvNetpp.__init__c                 C   s*  |  \}}}| |}|}d}t| jD ]P}|dkr(| j|d  || }|}t| jD ]7}	|| j |	 }
| j|
 |}| jrI|\}}|| }n|\}}|	dkrZ| j||	d f nd}|| }|| }q-q| jrk|n|}| 	|}|
|| j| j|}| |}| |d}tjj|d}||fS )r^   r_   r   r   g      ?)ra   rQ   rT   rL   r~   rK   rS   rN   r   rV   rd   rI   rJ   rW   r   meanrb   r   
functionalsoftmax)r   re   rf   	n_filtersrh   ri   output_copyrj   rZ   r!   irl   rm   rn   rg   scalero   rp   rq   weightsr   r   r   r"   ^  s4   	




zTDConvNetpp.forwardc                 C   s6   | j | j| j| j| j| j| j| j| j| j	| j
d}|S )Nr6   rJ   rM   r7   rN   rO   rK   rL   rI   r9   rP   r   rs   r   r   r   ru     s   zTDConvNetpp.get_config)	NrB   rC   rD   rE   rD   rC   rx   rF   rv   r   r   r   r   rw      s    +F-rw   c                       s6   e Zd ZdZ		d	 fdd	ZdejfddZ  ZS )
DCUNetComplexEncoderBlocka  Encoder block as proposed in [1].

    Args:
        in_chan (int): Number of input channels.
        out_chan (int): Number of output channels.
        kernel_size (Tuple[int, int]): Convolution kernel size.
        stride (Tuple[int, int]): Convolution stride.
        padding (Tuple[int, int]): Convolution padding.
        norm_type (str, optional): Type of normalization to use.
            See :mod:`~asteroid.masknn.norms` for valid values.
        activation (str, optional): Type of activation to use.
            See :mod:`~asteroid.masknn.activations` for valid values.

    References
        [1] : "Phase-aware Speech Enhancement with Deep Complex U-Net",
        Hyeong-Seok Choi et al. https://arxiv.org/abs/1903.03107
    bN
leaky_reluc           	         sL   t    tj||||||d u d| _t||| _t|}| | _	d S N)bias)
r   r   r   ComplexConv2dconvr	   get_complexnormr
   
activation)	r   r6   rJ   r8   strider+   r9   r   activation_classr   r   r   r     s   


z"DCUNetComplexEncoderBlock.__init__r!   c                 C      |  | | |S r   )r   r   r   r    r   r   r   r"        z!DCUNetComplexEncoderBlock.forward)r   r   	r$   r%   r&   r'   r   r   ComplexTensorr"   r(   r   r   r   r   r     s    r   c                       s8   e Zd ZdZ			d
 fdd	Zdejfdd	Z  ZS )DCUNetComplexDecoderBlocka  Decoder block as proposed in [1].

    Args:
        in_chan (int): Number of input channels.
        out_chan (int): Number of output channels.
        kernel_size (Tuple[int, int]): Convolution kernel size.
        stride (Tuple[int, int]): Convolution stride.
        padding (Tuple[int, int]): Convolution padding.
        norm_type (str, optional): Type of normalization to use.
            See :mod:`~asteroid.masknn.norms` for valid values.
        activation (str, optional): Type of activation to use.
            See :mod:`~asteroid.masknn.activations` for valid values.

    References
        [1] : "Phase-aware Speech Enhancement with Deep Complex U-Net",
        Hyeong-Seok Choi et al. https://arxiv.org/abs/1903.03107
    r   r   r   r   c	           
   	      sr   t    || _|| _|| _|| _|| _|| _tj	|||||||d u d| _
t||| _t|}	|	 | _d S r   )r   r   r6   rJ   r8   r   r+   output_paddingr   ComplexConvTranspose2ddeconvr	   r   r   r
   r   )
r   r6   rJ   r8   r   r+   r   r9   r   r   r   r   r   r     s   

z"DCUNetComplexDecoderBlock.__init__r!   c                 C   r   r   )r   r   r   r    r   r   r   r"     r   z!DCUNetComplexDecoderBlock.forward)r   r   r   r   r   r   r   r   r     s    r   c                       s6   e Zd ZdZeZd	 fdd	Zdd Zdd Z  Z	S )

DCUMaskNeta  Masking part of DCUNet, as proposed in [1].

    Valid `architecture` values for the ``default_architecture`` classmethod are:
    "Large-DCUNet-20", "DCUNet-20", "DCUNet-16", "DCUNet-10" and "mini".

    Valid `fix_length_mode` values are [None, "pad", "trim"].

    Input shape is expected to be $(batch, nfreqs, time)$, with $nfreqs - 1$ divisible
    by $f_0 * f_1 * ... * f_N$ where $f_k$ are the frequency strides of the encoders,
    and $time - 1$ is divisible by $t_0 * t_1 * ... * t_N$ where $t_N$ are the time
    strides of the encoders.

    References
        [1] : "Phase-aware Speech Enhancement with Deep Complex U-Net",
        Hyeong-Seok Choi et al. https://arxiv.org/abs/1903.03107
    Nc                    s|   || _ tjdd |D dd| _ddlm m t jdfdd|D  fdd|d d	 D t	j
|d	  d
| d S )Nc                 S   s   g | ]	\}}}}}|qS r   r   )ry   rg   
enc_strider   r   r   r{     s    z'DCUMaskNet.__init__.<locals>.<listcomp>r   )axisr   r   r   c                       g | ]} | qS r   r   ry   args)r   r   r   r{     r|   c                    r   r   r   r   )r   r   r   r{     r|   r   )encodersdecodersoutput_layerr   )fix_length_modenpprodencoders_stride_productconvolutionalr   r   r   r   r   r   )r   r   r   r   kwargsr   r   r   r     s   
zDCUMaskNet.__init__c                 C   s   t | j|t| jS r   )_fix_dcu_input_dimsr   rb   
from_numpyr   r    r   r   r   fix_input_dims   s   zDCUMaskNet.fix_input_dimsc                 C   s   t | j||S r   )_fix_dcu_output_dimsr   )r   outr!   r   r   r   fix_output_dims%  s   zDCUMaskNet.fix_output_dimsr   )
r$   r%   r&   r'   r   _architecturesr   r   r   r(   r   r   r   r   r     s    r   r   c                 C   s   t |d }t |d }|jd d | r!td| d|j d|jd d | }|rm| du r<td| d|j d	| d
krQd|| g}tjj||dd}|S | dkred| g}tjj||dd}|S td|  d|S )z3Pad or trim `x` to a length compatible with DCUNet.r   r   zGInput shape must be [batch, freq + 1, time + 1] with freq divisible by z, got z insteadr   NzGInput shape must be [batch, freq + 1, time + 1] with time divisible by zh instead. Set the 'fix_length_mode' argument in 'DCUNet' to 'pad' or 'trim' to fix shapes automatically.padconstant)modetrimzUnknown fix_length mode '')intshape	TypeErrorr   r   r   
ValueError)r   r!   r   	freq_prod	time_prodtime_remainder	pad_shaper   r   r   r   )  s<   
r   c                 C   s
   t ||S )z0Fix shape of `out` to the original shape of `x`.)r   )r   r   r!   r   r   r   r   F  s   
r   c                       :   e Zd ZdZ				d fdd	Zdd	 Zd
d Z  ZS )SuDORMRFaG  SuDORMRF mask network, as described in [1].

    Args:
        in_chan (int): Number of input channels. Also number of output channels.
        n_src (int): Number of sources in the input mixtures.
        bn_chan (int, optional): Number of bins in the bottleneck layer and the UNet blocks.
        num_blocks (int): Number of of UBlocks.
        upsampling_depth (int): Depth of upsampling.
        mask_act (str): Name of output activation.

    References
        [1] : "Sudo rm -rf: Efficient Networks for Universal Audio Source Separation",
        Tzinis et al. MLSP 2020.
    rD         r   c                    s   t    | _|| _ | _|| _| _|| _tj	ddd| _
tj dd| _tj fddt|D  | _ krGtj dd| _tjd|d dfd  dfd	| _t|}t|d
rm|dd| _d S | | _d S )Nr   g:0yE>)epsr8   c                       g | ]	}t  d qS )rJ   r6   upsampling_depth)UBlockry   rg   rM   r6   r   r   r   r{   s      z%SuDORMRF.__init__.<locals>.<listcomp>r   r   )r8   r+   rG   rH   )r   r   r6   rI   rM   
num_blocksr   rP   r   	GroupNormlnr0   l1r1   rT   smreshape_before_masksConv2dmr
   r/   r   rW   )r   r6   rI   rM   r   r   rP   r]   r   r   r   r   \  s4   
	


zSuDORMRF.__init__c                 C   sR   |  |}| |}| |}| j| jkr| |}| |d}| |}|S )Nr   )	r   r   r   rM   r6   r   r   r   rW   r    r   r   r   r"     s   




zSuDORMRF.forwardc                 C   "   | j | j| j| j| j| jd}|S Nr6   rI   rM   r   r   rP   r   rs   r   r   r   ru        zSuDORMRF.get_config)rD   r   r   r   rv   r   r   r   r   r   L  s    4r   c                       r   )SuDORMRFImprovedaO  Improved SuDORMRF mask network, as described in [1].

    Args:
        in_chan (int): Number of input channels. Also number of output channels.
        n_src (int): Number of sources in the input mixtures.
        bn_chan (int, optional): Number of bins in the bottleneck layer and the UNet blocks.
        num_blocks (int): Number of of UBlocks
        upsampling_depth (int): Depth of upsampling
        mask_act (str): Name of output activation.


    References
        [1] : "Sudo rm -rf: Efficient Networks for Universal Audio Source Separation",
        Tzinis et al. MLSP 2020.
    rD   r   r   rF   c           	         s   t    | _|| _ | _|| _| _|| _t| _	t
j dd| _t
j fddt|D  | _t
 | d}t
t
 || _t|}t|drZ|dd| _d S | | _d S )Nr   r   c                    r   r   )
UConvBlockr   r   r   r   r{     r   z-SuDORMRFImproved.__init__.<locals>.<listcomp>rG   rH   )r   r   r6   rI   rM   r   r   rP   r   r   r   r0   rQ   r1   rT   r   r2   rV   r
   r/   r   rW   )	r   r6   rI   rM   r   r   rP   r\   r]   r   r   r   r     s(   
	


zSuDORMRFImproved.__init__c                 C   sP   |  |}| |}| |}| |}||jd | j| jd}| |}|S )Nr   r   )	r   rQ   r   rV   rd   r   rI   r6   rW   r    r   r   r   r"     s   




zSuDORMRFImproved.forwardc                 C   r   r   r   rs   r   r   r   ru     r   zSuDORMRFImproved.get_config)rD   r   r   rF   rv   r   r   r   r   r     s    ,
r   c                       s   e Zd Zd fdd	Z  ZS )_BaseUBlockrD   rE   r   Fc                    s   t    t||ddd|d| _|| _t | _| jt	||dd|d|d t
d|D ]}|dkr5d}nd}| jt	||d| d ||d|d q,|dkrYtjjdd| _d S d S )Nr   )r   r-   
use_globln   )kSizer   r-   dr   r   r   )scale_factor)r   r   r   proj_1x1depthr   rR   spp_dwrU   r   rT   rb   Upsample	upsampler)r   rJ   r6   r   r   r   r   r   r   r   r     sJ   


z_BaseUBlock.__init__)rD   rE   r   F)r$   r%   r&   r   r(   r   r   r   r   r     s    r   c                       *   e Zd ZdZd	 fdd	Zdd Z  ZS )
r   zlUpsampling block.

    Based on the following principle: ``REDUCE ---> SPLIT ---> TRANSFORM --> MERGE``
    rD   rE   r   c                    s@   t  j|||dd t||dddd| _t|| _t|| _d S )NFr   r   )r-   )r   r   r   conv_1x1_expr   
final_norm
module_actr   rJ   r6   r   r   r   r   r   .  s   
zUBlock.__init__c           	      C   s   |  |}| jd |g}td| jD ]}| j| |d }|| qt| jd D ]}| |d}|d |dd|d jd f  |d< q+| | 	|d }| 
|| S )zn
        Args:
            x: input feature map

        Returns:
            transformed feature map
        r   r   r   .N)r   r   rT   r   rU   r   popr   r   r   r   )	r   r!   output1ri   kout_krg   resampled_out_kexpandedr   r   r   r"   4  s   

(zUBlock.forwardrD   rE   r   r#   r   r   r   r   r   (  s    r   c                       r   )
r   zBlock which performs successive downsampling and upsampling
    in order to be able to analyze the input features in multiple resolutions.
    rD   rE   r   c                    s6   t  j|||dd t|dd| _t||d| _d S )NTr   r   )r   r   r   r   r   r0   r4   r   r   r   r   r   U  s   zUConvBlock.__init__c           
      C   s   |  }| |}| jd |g}td| jD ]}| j| |d }|| qt| jd D ]}| |d}|d |dd|d jd f  |d< q/| 	|d }	| 
|	| S )zm
        Args
            x: input feature map

        Returns:
            transformed feature map
        r   r   r   .N)r   r   r   rT   r   rU   r   r   r   r   r4   )
r   r!   rm   r   ri   r   r   rg   r   r   r   r   r   r"   Z  s   
(zUConvBlock.forwardr   r#   r   r   r   r   r   P  s    r   ),typingr   r   r   numpyr   rb   r   warnings r   r	   r
   baser   r   utilsr   _dcunet_architecturesr   _localr   r   r   r   utils.torch_utilsr   r   Moduler   r)   rA   rw   r   r   r   strr   r   r   r   r   r   r   r   r   r   r   <module>   s>    G  -,4-]S,(