o
    i                     @   s   d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
Z
 d dlm
  mZ d dlmZ d dlmZmZ d dlmZ dd ZG d	d
 d
e
jZG dd de
jZG dd deZdS )    N)OrderedDict)DictListOptionalTuple)DPMulCat)merge_featuresplit_feature)AbsSeparatorc                 C   s   |   dd }|   dd \}}t||}|| }|| }||d  | }|| }	| jg |d|R  }
td|	d||}|  	 
| j}| d}| jg ||	|R  }|d||
 |jg |dR  }|S )a  Reconstructs a signal from a framed representation.

        Adds potentially overlapping frames of a signal with shape
        `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
        The resulting tensor has shape `[..., output_size]` where
            output_size = (frames - 1) * frame_step + frame_length

        Args:
            signal: A [..., frames, frame_length] Tensor. All dimensions may be unknown,
                and rank must be at least 2.
            frame_step: An integer denoting overlap offsets.
                Must be less than or equal to frame_length.

        Returns:
            A Tensor with shape [..., output_size] containing the
                overlap-added frames of signal's inner-most two dimensions.
            output_size = (frames - 1) * frame_step + frame_length

        Based on

    https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
    N   r   )sizemathgcdviewtorcharangeunfoldclonedetachlongtodevice
contiguous	new_zeros
index_add_)signal
frame_stepouter_dimensionsframesframe_lengthsubframe_lengthsubframe_stepsubframes_per_frameoutput_sizeoutput_subframessubframe_signalframeresult r*   Z/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/enh/separator/svoice_separator.pyoverlap_and_add   s"   r,   c                       s.   e Zd Zdedef fddZdd Z  ZS )Encoderenc_kernel_sizeenc_feat_dimc                    s2   t    tjd|||d dd| _t | _d S )Nr      F)kernel_sizestridebias)super__init__nnConv1dconvReLU	nonlinear)selfr.   r/   	__class__r*   r+   r5   ?   s   
zEncoder.__init__c                 C   s    t |d}| | |}|S )Nr   )r   	unsqueezer:   r8   )r;   mixture	mixture_wr*   r*   r+   forwardK   s   zEncoder.forward)__name__
__module____qualname__intr5   rA   __classcell__r*   r*   r<   r+   r-   >   s    r-   c                       s$   e Zd Z fddZdd Z  ZS )Decoderc                    s   t    || _d S N)r4   r5   r1   )r;   r1   r<   r*   r+   r5   R   s   

zDecoder.__init__c                 C   s6   t |dd}td| jf|}t|| jd }|S )Nr0      r   )r   	transposer6   	AvgPool2dr1   r,   )r;   
est_sourcer*   r*   r+   rA   V   s   zDecoder.forward)rB   rC   rD   r5   rA   rF   r*   r*   r<   r+   rG   Q   s    rG   c                       s   e Zd ZdZ					ddeded	ed
edededededef fddZ	ddejdejde	e
 deeej ejef fddZedd Z  ZS )SVoiceSeparatora  SVoice model for speech separation.

    Reference:
        Voice Separation with an Unknown Number of Multiple Speakers;
        E. Nachmani et al., 2020;
        https://arxiv.org/abs/2003.01531

    Args:
        enc_dim: int, dimension of the encoder module's output. (Default: 128)
        kernel_size: int, the kernel size of Conv1D layer in both encoder and
            decoder modules. (Default: 8)
        hidden_size: int, dimension of the hidden state in RNN layers. (Default: 128)
        num_spk: int, the number of speakers in the output. (Default: 2)
        num_layers: int, number of stacked MulCat blocks. (Default: 4)
        segment_size: dual-path segment size. (Default: 20)
        bidirectional: bool, whether the RNN layers are bidirectional. (Default: True)
        input_normalize: bool, whether to apply GroupNorm on the input Tensor.
            (Default: False)
    r0         TF	input_dimenc_dimr1   hidden_sizenum_spk
num_layerssegment_sizebidirectionalinput_normalizec
           
   	      sN   t    || _|| _|| _t||| _t|| _t	|||||||	d| _
d S )N)
input_sizerR   r%   rS   rT   rV   rW   )r4   r5   _num_spkrQ   rU   r-   encoderrG   decoderr   	rnn_model)
r;   rP   rQ   r1   rR   rS   rT   rU   rV   rW   r<   r*   r+   r5   s   s   

zSVoiceSeparator.__init__Ninputilens
additionalreturnc                 C   s   | d}| |}t|| j\}}| |}g }	tt|D ]@}
t||
 |}||j	d | j
| j|j	d }| |}| d}t|d|| f}t|jdd}| jr]|	| q|}	qi }|	||fS )a  Forward.

        Args:
            input (torch.Tensor or ComplexTensor): Encoded feature [B, T, N]
            ilens (torch.Tensor): input lengths [Batch]
            additional (Dict or None): other data included in model
                NOTE: not used in this model

        Returns:
            masked (List[Union(torch.Tensor, ComplexTensor)]): [(B, T, N), ...]
            ilens (torch.Tensor): (B,)
            others predicted data, e.g. masks: OrderedDict[
                'mask_spk1': torch.Tensor(Batch, Frames, Freq),
                'mask_spk2': torch.Tensor(Batch, Frames, Freq),
                ...
                'mask_spkn': torch.Tensor(Batch, Frames, Freq),
            ]
        r   r   r0   r   )dim)r   rZ   r	   rU   r\   rangelenr   r   shaperY   rQ   r[   Fpadlistunbindtrainingappend)r;   r]   r^   r_   T_mixr@   enc_segmentsenc_rest
output_alloutputsii	output_iiT_estothersr*   r*   r+   rA      s&   





zSVoiceSeparator.forwardc                 C   s   | j S rH   )rY   )r;   r*   r*   r+   rS      s   zSVoiceSeparator.num_spk)r0   rN   rO   TFrH   )rB   rC   rD   __doc__rE   boolr5   r   Tensorr   r   r   r   r   rA   propertyrS   rF   r*   r*   r<   r+   rM   ^   sL    	
"
4rM   )r   collectionsr   typingr   r   r   r   r   torch.nnr6   torch.nn.functional
functionalre   espnet2.enh.layers.dpmulcatr   espnet2.enh.layers.dprnnr   r	   #espnet2.enh.separator.abs_separatorr
   r,   Moduler-   rG   rM   r*   r*   r*   r+   <module>   s    0