o
    i                     @   s   d dl mZ d dlmZ d dlmZmZmZmZm	Z	 d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ ee
jed
kZG dd deZdS )    )OrderedDict)LooseVersion)DictListOptionalTupleUnionN)ComplexTensor)
is_complex)DPTNet)choose_norm)AbsSeparatorz1.9.0c                       s   e Zd Z												
		d)dedededededededededededededef fddZ	d*dee	j
ef de	j
dee d eeee	j
ef  e	j
ef fd!d"Zd#d$ Zd*d%d&Zed'd( Z  ZS )+DPTNetSeparatorTlstm   F              relugLN      	input_dimpost_enc_relurnn_typebidirectionalnum_spkpredict_noiseunit	att_headsdropout
activation	norm_typelayersegment_size	nonlinearc                    s   t    || _|| _|| _|| _t||| _| jr| jd n| j| _	t
||||| j	 ||	|
|||d
| _tjtj||dtj | _tjtj||dtj | _|dvrbtd|tj tj tj d| | _dS )a  Dual-Path Transformer Network (DPTNet) Separator

        Args:
            input_dim: input feature dimension
            rnn_type: string, select from 'RNN', 'LSTM' and 'GRU'.
            bidirectional: bool, whether the inter-chunk RNN layers are bidirectional.
            num_spk: number of speakers
            predict_noise: whether to output the estimated noise signal
            unit: int, dimension of the hidden state.
            att_heads: number of attention heads.
            dropout: float, dropout ratio. Default is 0.
            activation: activation function applied at the output of RNN.
            norm_type: type of normalization to use after each inter- or
                intra-chunk Transformer block.
            nonlinear: the nonlinear function for mask estimation,
                       select from 'relu', 'tanh', 'sigmoid'
            layer: int, number of stacked RNN layers. Default is 3.
            segment_size: dual-path segment size
           )
r   
input_sizehidden_sizeoutput_sizer   r    r!   
num_layersr   r"   )sigmoidr   tanhzNot supporting nonlinear={}N)super__init___num_spkr   r$   r   r   enc_LNr   num_outputsr   dptnettorchnn
SequentialConv1dTanhoutputSigmoidoutput_gate
ValueErrorformatReLUr%   )selfr   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/enh/separator/dptnet_separator.pyr.      sB   
$
zDPTNetSeparator.__init__Ninputilens
additionalreturnc                    s0  t  r	t }n| jrtjj }n }|j\}}}|dd}| 	|}| 
|}| |}	|	|| j d|	d|	d}	| j|	|d}	| |	| |	 }	|	|| j||}
| |
ddjdd}
| jrq|
^ }
} fdd|
D }ttd	d tt|
D |
}| jr | |d
< |||fS )a  Forward.

        Args:
            input (torch.Tensor or ComplexTensor): Encoded feature [B, T, N]
            ilens (torch.Tensor): input lengths [Batch]
            additional (Dict or None): other data included in model
                NOTE: not used in this model

        Returns:
            masked (List[Union(torch.Tensor, ComplexTensor)]): [(B, T, N), ...]
            ilens (torch.Tensor): (B,)
            others predicted data, e.g. masks: OrderedDict[
                'mask_spk1': torch.Tensor(Batch, Frames, Freq),
                'mask_spk2': torch.Tensor(Batch, Frames, Freq),
                ...
                'mask_spkn': torch.Tensor(Batch, Frames, Freq),
            ]
        r&   r   )length)dimc                    s   g | ]} | qS rA   rA   ).0mrC   rA   rB   
<listcomp>   s    z+DPTNetSeparator.forward.<locals>.<listcomp>c                 S   s   g | ]	}d  |d qS )z
mask_spk{}r&   )r<   )rK   irA   rA   rB   rN      s    noise1)r
   absr   r3   r4   
functionalr   shape	transposer0   split_featurer2   reshaper1   sizemerge_featurer8   r:   r%   unbindr   r   ziprangelen)r>   rC   rD   rE   featureBTN	segmented	processedmasks
mask_noisemaskedothersrA   rM   rB   forward[   s4   





zDPTNetSeparator.forwardc                 C   sR   |  \}}}tjjj|d| jdf| jdf| jd dfd}|||| jdS )NrG   r&   r   r   )kernel_sizepaddingstride)rW   r3   r4   rR   unfold	unsqueezer$   rV   )r>   xr^   r`   r_   unfoldedrA   rA   rB   rU      s   zDPTNetSeparator.split_featurec                 C   s   |  \}}}}| jd }|d u r|d | | }d}nd|f}|||| |}	tjjj|	d|fd|f|d|fd}tjjjt|	d|fd|f|d|fd}
||
 }||||S )Nr   r&   r   )r)   rh   ri   rj   )rC   r)   rh   ri   rj   )rW   r$   rV   r3   r4   rR   fold	ones_like)r>   rm   rI   r^   r`   Ln_chunkshop_sizeri   seqnorm_matrA   rA   rB   rX      s.   
zDPTNetSeparator.merge_featurec                 C   s   | j S N)r/   )r>   rA   rA   rB   r      s   zDPTNetSeparator.num_spk)Tr   Tr   Fr   r   r   r   r   r   r   r   rv   )__name__
__module____qualname__intboolstrfloatr.   r   r3   Tensorr	   r   r   r   r   r   rg   rU   rX   propertyr   __classcell__rA   rA   r?   rB   r      sr    	
N
C

r   )collectionsr   distutils.versionr   typingr   r   r   r   r   r3   torch_complex.tensorr	    espnet2.enh.layers.complex_utilsr
   espnet2.enh.layers.dptnetr   espnet2.enh.layers.tcnr   #espnet2.enh.separator.abs_separatorr   __version__is_torch_1_9_plusr   rA   rA   rA   rB   <module>   s    