o
    i[                     @   s   d Z ddlZddlmZmZmZmZ ddlZddlm	Z
 ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlm Z m!Z!m"Z" dd	l#m$Z$ e
ej%e
d
kZ&dZ'G dd dej(j)Z*G dd dej(j)Z+dS )zDNN beamformer module.    N)ListOptionalTupleUnion)parse)
functional)ComplexTensor)apply_beamforming_vectorblind_analytic_normalizationget_gev_vectorget_lcmv_vector_with_rtfget_mvdr_vectorget_mvdr_vector_with_rtfget_mwf_vectorget_rank1_mwf_vectorget_rtf_matrixget_sdw_mwf_vectorget_WPD_filter_v2get_WPD_filter_with_rtfperform_WPD_filteringprepare_beamformer_stats)stack	to_doubleto_float)MaskEstimatorz1.9.0)mvdrmvdr_soudenmpdrmpdr_soudenwmpdrwmpdr_soudenwpd
wpd_soudenmwfwmwfsdw_mwfr1mwflcmvlcmpwlcmpgevgev_banmvdr_tfsmvdr_tfs_soudenc                +       sD  e Zd ZdZ											
											d7dedededededededededededededed ed!ed"ed#ed$ed%ed&ef* fd'd(Z	)	)d8d*e	e
jef d+e
jd,eee
j  d-eee
j  d.ee	e
jef e
je
jf f
d/d0Z	)	)	1d9d2d3Zd*e	e
jef d+e
jd.eee
jd4f e
jf fd5d6Z  ZS ):DNN_BeamformerzDNN mask based Beamformer.

    Citation:
        Multichannel End-to-end Speech Recognition; T. Ochiai et al., 2017;
        http://proceedings.mlr.press/v70/ochiai17a/ochiai17a.pdf

    blstmp   ,  @     Tsigmoid        r            ?ư>Hz>F   btypeblayersbunitsbprojsnum_spkuse_noise_mask	nonlineardropout_ratebadimref_channelbeamformer_typertf_iterationsmwf_muepsdiagonal_loadingdiag_epsmask_flooringflooring_thresuse_torch_solverbtapsbdelayc              
      s~  t    |r|d n|}t||||||	||d| _|dk r%t||
|dnd | _|| _|| _|dks6J ||| _|| _	|t
vrFtd| |dksO|dsz|sz|dkrhtd|   td	|   ntd
|   td|   || _|ds|dksJ ||| _|| _|dkr|dksJ ||f|| _| jdkr|nd| _|| _|| _|| _|| _|| _|| _d S )Nr3   )nmaskrB   r   )rI   z!Not supporting beamformer_type=%sr   _soudenzMInitializing %s beamformer without noise mask estimator (single-speaker case)zI(1 - speech_mask) will be used for estimating noise PSD in %s beamformer!zLInitializing %s beamformer without noise mask estimator (multi-speaker case)zQInterference speech masks will be used for estimating noise PSD in %s beamformer!r7   )super__init__r   maskAttentionReferencerefrE   rA   r@   rQ   BEAMFORMER_TYPES
ValueErrorendswithloggingwarningupperrF   rG   rH   rO   rP   rI   rJ   rK   rL   rM   rN   )selfbidimr<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   bnmask	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/enh/layers/dnn_beamformer.pyrT   K   sv   


zDNN_Beamformer.__init__Ndatailenspowersoracle_masksreturnc              
      s  | dddd}t|}|dur|}n ||\}} jt|ks(J t| jr4 fdd|D } jdkr jrA|\}}	n|d }d| }	 jdv rRt	d	t
||g|	| j j j jd
}
 jdv r{ j|||
d |
d |
d d\}}n7 jds jdks jdks jdks jdks jdr |||
d |
d \}}ntd j|dd}n jrt|dd }|d }	nt|}d}	t
|||	| j j j jd
}
 jdv rt|
d |
d  j j j j jd}g g }}t jD ]} jdv r$ j|||
d | |
d | |
d | d\}}n jdv r= |||
d | |
d | \}}nr jdkrZ j|||
d |
d | |
d | d\}}nU jdv rq |||
d |
d | \}}n> jdkr j|||
d |
d | ||d\}}n$ jd v r j|||
d | |
d | ||d\}}ntd j|dd}|| || qd!d |D }|||fS )"a  DNN_Beamformer forward function.

        Notation:
            B: Batch
            C: Channel
            T: Time or Sequence length
            F: Freq

        Args:
            data (torch.complex64/ComplexTensor): (B, T, C, F)
            ilens (torch.Tensor): (B,)
            powers (List[torch.Tensor] or None): used for wMPDR or WPD (B, F, T)
            oracle_masks (List[torch.Tensor] or None): oracle masks (B, F, C, T)
                if not None, oracle_masks will be used instead of self.mask
        Returns:
            enhanced (torch.complex64/ComplexTensor): (B, T, F)
            ilens (torch.Tensor): (B,)
            masks (torch.Tensor): (B, T, C, F)
        r   r0   r7   r3   Nc                    s   g | ]
}t j| jd qS ))min)torchclamprM   .0mr^   rc   rd   
<listcomp>   s    z*DNN_Beamformer.forward.<locals>.<listcomp>)r'   r(   r)   z"Single source is not supported yet)rg   rF   rP   rO   rI   )r   r   r   r!   psd_n
psd_speechpsd_distortion)rt   rR   r#   r$   r%   r&   r*   !Not supporting beamformer_type={}r6   )rJ   rE   rG   rN   rK   )r   r,   r   r!   )	r   r-   r    r"   r$   r%   r&   r*   r+   r   )r   r#   r(   )rtf_matspk)r'   r)   c                 S      g | ]}| d dqS r6   	transposerm   rc   rc   rd   rq   g      )permuter   rU   rQ   lenrL   r@   rA   rF   NotImplementedErrorr   rP   rO   rI   apply_beamformingrZ   
startswithrY   formatr}   listr   rJ   rE   rG   rN   rK   rangeappend)r^   re   rf   rg   rh   data_dmasks_mask_speech
mask_noisebeamformer_statsenhancedwsrw   ienhwrc   rp   rd   forward   s  





























	


zDNN_Beamformer.forwardr   c                    s  j dk rj jd|\} n+jdr?tj 	 dd  	df  j
tjddj f d	 nj jd
v rhtt|ttjj jjjd	}	t|	t }
nˈjdkrt|ttfsvJ fdd|D }	t fdd|	D }
t  |
 jddd}W d   n1 sw   Y  |
d|d}
t|	dd}	nxjdv rttt|jjjd}	t|	t }
nYjdkr.t|ttfsJ fdd|D }	t fdd|	D }
t  |
 jddd}W d   n	1 sw   Y  |
d|d}
t|	dd}	njdkrXtt|ttjj jjjd	}	t|	t jj }
nۈjdkryt!tt|jjd}	t|	t jj }
njdv rt"tt|jjjd}	t|	t }
njdkrt#tt|j$jjjd}	t|	t }
nzjdkrt%tt|j$jjjd}	t|	t }
nYjdv rt&t|t||jjjd }	t|	t }
n:j'd!r+t(t|td"jjd#}	t|	t }
jd$kr*t)|	t|}|
|*d% }
nt+d&,j|
j jd|	j jdfS )'a  Beamforming with the provided statistics.

        Args:
            data (torch.complex64/ComplexTensor): (B, F, C, T)
            ilens (torch.Tensor): (B,)
            psd_n (torch.complex64/ComplexTensor):
                Noise covariance matrix for MVDR (B, F, C, C)
                Observation covariance matrix for MPDR/wMPDR (B, F, C, C)
                Stacked observation covariance for WPD (B,F,(btaps+1)*C,(btaps+1)*C)
            psd_speech (torch.complex64/ComplexTensor):
                Speech covariance matrix (B, F, C, C)
            psd_distortion (torch.complex64/ComplexTensor):
                Noise covariance matrix (B, F, C, C)
            rtf_mat (torch.complex64/ComplexTensor):
                RTF matrix (B, F, C, num_spk)
            spk (int): speaker index
        Return:
            enhanced (torch.complex64/ComplexTensor): (B, F, T)
            ws (torch.complex64/ComplexTensor): (B, F) or (B, F, (btaps+1)*C)
        r   )dtyperR   Nr{   rv   )devicer   .r3   )r   r   r   
iterationsreference_vectornormalize_ref_channelrN   rJ   rK   r,   c                    s<   g | ]}t t|tt jjjjjd 	qS )r   )r   r   rG   rE   rN   rJ   rK   rn   psd_n_i)rt   rs   r^   urc   rd   rq     s    z4DNN_Beamformer.apply_beamforming.<locals>.<listcomp>c                       g | ]	}t |t qS rc   r	   r   rn   r   re   rc   rd   rq         T)dimkeepdimsr   )r   r   r    rN   rJ   rK   r-   c              
      s.   g | ]}t t t|jjjd qS )r   )r   r   rN   rJ   rK   r   )rs   r^   r   rc   rd   rq     s    	c                    r   rc   r   r   r   rc   rd   rq     r   r!   r"   )rJ   rK   )r#   r$   r%   )denoising_weightrN   rJ   rK   r&   )r(   r)   r'   )r   rN   rJ   rK   r*   power)moderJ   rK   r+   r6   ru   )-rE   rW   tor   doublerF   rZ   rk   zerossizer   fill_r   r   rG   rN   rJ   rK   r	   
isinstancer   tupler   no_gradabsargmingathersqueezer   r   r   rP   rO   r   r   r   rH   r   r   r   r   r
   	unsqueezerY   r   )r^   re   rf   rr   rs   rt   rw   rx   r   r   r   indexgainrc   )re   rt   rs   r^   r   rd   r   j  s  





	
		
z DNN_Beamformer.apply_beamforming.c              	   C   s6   |  t|dddd|\}}dd |D }||fS )a  Predict masks for beamforming.

        Args:
            data (torch.complex64/ComplexTensor): (B, T, C, F), double precision
            ilens (torch.Tensor): (B,)
        Returns:
            masks (torch.Tensor): (B, T, C, F)
            ilens (torch.Tensor): (B,)
        r   r0   r7   r3   c                 S   ry   rz   r|   rm   rc   rc   rd   rq   @  r~   z/DNN_Beamformer.predict_mask.<locals>.<listcomp>)rU   r   r   )r^   re   rf   r   r   rc   rc   rd   predict_mask2  s    zDNN_Beamformer.predict_mask)r/   r0   r1   r2   r3   Tr4   r5   r2   r6   r   r7   r8   r9   Tr:   Fr9   Tr;   r0   )NN)NNr   )__name__
__module____qualname____doc__strintboolfloatrT   r   rk   Tensorr   
LongTensorr   r   r   r   r   r   __classcell__rc   rc   ra   rd   r.   B   s    	
^
 L
 Ir.   c                       sT   e Zd Zd fdd	Z	ddeejef dejde	de
ejejf fd	d
Z  ZS )rV   r9   c                    s4   t    tj||| _tj|d| _|| _d S )Nr3   )rS   rT   rk   nnLinearmlp_psdgvecrI   )r^   r_   att_dimrI   ra   rc   rd   rT   E  s   

zAttentionReference.__init__       @psd_inrf   scalingri   c                 C   s   |  dd \}}}| d| dksJ |  |tj|tj|jdtjd}|jdd|d  dd	}|j	d |j
d  | j d
 }| |}	| t|	d}
tj||
 dd}||fS )a%  Attention-based reference forward function.

        Args:
            psd_in (torch.complex64/ComplexTensor): (B, F, C, C)
            ilens (torch.Tensor): (B,)
            scaling (float):
        Returns:
            u (torch.Tensor): (B, C)
            ilens (torch.Tensor): (B,)
        Nr0   r7   )r   r   r   r6   r   r3   rv   g      ?)r   masked_fillrk   eyer   r   typesumr}   realimagrI   r   r   tanhr   Fsoftmax)r^   r   rf   r   Br   Cpsdpsd_featr   er   rc   rc   rd   r   K  s    
zAttentionReference.forward)r9   )r   )r   r   r   rT   r   rk   r   r   r   r   r   r   r   rc   rc   ra   rd   rV   D  s    
rV   ),r   r[   typingr   r   r   r   rk   packaging.versionr   Vtorch.nnr   r   torch_complex.tensorr   espnet2.enh.layers.beamformerr	   r
   r   r   r   r   r   r   r   r   r   r   r   r    espnet2.enh.layers.complex_utilsr   r   r   !espnet2.enh.layers.mask_estimatorr   __version__is_torch_1_9_plusrX   r   Moduler.   rV   rc   rc   rc   rd   <module>   s$    @#    