o
    iN                     @   s   d Z ddlmZmZmZmZmZ ddlZddlm	Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  e
ej!e
dkZ"e#e$ j%Z&G dd de Z'dS )zEnhancement model module.    )DictListOptionalOrderedDictTupleN)parse)check_argument_types)AbsMask)
AbsDecoder)
AbsEncoder)FrequencyDomainLoss)TimeDomainLoss)AbsLossWrapper)AbsSeparator)DANSeparator)force_gatherable)AbsESPnetModelz1.9.0c                       sn  e Zd ZdZ			d.dedededee d	e	e
 d
ededee f fddZ	d/dejdejdeejeeejf ejf fddZ	d/dejdejdee deejejejf fddZ		d0dejdejdejdejdedejdejdejdeejeeejf ejf fdd Zd1d"d#Zd$d% Zd&d' Zd(d) Zed*d+ Zdejdejdeeejf fd,d-Z  ZS )2ESPnetEnhancementModelz/Speech enhancement or separation Frontend modelFmask_mseNencoder	separatordecodermask_moduleloss_wrappersstft_consistency	loss_type	mask_typec	           
         s   t  sJ t   || _|| _|| _|| _|j| _t| jdd| _	|| _
dd | j
D }	tt|	t|	kr>td|	|rD| nd | _|| _|| _t| jdd| _d S )Nnum_noise_type   c                 S   s   g | ]}|j jqS  )	criterionname).0wr   r   L/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/enh/espnet_model.py
<listcomp>2   s    z3ESPnetEnhancementModel.__init__.<locals>.<listcomp>z)Duplicated loss names are not allowed: {}ref_channel)r   super__init__r   r   r   r   num_spkgetattrr   r   lenset
ValueErrorformatupperr   r   r   r&   )
selfr   r   r   r   r   r   r   r   names	__class__r   r$   r)      s    

zESPnetEnhancementModel.__init__
speech_mixspeech_mix_lengthsreturnc              
      s&  d v sJ d fddt jD }tj|dd}d v r3 fddt jD }tj|dd}nd	}d
 v r\ fddt jD }t|djfv sTJ t|tj|dd}nd	}|jd }|d	uri|nt| 	|jd 
 dksJ j|jd |jd   krjd ksn J |j|jjf|dd	 f jdd}|d	ur|dd	 f jdd}|d	ur|dd	 f }|jdd}i }tjtrfdd|D |d< |d	d	d	 f }||\}	}
}}|	|
|||||\}}}}|||fS )aF  Frontend + Encoder + Decoder + Calc loss

        Args:
            speech_mix: (Batch, samples) or (Batch, samples, channels)
            speech_ref: (Batch, num_speaker, samples)
                        or (Batch, num_speaker, samples, channels)
            speech_mix_lengths: (Batch,), default None for chunk interator,
                            because the chunk-iterator does not have the
                            speech_lengths returned. see in
                            espnet2/iterators/chunk_iter_factory.py
            kwargs: "utt_id" is among the input.
        speech_ref1z.At least 1 reference signal input is required.c              	      s,   g | ]}  d |d  t d qS )
speech_refr   r8   )gettorch
zeros_liker"   spkkwargsr   r$   r%   Y   s    z2ESPnetEnhancementModel.forward.<locals>.<listcomp>r   dim
noise_ref1c                       g | ]} d  |d  qS )znoise_ref{}r   r/   r"   nr?   r   r$   r%   f       Ndereverb_ref1c                    0   g | ]}d  |d  v r d  |d  qS )zdereverb_ref{}r   rE   rF   r?   r   r$   r%   t   
    r   .c                       g | ]
}  |d  qS r   r   r"   rr1   speech_lengthsr   r$   r%      s    feature_ref)ranger*   r;   stackr   r,   shapeonesintfill_rB   maxunbind
isinstancer   r   forward_enhanceforward_loss)r1   r5   r6   r@   r9   	noise_refdereverb_speech_ref
batch_size
additional
speech_prefeature_mixfeature_preotherslossstatsweightpermr   )r@   r1   rR   r$   forwardE   sr   



*


zESPnetEnhancementModel.forwardrR   rb   c                    s     |\}} jd u r |||\}}}n, ||\}}	|dd ur= ||||d \}}}||d< |	|d< nd }||	d}|d urS fdd|D }
nd }
|
|||fS )Nr*   bottleneck_featsbottleneck_feats_lengths)rl   rm   c                    rL   rM   )r   )r"   psrQ   r   r$   r%          z:ESPnetEnhancementModel.forward_enhance.<locals>.<listcomp>)r   r   r   r:   )r1   r5   rR   rb   rd   flensre   rf   rl   rm   rc   r   rQ   r$   r]      s(   


z&ESPnetEnhancementModel.forward_enhancerc   rd   re   rf   r9   r_   r`   c	              
      s`  t jddrd v sJ   |d ur4d v r4tjD ]}	d|	d }
 |
 d  |
< qt jddrEd v sEJ   |d urld v rltjD ]}d	|d }
|
 v rk |
 d  |
< qRd
}i }i }d }jD ]
}|j	}t |ddrj
rqwt |ddr|d u rtd|} fddtjD }n)t |ddr|d u rtd|} fddtjD }t|dkrd }n|}|}t|tr|d usJ j||ddd\}}|||i  |\}}}nwt|tr`j||ddd\}}|jr>t |ddr||||| \}}n7t |ddr0||||| \}}n"||||| \}}nfdd|D }fdd|D }|||i  |\}}}ntdt| |||j 7 }|| |d u rd|v r|d }qwj
rt|trtd| |d< |d jd }t|||f|j\}}}||||fS )Npredict_noiseFnoise1noise{}r   r   predict_dereverb	dereverb1
dereverb{}g        only_for_testis_noise_losszPNo noise reference for training!
Please specify "--use_noise_ref true" in run.shc                    rD   )rs   r   rE   rF   rf   r   r$   r%      rH   z7ESPnetEnhancementModel.forward_loss.<locals>.<listcomp>is_dereverb_lossz\No dereverberated reference for training!
Please specify "--use_dereverb_ref true" in run.shc                    rJ   )rv   r   rE   rF   ry   r   r$   r%   	  rK      T)ch_dim	force_1chc                    rL   rM   rN   r"   srrQ   r   r$   r%   B  ro   c                    rL   rM   rN   r"   sprQ   r   r$   r%   C  ro   zUnsupported loss type: %srj   z8At least one criterion must satisfy: only_for_test=Falserg   )r+   r   keysrT   r   r/   r   r*   r   r    trainingr.   r,   r\   r   _align_ref_pre_channelsr   compute_on_mask_get_noise_masks_get_dereverb_masks_get_speech_masksNotImplementedErrorstrri   updatefloatAttributeErrordetachrV   r   device)r1   rc   rR   rd   re   rf   r9   r_   r`   rG   keyr>   rg   rh   orj   loss_wrapperr    
signal_ref
signal_presrefsprelstf_reftf_prera   ri   r   )rf   r1   rR   r$   r^      s   





	



z#ESPnetEnhancementModel.forward_lossr{   c                    sR  |d u s|d u r||fS |d j | jtjdt|d ttf}|r*|d d  n|d  }|d  |krF fdd|D }||fS |d  |k rl|r^ fdd|D }||fS  fdd|D }||fS |d  |  krzdkrn ||fS |r fdd|D }|r fd	d|D }||fS  fd
d|D }||fS )Nr   )dtypec                       g | ]}|   qS r   index_selectsqueezerO   r|   indexr   r$   r%   f      zBESPnetEnhancementModel._align_ref_pre_channels.<locals>.<listcomp>c                    (   g | ]}|D ]}|   qqS r   r   r"   plistpr   r   r$   r%   j      c                    r   r   r   r"   r   r   r   r$   r%   p  r      c                    r   r   r   rO   r   r   r$   r%   s  r   c                    r   r   r   r   r   r   r$   r%   u  r   c                    r   r   r   r   r   r   r$   r%   {  r   )
new_tensorr&   r;   longr\   listtuplerB   )r1   refprer|   r}   pre_is_multi_listpre_dimr   r   r$   r   Z  s6    z.ESPnetEnhancementModel._align_ref_pre_channelsc                    s    t| d }|j| fdd|D |d}	dv r.fddtjD }
|	|
fS t|t|ks@J t|t|f|j| fdd|D |d}
|	|
fS )Nr   c                       g | ]
} | d  qS rM   rN   )r"   nrilensr1   r   r$   r%     ro   z;ESPnetEnhancementModel._get_noise_masks.<locals>.<listcomp>
noise_specmask_noise1c                    rD   )zmask_noise{}r   rE   rF   ry   r   r$   r%     rH   c                    r   rM   rN   )r"   npr   r   r$   r%     ro   )r   sumcreate_mask_labelrT   r   r,   )r1   r    rd   r9   r_   	noise_prer   rf   speech_spec	masks_ref	masks_prer   r   rf   r1   r$   r   ~  s$   

$z'ESPnetEnhancementModel._get_noise_masksc                    s   |d ur t| d }nd }|j| fdd|D |d}	dv rGfddtjD }
t|
t|	ksCJ t|
t|	f|	|
fS t|t|ksYJ t|t|f|j| fdd|D |d}
|	|
fS )Nr   c                    r   rM   rN   )r"   drr   r   r$   r%     ro   z>ESPnetEnhancementModel._get_dereverb_masks.<locals>.<listcomp>r   mask_dereverb1c                    rJ   )zmask_dereverb{}r   rE   r=   ry   r   r$   r%     rK   c                    r   rM   rN   )r"   dpr   r   r$   r%     ro   )r   r   r   rT   r*   r,   )r1   r    feat_mixr_   dereverb_refdereverb_prer   rf   r   r   r   r   r   r$   r     s0   
$z*ESPnetEnhancementModel._get_dereverb_masksc                    s   |d ur t| d }nd }|j| fdd|D |d}	dv r5fddtjD }
|	|
fS |j| fdd|D |d}
|	|
fS )Nr   c                    r   rM   rN   r~   r   r   r$   r%     ro   z<ESPnetEnhancementModel._get_speech_masks.<locals>.<listcomp>r   	mask_spk1c                    rD   )z
mask_spk{}r   rE   r=   ry   r   r$   r%     rH   c                    r   rM   rN   r   r   r   r$   r%     ro   )r   r   r   rT   r*   )r1   r    rd   r_   r9   rc   r   rf   r   r   r   r   r   r$   r     s&   
	z(ESPnetEnhancementModel._get_speech_masksc                 C   s   t | dkr| S tj| dd} t|tjstj|dd}| d|dks.J | j|jf|  |  }|dkrO|jg |jdd t	|D R  
| }t| d|jddS )a2  Sort the input list of tensors by the specified permutation.

        Args:
            nn_output: List[torch.Tensor(Batch, ...)], len(nn_output) == num_spk
            perm: (Batch, num_spk) or List[torch.Tensor(num_spk)]
        Returns:
            nn_output_new: List[torch.Tensor(Batch, ...)]
        r   rA   r   c                 S   s   g | ]}d qS )r   r   )r"   _r   r   r$   r%     s    z7ESPnetEnhancementModel.sort_by_perm.<locals>.<listcomp>)r,   r;   rU   r\   TensorsizerV   rB   viewrT   	expand_asgatherr[   )	nn_outputrj   diff_dimr   r   r$   sort_by_perm  s   
$$z#ESPnetEnhancementModel.sort_by_permc                 K   s,   |d d d |  f }||}}||dS )N)featsfeats_lengths)rZ   )r1   r5   r6   r@   r   r   r   r   r$   collect_feats  s   

z$ESPnetEnhancementModel.collect_feats)Fr   N)N)NN)r{   F)__name__
__module____qualname____doc__r   r   r
   r   r	   r   r   boolr   r)   r;   r   r   r   rk   r]   r   r^   r   r   r   r   staticmethodr   r   __classcell__r   r   r3   r$   r      s    		-
q
+	

 
$
r   )(r   typingr   r   r   r   r   r;   packaging.versionr   V	typeguardr   espnet2.diar.layers.abs_maskr	   espnet2.enh.decoder.abs_decoderr
   espnet2.enh.encoder.abs_encoderr   %espnet2.enh.loss.criterions.tf_domainr   'espnet2.enh.loss.criterions.time_domainr   %espnet2.enh.loss.wrappers.abs_wrapperr   #espnet2.enh.separator.abs_separatorr   #espnet2.enh.separator.dan_separatorr    espnet2.torch_utils.device_funcsr   espnet2.train.abs_espnet_modelr   __version__is_torch_1_9_plusfinfoget_default_dtypeepsEPSr   r   r   r   r$   <module>   s$    