o
    i                     @   s\  d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ G d	d
 d
eZ				d)dedededefddZ				d*dejde dedede!dejfddZ"G dd deZ#G dd  d e#Z$G d!d" d"eZ%G d#d$ d$e#Z&G d%d& d&eZ'G d'd( d(e#Z(dS )+    N)ABCabstractmethod)Path)
CollectionDictIterableListUnion)check_argument_typescheck_return_type)build_tokenizer)TextCleaner)TokenIDConverterc                	   @   sP   e Zd ZdefddZededeeeee	j
f f deee	j
f fddZd	S )
AbsPreprocessortrainc                 C   s
   || _ d S N)r   )selfr    r   N/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/train/preprocessor.py__init__   s   
zAbsPreprocessor.__init__uiddatareturnc                 C   s   t r   )NotImplementedErrorr   r   r   r   r   r   __call__   s   zAbsPreprocessor.__call__N)__name__
__module____qualname__boolr   r   strr   r	   npndarrayr   r   r   r   r   r      s    r         Tframe_lengthframe_shiftcenteredpaddedc           
      C   sZ  | j dkr	td|dk rtd|| jd krtdd|kr$td|rDdd	 t| jd D |d
 |d
 fg }tj| |ddd} |rl| jd |  | | }dd	 t| jd D d|fg }tj| |ddd} |dkrz||krz| d }|S | jd d | jd | | d |f }| jd d || jd  | jd f }	tjj	j
| ||	d}|S )Nr   zInput array size is zero   z'frame_length must be a positive integerz)frame_length is greater than input lengthz"frame_shift must be greater than 0c                 S      g | ]}d qS r   r   r   .0_r   r   r   
<listcomp>-       zframing.<locals>.<listcomp>   constant)modeconstant_valuesc                 S   r+   r,   r   r.   r   r   r   r1   7   r2   .N)shapestrides)size
ValueErrorr8   rangendimr!   padr9   libstride_tricks
as_strided)
xr%   r&   r'   r(   	pad_shapenaddresultr8   r9   r   r   r   framing   s6   
"&rF   {Gz?   boxcarrB   	thresholdwindowr   c           
      C   s   | j d |k rtj| j dtjdS | jjdkr| tj} t| ||ddd}|t	j
|||j9 }|d jdd}tj|ddd	}t|d
krTtj| j dtjdS || |k}t|d |j |f }	|	jg |j dd dR  }	tj|	dg| jd  d
| j d |	j d  fg ddS )zPower based voice activity detection.

    Args:
        x: (Channel, Time)
    >>> x = np.random.randn(1000)
    >>> detect = detect_non_silence(x)
    >>> assert x.shape == detect.shape
    >>> assert detect.dtype == np.bool
    r*   T)
fill_valuedtypeiF)r%   r&   r'   r(   r3   axis)rP   keepdimsr   r7   Nr-   r)   edger5   )r8   r!   fullr   rM   kindastypefloat64rF   scipysignal
get_windowmeanallbroadcast_toreshaper>   r=   )
rB   rJ   r%   r&   rK   framed_wpower
mean_powerdetect_framesdetectsr   r   r   detect_non_silenceG   s4   (rd   c                '       sd  e Zd Z																		d+d	ed
edeeeee f deeeee f dee dedededeeeee f dedede	dede	dede	de	dedef& fddZ
dd Zd d! Zd"eeeeejf f d#eeeeejf f fd$d%Zd"eeeeejf f d#eeejf fd&d'Zd(ed"eeeeejf f d#eeejf fd)d*Z  ZS ),CommonPreprocessorN<unk><space>      ?3_10      ?speechtextr   
token_type
token_listbpemodeltext_cleanerg2p_type
unk_symbolspace_symbolnon_linguistic_symbols	delimiterrir_scprir_apply_prob	noise_scpnoise_apply_probnoise_db_rangeshort_noise_thresspeech_volume_normalizespeech_name	text_namec                    s  t  | || _|| _|| _|| _|| _|| _|| _|d ur?|d u r't	dt
|| _t|||
||	|d| _t||d| _n	d | _d | _d | _|r|d urg | _t|ddd,}|D ]!}| d d}t|dkrt| j|d  q[| j|d  q[W d    n1 sw   Y  nd | _|r	|d ur	g | _t|ddd,}|D ]!}| d d}t|dkr| j|d  q| j|d  qW d    n1 sw   Y  |d	}t|dkrt|d \| _| _d S t|d
krt|d t|d | _| _d S t	dd | _d S )N0token_list is required if token_type is not Nonerm   ro   ru   rs   rt   rq   rn   rr   rutf-8encodingr)   r   r0   r3   z8Format error: '{noise_db_range}' e.g. -3_4 -> [-3db,4db])superr   r   r}   r~   r|   rw   ry   r{   r;   r   rp   r   	tokenizerr   token_id_converterrirsopenstripsplitlenappendnoisesfloatnoise_db_lownoise_db_high)r   r   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   flinesps	__class__r   r   r   |   st   


"
zCommonPreprocessor.__init__c                 C   s   t j| j}d }|d urEtj|t jdd\}}|j}tj	j
||ddd d d |jd f }|t| d  }t |t|d | }||fS )NTrM   	always_2drT   rS   r)   r3   绽|=)r!   randomchoicer   	soundfilereadrW   TrX   rY   convolver8   rd   r[   sqrtmax)r   rk   r`   rir_pathrirr0   power2r   r   r   _convolve_rir   s   z CommonPreprocessor._convolve_rirc                 C   s  |j d }tj| j}d }|d urtj| j| j}t	|y}|j
|kr/|jtjdd}nc|j
|k rm|j
| | jk rItd|j
 d| d tjd||j
 }|jtjdd}tj||||j
 | fdgd	d
}n%tjd|j
| }|| |j|tjdd}t||krtd| W d    n1 sw   Y  |j}|d  }	d| d  t| tt|	d }
||
|  }||fS )Nr)   Tr   zNoise (z) is much shorter than speech (z) in dynamic mixingr   r-   wraprS   zSomething wrong: r3   
      r   )r8   r!   r   r   r   uniformr   r   r   	SoundFileframesr   rW   r{   loggingwarningrandintr>   seekr   RuntimeErrorr   r[   r   r   )r   rk   r`   nsamples
noise_pathnoisenoise_dbr   offsetnoise_powerscaler   r   r   
_add_noise   sN   




zCommonPreprocessor._add_noiser   r   c                 C   s:  t  sJ | j|v r| jry| jd us| jd ury|| j }|jdkr*|d d d f }n|j}|t| d  }| jd urL| j	t
j krL| ||\}}| jd ura| jt
j kra| ||\}}|j}t
t
|}|dkrt|| }||| j< | jd ur|| j }t
t
|}|| j | || j< t|sJ |S )Nr)   r3   rh   )r
   r}   r   r   r   r=   r   rd   r[   rw   r!   r   r   ry   r   r   absr|   r   )r   r   rk   r`   r0   mar   r   r   _speech_process  s0   







z"CommonPreprocessor._speech_processc                 C   sf   | j |v r+| jd ur+|| j  }| |}| j|}| j|}tj|tjd|| j < t	|s1J |S NrM   
r~   r   rp   text2tokensr   
tokens2idsr!   arrayint64r   r   r   rl   tokens	text_intsr   r   r   _text_process8  s   

z CommonPreprocessor._text_processr   c                 C   s"   t  sJ | |}| |}|S r   )r
   r   r   r   r   r   r   r   D  s   


zCommonPreprocessor.__call__)NNNNNrf   rg   NNNrh   Nrh   ri   rj   Nrk   rl   )r   r   r   r   r    r	   r   r   r   r   r   r   r   r   r!   r"   r   r   r   __classcell__r   r   r   r   re   {   s    	
V+
(
re   c                )       s   e Zd Z																			d#d	ed
edeeeee f deeeee f deeeee f dee dedededeeeee f dedede	dede	dede	de	dedef( fddZ
deeeeejf f d eeejf fd!d"Z  ZS )$SLUPreprocessorNrf   rg   rh   ri   rj   rk   rl   r   rm   rn   transcript_token_listro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   c                    s   t  jdi d|d|d|d|d|d|d|d|	d	|
d
|d|d|d|d|d|d|d|d|d| |d ur\td td|||	|
|d| _t||d| _d S d | _d | _d S )Nr   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   zusing transcriptwordr   r   r   )r   r   printr   transcript_tokenizerr   transcript_token_id_converter)r   r   rm   rn   r   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   O  sn   	

zSLUPreprocessor.__init__r   r   c                 C   s   | j |v r+| jd ur+|| j  }| |}| j|}| j|}tj|tjd|| j < d|v rS| jd urS|d }| |}| j	|}| j
|}tj|tjd|d< t|sYJ |S )Nr   
transcript)r~   r   rp   r   r   r   r!   r   r   r   r   r   r   r   r   r   r     s   


zSLUPreprocessor._text_process)NNNNNNrf   rg   NNNrh   Nrh   ri   rj   Nrk   rl   )r   r   r   r   r    r	   r   r   r   r   r   r   r!   r"   r   r   r   r   r   r   r   N  s    	
>r   c                       s   e Zd Zdddddddddddgfdededeeeee f d	eeeee f d
ee dedededeeeee f dedede	e f fddZ
deeeeejf f deeejf fddZdedeeeeejf f deeejf fddZ  ZS )CommonPreprocessor_multiNrf   rg   rk   rl   r   rm   rn   ro   rp   rq   rr   rs   rt   ru   r}   r~   c                    s~   t  | || _|| _|| _|d ur4|d u rtdt|| _t|||
||	|d| _	t
||d| _d S d | _d | _	d | _d S )Nr   r   r   )r   r   r   r}   r~   r;   r   rp   r   r   r   r   )r   r   rm   rn   ro   rp   rq   rr   rs   rt   ru   r}   r~   r   r   r   r     s.   

z!CommonPreprocessor_multi.__init__r   r   c                 C   sl   | j D ]*}||v r-| jd ur-|| }| |}| j|}| j|}tj|tjd||< qt	|s4J |S r   r   )r   r   text_nrl   r   r   r   r   r   r     s   

z&CommonPreprocessor_multi._text_processr   c                 C   s$   t  sJ | j|v r	 | |}|S r   )r
   r}   r   r   r   r   r   r     s
   


z!CommonPreprocessor_multi.__call__)r   r   r   r   r    r	   r   r   r   r   r   r   r!   r"   r   r   r   r   r   r   r   r     sb    	
*
r   c                (       s  e Zd Zdgdgdgdddddddddddddddgfd	ed
ee deeeeee f  deeeeee f  de	e dedededeeeee f dedede
dede
dede
de
dedee f& fddZdeeeeejf f deeejf fd d!Z  ZS )" MutliTokenizerCommonPreprocessorNrf   rg   rh   ri   rj   rk   rl   r   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   c                    s~  t  jdi d|d|d d|d d|d d|d|d|d	|d
|	d|
d|d|d d|d|d|d|d|d|d| t|t|  krbt|  krbt|ksgJ d J dt|| _g | _g | _t| jD ]=}|| d ur|| d u rtd| jt	|| || |
||	|d | jt
|| |d qw| jd  | jd  qwt|| _|| _d S )Nr   rm   r   rn   ro   rp   rq   rr   rs   rt   ru   r}   r~   rv   rw   rx   ry   rz   r{   r|   zDtoken_type, token_list, bpemodel, or processing text_name mismatchedr   r   r   r   )r   r   r   num_tokenizerr   r   r<   r;   r   r   r   r   rp   r~   )r   r   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   rN   r   r   r   r     s   	
.



z)MutliTokenizerCommonPreprocessor.__init__r   r   c                 C   s   t | jD ]5}| j| }||v r:| j| d ur:|| }| |}| j| |}| j| |}tj	|tj
d||< qt|sAJ |S r   )r<   r   r~   r   rp   r   r   r   r!   r   r   r   )r   r   rN   r~   rl   r   r   r   r   r   r   :  s   

z.MutliTokenizerCommonPreprocessor._text_process)r   r   r   r   r   r    r	   r   r   r   r   r   r   r!   r"   r   r   r   r   r   r   r     s~    	
Pr   c                       s   e Zd Z							ddededed	ed
edededef fddZdd Zdd Z	dd Z
dedeeeeejf f deeejf fddZ  ZS )DynamicMixingPreprocessorNr3           
speech_mix
speech_refr   
source_scpref_numdynamic_mixing_gain_dbr}   speech_ref_name_prefixmixture_source_nameutt2spkc	                    s  t  | || _|| _|| _|| _|| _|d u r | d| _n|| _i | _|d us4J dt	| j
 t|ddd&}	|	D ]}
|
 d d}t|dksPJ |d | j|d < q>W d    n1 sdw   Y  i | _|d u r~| j D ]}|| j|< qunDt|ddd&}	|	D ]}
|
 d d}t|dksJ |d | j|d < qW d    n1 sw   Y  | j D ]	}|| jv sJ qt| j | _d S )	N1zPlease pass `source_scp` to r   r   r   r)   r3   r   )r   r   r   r   r   r}   r   r   sourcestyper   r   r   r   r   r   keyslistsource_keys)r   r   r   r   r   r}   r   r   r   r   r   r   keyr   r   r   r   J  sF   
z"DynamicMixingPreprocessor.__init__c                 C   s   |g}| j | g}d}t|| jk rHt| j}| j | }||vr+||vr+|| n|d7 }|dkrA|| td| d t|| jk s|dd  S )Nr   r)   r   z6Can not find speech source from different speaker for zO times.There may be problems with training data. Please check the utt2spk file.)	r   r   r   r   r   r   r   r   r   )r   r   r   spk_ids	retry_cntpickedspk_idr   r   r   _pick_source_utterances_}  s$   

z2DynamicMixingPreprocessor._pick_source_utterances_c                 C   sn   t j| j| tjdd\}}||jd kr&||jd  }t|d|fd}n|d| }||jd ks5J |S )NFr   r   reflect)r   r   r   r!   float32r8   r>   )r   r   speech_lengthsourcer0   r>   r   r   r   _read_source_  s   
z'DynamicMixingPreprocessor._read_source_c           
         s     |}| j jd  fdd|D }| j g| } fddtt|D }dd |D }dd t||D }tjt|dd}t	|D ]\}}	|	| j
 |d  < qK|| j< |S )	Nr   c                    s   g | ]}  |qS r   )r   )r/   r   r   r   r   r   r1         z:DynamicMixingPreprocessor._mix_speech_.<locals>.<listcomp>c                    s   g | ]}t  j  jqS r   )r   r   r   r/   rN   r   r   r   r1     s    c                 S   s   g | ]}d |d  qS )r   g      4@r   )r/   g_dbr   r   r   r1     r   c                 S   s   g | ]\}}|| qS r   r   )r/   refgr   r   r   r1     r   rO   r)   )r   r   r8   r<   r   zipr!   sumr   	enumerater   r}   )
r   r   r   r   
ref_audios
gain_in_dbgainr   rN   r   r   r   r   _mix_speech_  s   



z&DynamicMixingPreprocessor._mix_speech_r   r   r   c                 C   s>   t || j jdksJ d| jr| ||}t|sJ |S )Nr)   z'Multi-channel input has not been tested)r   r   r8   r   r   r   r   r   r   r   r     s   z"DynamicMixingPreprocessor.__call__)Nr3   r   r   r   NN)r   r   r   r   r    intr   r   r   r   r   r   r	   r!   r"   r   r   r   r   r   r   r   I  sF    	3r   c                #       s   e Zd ZdZ													
				
d*dededededededededededededededededef" fddZd d! Z	d"d# Z
d$d% Zd&eeeeejf f d'eeeeejf f fd(d)Z  ZS )+EnhPreprocessorz/Preprocessor for Speech Enhancement (Enh) task.Nrh   ri   rj   r   r   	noise_refdereverb_refFr)   @  r   rv   rw   rx   ry   rz   r{   r|   r}   r   noise_ref_name_prefixdereverb_ref_name_prefixuse_reverberant_refnum_spknum_noise_typesample_rateforce_single_channelc                    s$  t  jdi d|dd dd dd dd dd ddd	d
dd dd d|d|d|d|d|d|d|d|	 |
| _|| _|| _|| _|| _|| _|| _|| _	| j
d ur|d}t|dkrqt|d \| _| _d S t|dkrt|d t|d | _| _d S td| dd S )Nr   rm   rn   ro   rp   rq   rr   rf   rs   rg   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r0   r)   r   r3   z-Format error for --speech_volume_normalize: ''r   )r   r   r   r  r  r  r  r  r  r  r|   r   r   r   
volume_lowvolume_highr;   )r   r   rv   rw   rx   ry   rz   r{   r|   r}   r   r  r  r  r  r  r  r  r   r   r   r   r     sr   	


"zEnhPreprocessor.__init__c                    s\   t |trt fdd|D S t |tr fdd|D S |jdkr+|d d d f S |jS )Nc                 3   s    | ]}  |V  qd S r   
_ensure_2dr/   sigr   r   r   	<genexpr>  s    z-EnhPreprocessor._ensure_2d.<locals>.<genexpr>c                    s   g | ]}  |qS r   r  r  r   r   r   r1     s    z.EnhPreprocessor._ensure_2d.<locals>.<listcomp>r)   )
isinstancetupler   r=   r   )r   rY   r   r   r   r    s
   

 zEnhPreprocessor._ensure_2dc           
      C   s   d}t j|dd }||| j d  }|d d d |f }tjj||ddd d d |jd f }|t| d 	 }	t 
|t|	d | }|S )	N2   r)   rO   i  rT   rS   r3   r   )r!   argmaxminr  rX   rY   r   r8   rd   r[   r   r   )
r   rk   r   r`   predelaydtet	rir_earlyspeech2r   r   r   r   _get_early_signal   s   z!EnhPreprocessor._get_early_signalc                 C   s   ||| j  || j < t| jD ]}| jt|d  }||v r&||| ||< qt| jD ]/}| jt|d  }| js>||v rF||| ||< | jt|d  }||v r[||| ||< q,d S )Nr)   )	r}   r<   r  r  r    r  r   r   r  )r   	data_dictfuncn
noise_namespkspeech_ref_namedereverb_ref_namer   r   r   _apply_to_all_signals-  s   z%EnhPreprocessor._apply_to_all_signalsr   r   c                    s  t  sJ j vrt sJ  S jr fddtjD }d v rD fddtjD }t|djfv sCJ t|nd }dd |D }jd urjt	j

 krtfddt||D  \}}jrttdd	 |}ttd
d	 |}jrtjD ]7}t|d }j| }|| j |< |d ur|dkst|dkrj| }	|| || || j |	< qn<tjD ]6}t|d }j| }|| || || j |< |d ur|dkst|dkrj| }	 |  |	< qt|}
|
t|
 d  }jd urbjt	j

 krb|
|\}
}jrA|
jd dkr3|
d d }
|jd dkrA|d d }tdjD ]}jt|d  } |d  qG|j jd < |
j}
|
 j< t	t	 |
dkr! fdd	 ! dd	  jr! dd	  j"d urĈjrt	j
#j$j%nj$ j }
t	t	 |
! fdd	 t sJ  S )Nc              	      s(   g | ]}  jt|d    qS r)   )r  r   r    r   r   r   r   r   r1   I  s    z3EnhPreprocessor._speech_process.<locals>.<listcomp>dereverb_ref1c              	      s>   g | ]}j t|d    v r j t|d    qS r$  )r  r    r  r   r%  r   r   r1   P  s
    r)   c                 S   s    g | ]}|t | d   qS )r3   )rd   r[   )r/   srefr   r   r   r1   \  s    c                    s   g | ]
\}}  ||qS r   )r   )r/   spr`   r   r   r   r1   c  s    
c                 S      | j d dkr	| S | d d S Nr   r)   r8   rB   r   r   r   <lambda>j      z1EnhPreprocessor._speech_process.<locals>.<lambda>c                 S   r)  r*  r+  r,  r   r   r   r-  m  r.  r   r3   r   rh   c                    s   |   S r   r   r,  )r   r   r   r-        c                 S   s   |   S r   )squeezer,  r   r   r   r-    r/  c                 S   s   | j dkr| S | d d df S )Nr)   r   )r=   r,  r   r   r   r-    r.  c                    s   |    S r   r   r,  )r   volume_scaler   r   r-    s    )&r
   r}   r   r   r<   r  r   r   rw   r!   r   r   r  r   mapr  r    r   r   r  r  r   rd   r[   r   ry   r   r8   r  r  popr   r   r#  r|   r   r
  r  )r   r   r   dereverb_speech_ref	power_refrir_refr   suffixr!  dereverb_namer   	power_mixr   r  namer   )r   r   r   r1  r   r   >  s   









zEnhPreprocessor._speech_process)Nrh   Nrh   ri   rj   Nr   r   r   r   Fr)   r)   r  F)r   r   r   __doc__r   r    r   r   r   r  r  r#  r   r	   r!   r"   r   r   r   r   r   r   r     sz    	
=	r   )r#   r$   TT)rG   rH   r#   rI   ))r   r   abcr   r   pathlibr   typingr   r   r   r   r	   numpyr!   scipy.signalrX   r   	typeguardr
   r   espnet2.text.build_tokenizerr   espnet2.text.cleanerr   espnet2.text.token_id_converterr   r   r   r   rF   r"   r   r    rd   re   r   r   r   r   r   r   r   r   r   <module>   sf    
-
4 TRI` 