o
    iT                  ,   @   s*  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z. d dl/m0Z0 e1e2 j3Z4d=ddZ5d>ddZ6dd Z7G dd dZ8de9fddZ:d e9d!e;d"e9d#e;d$e;d%e;d&e;d'ee;e9f d(eee9e9e9f  d)e
e9 d*e
e9 d+e
e9 d,e
e9 d-e
e9 d.e<d/e
e= d0e
e= d1e<d2e<d3e
e; d4e<d5e<f,d6d7Z>d8d9 Z?d=d:d;Z@eAd<kre@  dS dS )?    N)chain)Path)AnyListOptionalSequenceTupleUnion)trange)check_argument_types)FrequencyDomainMSE)	SISNRLoss)	PITSolver)SoundScpWriter)EnhancementTask)
EnhS2TTask)	to_device)set_all_random_seed)AbsESPnetModel)config_argparse)str2boolstr2triple_strstr_or_none)get_commandline_argsc                 C   s6   | d u r|d usJ dt |jd } | S t | } | S )Nz[The argument 'model_file' must be provided if the argument 'train_config' is not specified.zconfig.yaml)r   parent)train_config
model_file r   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/enh_inference.pyget_train_config    s   
r   F c              	   C   s   |  D ]E\}}|| vr|rtd||| || |< qt|tr0t| | ||| dd q|rE| | |krEtd||| | | || |< qdS )z9Update `dict_org` with `dict_patch` in-place recursively.z&Overwriting config: [{}{}]: None -> {}.)verbose
log_prefixz$Overwriting config: [{}{}]: {} -> {}N)itemslogginginfoformat
isinstancedictrecursive_dict_update)dict_org
dict_patchr"   r#   keyvaluer   r   r   r*   ,   s*   


r*   c                 C   sp   |  |}t|tstdtj dt| || |d ur6|dkr,dtj	  }|
tj||d |S )Nzmodel must inherit z
, but got cudazcuda:)map_location)build_modelr(   r   RuntimeError__name__typetotorchr/   current_deviceload_state_dictload)taskargsr   devicemodelr   r   r   build_model_from_args_and_fileE   s   


r>   c                   @   s   e Zd ZdZ												d!deeef deeef deeef d	ee d
ee de	de	dee
 de	dedede	fddZe 	d"deejejf de
deej fddZe d#ddZe	d$dee dee fdd ZdS )%SeparateSpeecha  SeparateSpeech class

    Examples:
        >>> import soundfile
        >>> separate_speech = SeparateSpeech("enh_config.yml", "enh.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> separate_speech(audio)
        [separated_audio1, separated_audio2, ...]

    NFcpufloat32r   r   inference_configsegment_sizehop_sizenormalize_segment_scaleshow_progressbarref_channelnormalize_output_wavr<   dtypeenh_s2t_taskc                 C   s  t  sJ |s	tnt}|d u r||||
\}}n|t||d}|jddd}t|}W d    n1 s6w   Y  t|jddd}t|}W d    n1 sTw   Y  |r^d}nd}t	t
dd |D  }| D ]}||vr~td	d
| qot||dd tjdi |}t||||
}|r|j}|jtt|d  |
| _|| _|| _|| _|| _|| _|| _|	| _|| _|j| _| jdkrdnd}|d urt !d"| ||j#_$|| _$n|j$| _$|d uo|d u| _%| j%rt !d|  t !d"|| d S t !d|  d S )N)r   rzutf-8)encoding)enh_encoderenh_separatorenh_decoder)encoder	separatordecoderc                 S   s   g | ]}||d  gqS )_confr   ).0kr   r   r   
<listcomp>       z+SeparateSpeech.__init__.<locals>.<listcomp>z3Only the following top-level keys are supported: %sz, T)r"   )rI      enhancement
separationz1Overwrite enh_model.separator.ref_channel with {}zPerform segment-wise speech %sz,Segment length = {} sec, hop length = {} secz%Perform direct speech %s on the inputr   )&r   r   r   build_model_from_filer   openyaml	safe_loadr   listr   keys
ValueErrorjoinr*   argparse	Namespacer>   	enh_modelr5   getattrr6   evalr<   rI   enh_train_argsrC   rD   rE   rH   rF   num_spkr%   r&   r'   rQ   rG   
segmenting)selfr   r   rB   rC   rD   rE   rF   rG   rH   r<   rI   rJ   r:   re   rh   f
train_args
infer_argsarg_listsupported_keysrU   r   r   r   __init__a   sx   

zSeparateSpeech.__init__@  
speech_mixfsreturnc                    s  t  sJ t|tjrt|}| dksJ | |d}|t	tj
}|j|gtj|ddt|jd}tjdjr d j| kr tt|jj  }tt|d| j|  }tj|  }}|ddd|f j}g }	jrtnt}
|
|D ]}t|j | }|| }|d krǈd }||}|| }|dd||f |ddd|f< n|}|dd||f }|j|gtj|dj|\}}j||\}}}fdd|D }| dkr|ddjf }n|}jrIttj |ddd|f !ddd	d
ttj t"|ddd|f !ddd	d
  fdd|D }|	#tj$|dd q|	d }td|D ]}j%|dddd| df |	| ddddd|f dd}t|D ]}|	| || |f |	| dd|f< q||d krd|	| dddd|df< |	| dddd||f }n|	| dddd|df }|dddd| df |	| ddddd|f  d |dddd| df< tj&||gdd}q]|d|dksJ |j|jftj'|dd}nj|\}}j||\}}}fdd|D }t(|j)ksNJ t(|j)kt(|d |ksaJ t(|d |fj*rndd |D }|S dd |D }|S )zInference

        Args:
            speech_mix: Input speech data (Batch, Nsamples [, Channels])
            fs: sample rate
        Returns:
            [separated_audio1, separated_audio2, ...]

        rX   r   )rI   
fill_value)r<   Nc                       g | ]}j | d  qS r   re   rR   rT   rl   )lengths_segrk   r   r   rV      s    z+SeparateSpeech.__call__.<locals>.<listcomp>   Tdimkeepdimc                    s   g | ]}|   qS r   r   rT   w)
enh_energy
mix_energyr   r   rV     s    )r~   si_snr	criterionc                    rw   rx   ry   rz   )lengthsrk   r   r   rV   3  s    c                 S   s2   g | ]}|t |jd ddd  d   qS )rX   Tr}   r   g?)absmaxr@   numpyr   r   r   r   rV   8  s    $c                 S   s   g | ]}|   qS r   )r@   r   r   r   r   r   rV   =  rW   )+r   r(   npndarrayr6   	as_tensorr~   sizer5   rf   rI   new_fulllongr   r<   rj   rC   introundrD   ceilshaperF   r
   range	new_zerosre   rP   rQ   rG   rE   sqrtmeanpowsumappendstackcal_permumationcatunbindlenri   rH   )rk   rs   rt   
batch_sizeoverlap_lengthnum_segmentstT	pad_shape	enh_wavesrange_isten
speech_segfeatsf_lens_processed_wavspeech_seg_wavespermbatchenh_waves_res_ir   )r   r   r{   r   rk   r   __call__   s   



&
$ ( 8&"&zSeparateSpeech.__call__r   c           	      C   s6   t td| }t| d}|||\}}}|d }|S )a|  Calculate the permutation between seaprated streams in two adjacent segments.

        Args:
            ref_wavs (List[torch.Tensor]): [(Batch, Nsamples)]
            enh_wavs (List[torch.Tensor]): [(Batch, Nsamples)]
            criterion (str): one of ("si_snr", "mse", "corr)
        Returns:
            perm (torch.Tensor): permutation for enh_wavs (Batch, num_spk)
        )r   mser   r   )r   r   r   )	rk   ref_wavsenh_wavsr   criterion_class
pit_solverr   othersr   r   r   r   r   A  s
   zSeparateSpeech.cal_permumation	model_tagkwargsc                 K   s^   | dur(zddl m} W n ty   td  w | }|jdi ||  tdi |S )a*  Build SeparateSpeech instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.

        Returns:
            SeparateSpeech: SeparateSpeech instance.

        Nr   )ModelDownloaderzZ`espnet_model_zoo` is not installed. Please install via `pip install -U espnet_model_zoo`.r   )espnet_model_zoo.downloaderr   ImportErrorr%   errorupdatedownload_and_unpackr?   )r   r   r   dr   r   r   from_pretrainedU  s   zSeparateSpeech.from_pretrained)NNNNNFFNFr@   rA   F)rr   )r   N)r3   
__module____qualname____doc__r	   r   strr   floatboolr   rq   r6   no_gradTensorr   r   r   r   r   staticmethodr   r   r   r   r   r   r?   U   st    


	

] r?   r.   c                 C   s   | dv rd S t | S )N)noneNoneNONE)humanfriendly
parse_size)r.   r   r   r   humanfriendly_or_nonet  s   
r   
output_dirr   rI   rt   ngpuseednum_workers	log_leveldata_path_and_name_and_typekey_filer   r   r   rB   allow_variable_data_keysrC   rD   rE   rF   rG   rH   rJ   c           $      C   s  t  sJ |dkrtd|dkrtdtj|dd |dkr#d}nd}t| t|
|||||||||||d}tjdd	|i|}tj	||||	|t
|jd
t|jd
|dd	}t|   } g }t|jD ]}|t|  d|d  |  d|d  d qjt|D ]p\}\}}td| d|  t|tsJ t|tdd |D sJ |ttt| }t||ksJ t| d| dd | D }|di |}t|D ]\} }!t|D ]}"||!|" f||  ||" < qqq|D ]}#|#  qd S )NrX   z!batch decoding is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelr'   r/   r@   )r   r   rB   rC   rD   rE   rF   rG   rH   r<   rI   rJ   r   FT)rI   r   r   r   preprocess_fn
collate_fnr   	inferencez/wavs/z/spkz.scp[z] Enhancing c                 s   s    | ]}t |tV  qd S r   )r(   r   )rT   sr   r   r   	<genexpr>  s    zinference.<locals>.<genexpr>z != c                 S   s    i | ]\}}| d s||qS )_lengths)endswith)rT   rU   vr   r   r   
<dictcomp>  s     zinference.<locals>.<dictcomp>r   )r   NotImplementedErrorr%   basicConfigr   r)   r?   r   r   build_streaming_iteratorbuild_preprocess_fnrh   build_collate_fnr   
expanduserresolver   ri   r   r   	enumerater&   r(   r4   allr   nextitervaluesr$   close)$r   r   rI   rt   r   r   r   r   r   r   r   r   r   rB   r   rC   rD   rE   rF   rG   rH   rJ   r<   separate_speech_kwargsseparate_speechloaderwritersr   r`   r   _bsr   spkr   bwriterr   r   r   r   z  s   
&"
r   c                  C   s  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | jdtddd | d}|jdt	dd d! |jd"t
d# |jd$td%d& | d'}|jd(td%d)d | d*}|jd+td,d- |jd.td/d- |jd0td1d- |jd2t
d d3d |jd4td%d5d | d6}|jd7tdd8d | d9}|jd:td d;d |jd<td d=d |jd>td%d?d |jd@td%dAd |jdBtd dCd | S )DNzFrontend inference)descriptionformatter_classz--log_levelc                 S   s   |   S r   )upper)xr   r   r   <lambda>  s    zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr  DEBUGNOTSETzThe verbose level of logging)r4   defaultchoiceshelpz--output_dirT)r4   requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)r4   r
  r  z--seedzRandom seedz--dtyperA   )float16rA   float64z	Data type)r
  r  r  z--fsrr   zSampling ratez--num_workersrX   z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typer   )r4   r  actionz
--key_file)r4   z--allow_variable_data_keysF)r4   r
  zOutput data relatedz--normalize_output_wavz0Whether to normalize the predicted wav to [-1~1]zThe model configuration relatedz--train_configzTraining configuration file)r4   r  z--model_filezModel parameter filez--model_tagz]Pretrained model tag. If specify this option, train_config and model_file will be overwrittenz--inference_configzQOptional configuration file for overwriting enh model attributes during inferencez--enh_s2t_taskzenhancement and asr joint modelzData loading relatedz--batch_sizezThe batch size for inferencezSeparateSpeech relatedz--segment_sizezHSegment length in seconds for segment-wise speech enhancement/separationz
--hop_sizezDHop length in seconds for segment-wise speech enhancement/separationz--normalize_segment_scalezHWhether to normalize the energy of the separated streams in each segmentz--show_progressbarzYWhether to show a progress bar when performing segment-wise speech enhancement/separationz--ref_channelzvIf not None, this will overwrite the ref_channel defined in the separator module (for multi-channel speech processing))r   ArgumentParserrc   ArgumentDefaultsHelpFormatteradd_argumentr   r   r   add_argument_groupr   r   r   r   )parsergroupr   r   r   
get_parser  s   




r  c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigr   )	printr   sysstderrr  
parse_argsvarspopr   )cmdr  r;   r   r   r   r   mainh  s   
r!  __main__r   )Fr    )Brc   r%   r  	itertoolsr   pathlibr   typingr   r   r   r   r   r	   r   r   r   r6   r]   tqdmr
   	typeguardr   %espnet2.enh.loss.criterions.tf_domainr   'espnet2.enh.loss.criterions.time_domainr   $espnet2.enh.loss.wrappers.pit_solverr   espnet2.fileio.sound_scpr   espnet2.tasks.enhr   espnet2.tasks.enh_s2tr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.train.abs_espnet_modelr   espnet2.utilsr   espnet2.utils.typesr   r   r   espnet.utils.cli_utilsr   finfoget_default_dtypeepsEPSr   r*   r>   r?   r   r   r   r   r   r   r  r!  r3   r   r   r   r   <module>   s    

  !
	

i 

	
