o
    iH                     @   s(  d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
Zd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF d dlGmHZH d dlImJZJ d dlKmLZL d dlMmNZN d dlOmPZP d d lQmRZR d d!lSmTZT d d"lUmVZV d d#lWmXZX d d$lYmZZZ d d%l[m\Z\ d d&l]m^Z^ d d'l_m`Z` d d(lambZb d d)lcmdZd d d*lemfZf d d+lgmhZh d d,limjZj d d-lkmlZl d d.lmmnZn d d/lompZp d d0lqmrZr d d1lsmtZtmuZumvZv d d2lwmxZx d d3lymzZz d d4l{m|Z| d d5l}m~Z~mZ end6ee%e!e#d7ed8d9Zend:edZi d;eFd<eHd=eJd>eLd?eNd@eRdAePdBeTdCeVdDeXdEe^dFe`dGebdHeddIefdJe\dKedLeZeDdEd9ZendMeedNedOd9ZendPeeeed7ed8d9ZendQeeBe<e@e:e>dRe8dd9ZendSee0e+e2e3e4e-e,e-e5e.e.e6e1dTe)dd9ZendUeeuevdVetdd9ZdWZG dXdY dYejZdS )[    N)Callable
CollectionDictListOptionalTuple)check_argument_typescheck_return_type)AbsMask)	MultiMask)TCNSeparatorNomask)
AbsDecoder)ConvDecoder)NullDecoder)STFTDecoder)
AbsEncoder)ConvEncoder)NullEncoder)STFTEncoder)ESPnetEnhancementModel)
AbsEnhLoss)FrequencyDomainAbsCoherenceFrequencyDomainDPCLFrequencyDomainL1FrequencyDomainMSE)	CISDRLossMultiResL1SpecLossSDRLoss	SISNRLossSNRLossTimeDomainL1TimeDomainMSE)AbsLossWrapper)
DPCLSolver)FixedOrderSolver)MixITSolver)MultiLayerPITSolver)	PITSolver)AbsSeparator)AsteroidModel_Converter)ConformerSeparator)DANSeparator)DC_CRNSeparator)DCCRNSeparator)DPCLE2ESeparator)DPCLSeparator)DPRNNSeparator)DPTNetSeparator)FaSNetSeparator)iNeuBe)NeuralBeamformer)RNNSeparator)SkiMSeparator)SVoiceSeparator)TCNSeparator)TransformerSeparator)AbsIterFactory)AbsTask)
initialize)ClassChoices)CommonCollateFn)DistributedOption)AbsPreprocessorDynamicMixingPreprocessorEnhPreprocessor)Trainer)get_default_kwargs)NestedDictAction)str2boolstr_or_noneencoder)stftconvsamerI   )nameclasses
type_checkdefault	separatorasteroid	conformerdandc_crndccrndpcldpcl_e2edprnndptnetfasnetrnnskimsvoicetcntransformerwpe_beamformer
tcn_nomaskineubemask_module)
multi_maskrd   decoderloss_wrappers)pitfixed_ordermultilayer_pitrV   mixit
criterions)ci_sdrcohsdrsi_snrsnrl1rV   l1_fdl1_tdmsemse_fdmse_td	mr_l1_tfdpreprocessor)dynamic_mixingenhd   c                       sf  e Zd ZU dZeed< eeee	e
gZeZedejfddZedejdedeeeeeeejf f  geee eeejf f f fd	d
Zedejdedeeeeeej f geeejf f  fddZ!e	ddededeedf fddZ"e	ddededeedf fddZ#edejde$fddZ%e	ddejde&dede'de(f
 fddZ)  Z*S )EnhancementTask   num_optimizersparserc                 C   s  |j dd}|jddd d dg dd |jd	tttd
d |jdtdi di dgdd |j dd}|jdtd dd |jdtd dd |jdtddd |jdtd dd |jdtddd |jdtdd d |jd!td"d#d |jd$td%d&d |jd't	d(d)d |jd*t	d(d+d |jd,t	d-d.d |jd/td%d0d |jd1td%d2d |jd3td d4d |jd5td6d7d | j
D ]}|| qd S )8NzTask related)descriptionz--initc                 S   s   t |  S N)rG   lower)x r   E/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/enh.py<lambda>   s    z4EnhancementTask.add_task_arguments.<locals>.<lambda>zThe initialization method)chainerxavier_uniformxavier_normalkaiming_uniformkaiming_normalN)typerO   helpchoicesz--model_confz&The keyword arguments for model class.)actionrO   r   z--criterionsro   rh   )rL   confwrapperwrapper_confz-The criterions binded with the loss wrappers.zPreprocess relatedz--speech_volume_normalizezScale the maximum amplitude to the given value or range. e.g. --speech_volume_normalize 1.0 scales it to 1.0.
--speech_volume_normalize 0.5_1.0 scales it to a random number in the range [0.5, 1.0))r   rO   r   z	--rir_scpzThe file path of rir scp file.z--rir_apply_prob      ?z-THe probability for applying RIR convolution.z--noise_scpz The file path of noise scp file.z--noise_apply_probz&The probability applying Noise adding.z--noise_db_range13_15z:The range of signal-to-noise ratio (SNR) level in decibel.z--short_noise_thres      ?znIf len(noise) / len(speech) is smaller than this threshold during dynamic mixing, a warning will be displayed.z--use_reverberant_refFzEWhether to use reverberant speech references instead of anechoic onesz	--num_spkr}   z'Number of speakers in the input signal.z--num_noise_typezNumber of noise types.z--sample_rate@  z"Sampling rate of the data (in Hz).z--force_single_channelz/Whether to force all data to be single-channel.z--dynamic_mixingzApply dynamic mixingz	--utt2spkz@The file path of utt2spk file. Only used in dynamic_mixing mode.z--dynamic_mixing_gain_db        z.Random gain (in dB) for dynamic mixing sources)add_argument_groupadd_argumentrE   rD   r   rG   floatstrrF   intclass_choices_listadd_arguments)clsr   groupclass_choicesr   r   r   add_task_arguments   s   	
z"EnhancementTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ tdddS )Nr   r   )float_pad_valueint_pad_value)r   r>   )r   r   r   r   r   r   build_collate_fnU  s   
z EnhancementTask.build_collate_fnc                 C   s  t  sJ t|dd d u}|r|jdkr]|rZt|j|tjtj|j	d d |j
dd|j
d|jd |j
dd	|j
d
d|j
dd|j
dd t|dd d}nd }n|jdkrt|j|t|drp|jnd t|dry|jndt|dr|jnd t|dr|jndt|dr|jndt|dr|jndt|dr|jnd t|dr|jnd t|dr|jndt|dr|jndt|dr|jnd t|d!r|jnd"d#}ntd$|j d%d }t|sJ |S )&Nrx   ry   r   source_scp_namezspk1.scpref_numnum_spkdynamic_mixing_gain_dbr   speech_name
speech_mixspeech_ref_name_prefix
speech_refmixture_source_nameutt2spk)r   
source_scpr   r   r   r   r   r   rz   rir_scprir_apply_probr   	noise_scpnoise_apply_probnoise_db_ranger   short_noise_thresr   speech_volume_normalizeuse_reverberant_refr}   num_noise_typesample_rater   force_single_channelF)r   r   r   r   r   r   r   r   r   r   r   r   r   zPreprocessor type z is not supported.)r   getattrrx   preprocessor_choices	get_classospathjoindirname!train_data_path_and_name_and_typepreprocessor_confgetseparator_confhasattrr   r   r   r   r   r   r   r   r   r   r   r   
ValueErrorr	   )r   r   r   use_preprocessorretvalr   r   r   build_preprocess_fn`  s   





#z#EnhancementTask.build_preprocess_fnTF	inference.c                 C   s   |sd}|S d}|S )N)speech_ref1)r   r   r   r   r   r   r   r   r   required_data_names  s
   z#EnhancementTask.required_data_namesc                 C   sr   dg}|dd t dtd D 7 }|dd t dtd D 7 }|dd t dtd D 7 }t|}t|s7J |S )Nr   c                 S      g | ]}d  |qS )zdereverb_ref{}format.0nr   r   r   
<listcomp>      z7EnhancementTask.optional_data_names.<locals>.<listcomp>r}   c                 S   r   )zspeech_ref{}r   r   r   r   r   r     r      c                 S   r   )znoise_ref{}r   r   r   r   r   r     r   )rangeMAX_REFERENCE_NUMtupler	   r   r   r   r   optional_data_names  s   z#EnhancementTask.optional_data_namesc                 C   s6  t  sJ t|jd
i |j}t|j|jfi |j}t	|j
d
i |j}|jdrAt|jd
d|ji|j}nd }g }t|dd d urz|jD ])}|di }t|d d
i |}	t|d d
d|	i|d }
||
 qPtd
|||||d	|j}|jd urt||j t|sJ |S )Nnomask	input_dimrk   r   rL   r   	criterionr   )rH   rP   re   rf   rc   r   )r   encoder_choicesr   rH   encoder_confseparator_choicesrP   
output_dimr   decoder_choicesre   decoder_confendswithmask_module_choicesrc   mask_module_confr   rk   r   criterion_choicesloss_wrapper_choicesappendr   
model_confinitr<   r	   )r   r   rH   rP   re   rc   rf   ctrcriterion_confr   loss_wrappermodelr   r   r   build_model  sN   



zEnhancementTask.build_modelNdistributed_optionmodekwargsc                    sD   t |dd}|r|dkrt|}|jdd |_t ||||S )Nry   Fr   r   r}   )r   copydeepcopyfold_lengthsuperbuild_iter_factory)r   r   r   r   r   ry   	__class__r   r   r     s
   	
z"EnhancementTask.build_iter_factory)TFr   )+__name__
__module____qualname__r~   r   __annotations__r   r   r   r   r   r   rC   trainerclassmethodargparseArgumentParserr   	Namespaceboolr   r   r   r   r   npndarrayr   torchTensorr   r   arrayr   r   r   r   r   r?   dictr:   r   __classcell__r   r   r   r   r|      s   
  
&W


/r|   r   )r  r   r   typingr   r   r   r   r   r   numpyr
  r  	typeguardr   r	   espnet2.diar.layers.abs_maskr
   espnet2.diar.layers.multi_maskr   +espnet2.diar.separator.tcn_separator_nomaskr   espnet2.enh.decoder.abs_decoderr    espnet2.enh.decoder.conv_decoderr    espnet2.enh.decoder.null_decoderr    espnet2.enh.decoder.stft_decoderr   espnet2.enh.encoder.abs_encoderr    espnet2.enh.encoder.conv_encoderr    espnet2.enh.encoder.null_encoderr    espnet2.enh.encoder.stft_encoderr   espnet2.enh.espnet_modelr   $espnet2.enh.loss.criterions.abs_lossr   %espnet2.enh.loss.criterions.tf_domainr   r   r   r   'espnet2.enh.loss.criterions.time_domainr   r   r   r   r   r    r!   %espnet2.enh.loss.wrappers.abs_wrapperr"   %espnet2.enh.loss.wrappers.dpcl_solverr#   %espnet2.enh.loss.wrappers.fixed_orderr$   &espnet2.enh.loss.wrappers.mixit_solverr%   /espnet2.enh.loss.wrappers.multilayer_pit_solverr&   $espnet2.enh.loss.wrappers.pit_solverr'   #espnet2.enh.separator.abs_separatorr(   %espnet2.enh.separator.asteroid_modelsr)   )espnet2.enh.separator.conformer_separatorr*   #espnet2.enh.separator.dan_separatorr+   &espnet2.enh.separator.dc_crn_separatorr,   %espnet2.enh.separator.dccrn_separatorr-   (espnet2.enh.separator.dpcl_e2e_separatorr.   $espnet2.enh.separator.dpcl_separatorr/   %espnet2.enh.separator.dprnn_separatorr0   &espnet2.enh.separator.dptnet_separatorr1   &espnet2.enh.separator.fasnet_separatorr2   &espnet2.enh.separator.ineube_separatorr3   'espnet2.enh.separator.neural_beamformerr4   #espnet2.enh.separator.rnn_separatorr5   $espnet2.enh.separator.skim_separatorr6   &espnet2.enh.separator.svoice_separatorr7   #espnet2.enh.separator.tcn_separatorr8   +espnet2.enh.separator.transformer_separatorr9   "espnet2.iterators.abs_iter_factoryr:   espnet2.tasks.abs_taskr;   espnet2.torch_utils.initializer<   espnet2.train.class_choicesr=   espnet2.train.collate_fnr>   espnet2.train.distributed_utilsr?   espnet2.train.preprocessorr@   rA   rB   espnet2.train.trainerrC    espnet2.utils.get_default_kwargsrD    espnet2.utils.nested_dict_actionrE   espnet2.utils.typesrF   rG   r  r   r   r   r   r   r   r   r   r|   r   r   r   r   <module>   s@    $		

