o
    iM                     @   s  d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
Zd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z! d dlm"Z"m#Z# d dlm$Z% d dlm&Z' d dlm(Z( d dl)m*Z* d dl)m+Z, d dl)mZ- d dl)m Z. d dl)m"Z/ d dl)m0Z0 d dl)m#Z1 d dl)m(Z2 d dl3m4Z4 d dl3mZ5 d dl3m Z6 d dl3m7Z8 d dl3m9Z: d dl;m<Z< d dl;mZ= d dl;m Z> d dl;m?Z@ d dl;mAZB d dl;m$ZC d dl;m&ZD d dlEmFZF d dlGmHZH d dlImJZJ d dlKmLZLmMZMmNZN d d lOmPZP d d!lQmRZR d d"lSmTZT d d#lUmVZVmWZWmXZX eYe6ZZd$eZ_[eYe5Z\d%e\_[eYe:Z]d&e]_[eYe8Z^d'e^_[eYe'Z_d(e__[eYe!Z`d)e`_[eYe%Zad*ea_[eYeZbd+eb_[eYeDZcd,ec_[eYe>Zdd-ed_[eYeCZed.ee_[eYe=Zfd/ef_[eYe@Zgd0eg_[eYeBZhd1eh_[eYe/Zid2ei_[eYe2Zjd3ej_[eYe1Zkd4ek_[eYe.Zld5el_[eYe-Zmd6em_[eYe,Znd7en_[d8Zoepe4ee<e*d9Zqg d:Zrg d;Zsg d<Ztg d=ZuG d>d? d?eZvdS )@    N)Callable
CollectionDictListOptionalTuple)check_argument_typescheck_return_type)CTC)ESPnetASRModel)ESPnetDiarizationModel)ESPnetEnhS2TModel)ESPnetEnhancementModel)AbsTask)ASRTask)decoder_choices)encoder_choices)frontend_choicesnormalize_choices)postencoder_choices)preencoder_choices)specaug_choices)DiarizationTask)attractor_choices)r   )label_aggregator_choices)r   )EnhancementTask)mask_module_choices)separator_choices)STTask)extra_asr_decoder_choices)extra_mt_decoder_choices)g2p_choices)
initialize)CommonCollateFn)CommonPreprocessorCommonPreprocessor_multi MutliTokenizerCommonPreprocessor)Trainer)get_default_kwargs)NestedDictAction)int_or_nonestr2boolstr_or_noneenh_encoderenh_decoderenh_separatorenh_mask_moduleasr_preencoderasr_encoderasr_postencoderasr_decoderst_preencoder
st_encoderst_postencoder
st_decoderst_extra_asr_decoderst_extra_mt_decoderdiar_frontenddiar_specaugdiar_normalizediar_encoderdiar_decoderdiar_attractord   enhasrstdiar)	encoderencoder_conf	separatorseparator_confmask_modulemask_module_confdecoderdecoder_conf
criterions)
token_list
input_sizefrontendfrontend_confspecaugspecaug_conf	normalizenormalize_conf
preencoderpreencoder_confrG   rH   postencoderpostencoder_confrM   rN   ctc_conf)rP   src_token_listrQ   rR   rS   rT   rU   rV   rW   rX   rY   rG   rH   rZ   r[   rM   rN   r\   extra_asr_decoderextra_asr_decoder_confextra_mt_decoderextra_mt_decoder_conf)rQ   num_spkrR   rS   rT   rU   rV   rW   rG   rH   rM   rN   	attractorattractor_conflabel_aggregatorlabel_aggregator_confc                   @   s\  e Zd ZU dZeed< eeee	e
eeeeeeeeeeeeeeeeeeegZeZ e!de"j#fddZ$e!de"j%de&de'e(e)e*e+e*e,j-f f  ge)e.e* e+e*e/j0f f f fd	d
Z1e!de"j%de&de2e'e*e+e*e,j3f ge+e*e,j-f f  fddZ4e!	dde&de&de)e*df fddZ5e!	dde&de&de)e*df fddZ6e!de"j%de7fddZ8dS )
EnhS2TTask   num_optimizersparserc                 C   s  |j dd}|jdtd dd |jdtd dd |jdd	d
 d dg dd |jdtd dd |jdtttdd |jdtdi di dgdd |jdtd dd |jdtd dd |jdtttdd |jdtttdd |jdtttd d |jd!ttt	d"d |jd#t
d$d%g d&d'd( |jd)tttd*d |j d+d}|jd,td-d.d |jd/t
d0g d1d2d3 |jd4td d5d |jd6t
d0g d1d7d3 |jd8td d9d |jd:td;d< |jd=tg d>d d?d@ |jdAttd dBd@ |jdCd$dDgt
dEdF | jD ]}|| qd S )GNzTask related)descriptionz--token_listzA text mapping int-id to token)typedefaulthelpz--src_token_listz4A text mapping int-id to token (for source language)z--initc                 S   s   t |  S )N)r,   lower)x rq   I/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/enh_s2t.py<lambda>  s    z/EnhS2TTask.add_task_arguments.<locals>.<lambda>zThe initialization method)chainerxavier_uniformxavier_normalkaiming_uniformkaiming_normalN)rl   rm   rn   choicesz--input_sizez,The number of input dimension of the featurez
--ctc_confz$The keyword arguments for CTC class.)actionrm   rn   z--enh_criterionssi_snrfixed_order)nameconfwrapperwrapper_confz-The criterions binded with the loss wrappers.z--diar_num_spkzCThe number of speakers (for each recording) for diar submodel classz--diar_input_sizez--enh_model_confz-The keyword arguments for enh submodel class.z--asr_model_confz-The keyword arguments for asr submodel class.z--st_model_confz,The keyword arguments for st submodel class.z--diar_model_confz.The keyword arguments for diar submodel class.z--subtask_series+)rC   rD   rB   z'The series of subtasks in the pipeline.)rl   nargsrm   ry   rn   z--model_confz&The keyword arguments for model class.zPreprocess relatedz--use_preprocessorFz"Apply preprocessing to data or notz--token_typebpe)r   charwordphnz7The text will be tokenized in the specified level token)rl   rm   ry   rn   z
--bpemodelzThe model file of sentencepiecez--src_token_typez>The source text will be tokenized in the specified level tokenz--src_bpemodelz5The model file of sentencepiece (for source language)z--non_linguistic_symbolsz non_linguistic_symbols file path)rl   rn   z	--cleaner)Ntacotronjaconv
vietnamesezApply text cleaning)rl   ry   rm   rn   z--g2pz&Specify g2p method if --token_type=phnz--text_nametextz8Specify the text_name attribute used in the preprocessor)r   rm   rl   rn   )add_argument_groupadd_argumentr,   r*   r)   r(   r
   r   r   r   strr   r+   r!   class_choices_listadd_arguments)clsrj   groupclass_choicesrq   rq   rr   add_task_arguments   s6  	
zEnhS2TTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ tdddS )Ng        )float_pad_valueint_pad_value)r   r#   )r   r   r   rq   rq   rr   build_collate_fn  s   
zEnhS2TTask.build_collate_fnc              
   C   sr  t  sJ |jrd|jv rtdi d|d|j|jgd|j|jgd|j|j	gd|j
d|jd|jd	t|d	r=|jnd d
t|d
rH|jnddt|drS|jnd dt|dr^|jnddt|dri|jnddt|drt|jnddt|dr|jnd dddddg}n%d|jv rt|d}nt||j|j|j|j
t|ddg|j|jd}nd }t|sJ |S )NrE   r   
token_typerP   bpemodelnon_linguistic_symbolstext_cleanerg2p_typerir_scprir_apply_probg      ?	noise_scpnoise_apply_probnoise_db_range13_15short_noise_thresg      ?speech_volume_normalizespeech_namespeech	text_namer   src_textrF   )r   )r   r   rP   r   r   r   r   r   rq   )r   use_preprocessorsubtask_seriesr&   r   src_token_typerP   r]   r   src_bpemodelr   cleanerg2phasattrr   r   r   r   r   r   r   r$   r%   getattrr	   )r   r   r   retvalrq   rq   rr   build_preprocess_fn  s   

	
zEnhS2TTask.build_preprocess_fnTF	inference.c                 C   s   |sd}|S d}|S )N)r   speech_ref1)r   rq   )r   r   r   r   rq   rq   rr   required_data_names  s
   zEnhS2TTask.required_data_namesc                 C   s   ddg}d|v r
dnd}|dd t |td D 7 }|dd t dtd D 7 }|d	d t dtd D 7 }|d
g7 }t|}t|sEJ |S )Nr   dereverb_ref1r      rh   c                 S      g | ]}d  |qS )zspeech_ref{}format.0nrq   rq   rr   
<listcomp>      z2EnhS2TTask.optional_data_names.<locals>.<listcomp>c                 S   r   )znoise_ref{}r   r   rq   rq   rr   r     r   c                 S   r   )z
text_spk{}r   r   rq   rq   rr   r     r   r   )rangeMAX_REFERENCE_NUMtupler	   )r   r   r   r   rE   rq   rq   rr   optional_data_names  s   
zEnhS2TTask.optional_data_namesc           	      C   s(  t  sJ |j }t|jD ]j\}}td td| dd}t| dD ] }t||d | d d ur>t||d | d nt||d ||< q&|dv rNd}n|dv rU|}nt| d	t	
d
| d|  t| tjdi ||| d< qtdi |}|jd urt||j t|sJ |S )Nzargs._model_conf)init
model_conf_attributes_)rD   rE   rF   s2t)rC   z not supported.z	Building z task model, using config: _modelrq   )r   r   copy	enumerater   dictevalr   
ValueErrorlogginginfo	name2taskbuild_modelargparse	Namespacer   r   r"   r	   )	r   r   r   r   subtasksubtask_confattr	m_subtaskmodelrq   rq   rr   r     s2   



zEnhS2TTask.build_modelN)TF)9__name__
__module____qualname__ri   int__annotations__enh_encoder_choicesenh_separator_choicesenh_decoder_choicesenh_mask_module_choicesr   r   r   asr_preencoder_choicesasr_encoder_choicesasr_postencoder_choicesasr_decoder_choicesst_preencoder_choicesst_encoder_choicesst_postencoder_choicesst_decoder_choicesst_extra_asr_decoder_choicesst_extra_mt_decoder_choicesdiar_frontend_choicesdiar_specaug_choicesdiar_normalize_choicesdiar_encoder_choicesdiar_decoder_choicesr   diar_attractor_choicesr   r'   trainerclassmethodr   ArgumentParserr   r   boolr   r   r   r   r   npndarrayr   torchTensorr   r   arrayr   r   r   r   r   rq   rq   rq   rr   rg      s   
 4 6
&5


rg   )wr   r   r   typingr   r   r   r   r   r   numpyr   r   	typeguardr   r	   espnet2.asr.ctcr
   espnet2.asr.espnet_modelr   espnet2.diar.espnet_modelr    espnet2.enh.espnet_enh_s2t_modelr   espnet2.enh.espnet_modelr   espnet2.tasks.abs_taskr   espnet2.tasks.asrr   r   asr_decoder_choices_r   asr_encoder_choices_r   r   r   asr_postencoder_choices_r   asr_preencoder_choices_r   espnet2.tasks.diarr   r   diar_attractor_choices_diar_decoder_choices_diar_encoder_choices_diar_front_end_choices_r   diar_normalize_choices_diar_specaug_choices_espnet2.tasks.enhr   enh_decoder_choices_enh_encoder_choices_r   enh_mask_module_choices_r   enh_separator_choices_espnet2.tasks.str   st_decoder_choices_st_encoder_choices_r   st_extra_asr_decoder_choices_r    st_extra_mt_decoder_choices_st_postencoder_choices_st_preencoder_choices_espnet2.text.phoneme_tokenizerr!   espnet2.torch_utils.initializer"   espnet2.train.collate_fnr#   espnet2.train.preprocessorr$   r%   r&   espnet2.train.trainerr'    espnet2.utils.get_default_kwargsr(    espnet2.utils.nested_dict_actionr)   espnet2.utils.typesr*   r+   r,   deepcopyr   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   enh_attributesasr_attributesst_attributesdiar_attributesrg   rq   rq   rq   rr   <module>   s     



















