o
    i5                     @   s:  d Z ddlZddlZddlmZmZmZmZmZm	Z	 ddl
ZddlZddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@ e&deAe6e8e4de.ddZBe&d eAeed!edd"d#ZCe&d$eAeeed%ed&dZDe&d'eAe0d(e.dd"d#ZEe&d)eAe2d*e.dd"d#ZFe&d+eAeed!edd"d#ZGe&d,eAeed!edd"d#ZHG d-d. d.e!ZIdS )/GAN-based text-to-speech task.    N)Callable
CollectionDictListOptionalTuple)check_argument_typescheck_return_type)	AbsGANTTS)ESPnetGANTTSModel)JETS)JointText2Wav)VITS)AbsNormalize)	GlobalMVN)UtteranceMVN)AbsTaskoptim_classes)g2p_choices)ClassChoices)CommonCollateFn)
GANTrainer)CommonPreprocessor)AbsFeatsExtract)Dio)Energy)LinearSpectrogram)LogMelFbank)LogSpectrogram)get_default_kwargs)NestedDictAction)int_or_nonestr2boolstr_or_nonefeats_extract)fbanklog_spectrogramlinear_spectrogramr(   )classes
type_checkdefault	normalize)
global_mvnutterance_mvnT)r)   r*   r+   optionaltts)vitsjoint_text2wavjetsr1   pitch_extract)dioenergy_extract)energypitch_normalizeenergy_normalizec                   @   sb  e Zd ZU dZdZeed< eee	e
eeegZeZedejfddZedejded	eeeeeeejf f  geee eeej f f f fd
dZ!edejded	e"eeeeej#f geeejf f  fddZ$e	ddeded	eedf fddZ%e	ddeded	eedf fddZ&edejd	e'fddZ(edejde'd	eej)j* fddZ+dS )
GANTTSTaskr      num_optimizersparserc                 C   s  t  sJ |jdd}|d}|dg7 }|jdtd dd |jdtd d	d |jd
tttdd |jdd}|jdt	ddd |jdt
dg ddd |jdtd dd |jdtdd |jdtg dd dd |jdttd d d | jD ]}|| qwd S )!NzTask related)descriptionrequired
token_listz--token_listzA text mapping int-id to token)typer+   helpz--odimz)The number of dimension of output featurez--model_confz&The keyword arguments for model class.)actionr+   rB   zPreprocess relatedz--use_preprocessorTz"Apply preprocessing to data or notz--token_typephn)bpecharwordrD   z7The text will be tokenized in the specified level token)rA   r+   choicesrB   z
--bpemodelzThe model file of sentencepiecez--non_linguistic_symbolsz non_linguistic_symbols file path)rA   rB   z	--cleaner)Ntacotronjaconv
vietnamesekorean_cleanerzApply text cleaning)rA   rH   r+   rB   z--g2pz&Specify g2p method if --token_type=phn)r	   add_argument_groupget_defaultadd_argumentr$   r"   r!   r    r   r#   strr   class_choices_listadd_arguments)clsr=   groupr?   class_choices rV   I/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/gan_tts.pyadd_task_arguments   s   



zGANTTSTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ tddg ddS )Ng        r   )spembssidslids)float_pad_valueint_pad_valuenot_sequence)r	   r   )rS   rY   rZ   rV   rV   rW   build_collate_fn   s   
zGANTTSTask.build_collate_fnc              	   C   sH   t  sJ |jrt||j|j|j|j|j|jd}nd }t	|s"J |S )N)rZ   
token_typer@   bpemodelnon_linguistic_symbolstext_cleanerg2p_type)
r	   use_preprocessorr   rc   r@   rd   re   cleanerg2pr
   )rS   rY   rZ   retvalrV   rV   rW   build_preprocess_fn   s   

zGANTTSTask.build_preprocess_fnTF	inference.c                 C      |sd}|S d}|S )N)textspeech)ro   rV   rS   rZ   rm   rk   rV   rV   rW   required_data_names   s
   zGANTTSTask.required_data_namesc                 C   rn   )N)r\   	durationspitchr7   r]   r^   )r\   rp   rs   rt   r7   r]   r^   rV   rq   rV   rV   rW   optional_data_names   s
   	zGANTTSTask.optional_data_namesc              
   C   s   t  sJ t|jtr/t|jdd}dd |D }W d    n1 s$w   Y  | |_nt|jttfr=|j }ntdt	|}t
d|  |jd u ret|j}|di |j}| }nd |_d |_d }|j}|jd urt|j}|di |j}	nd }	t|j}
|
d||d|j}d }d }d }d }t|dd d urt|j}|di |j}t|d	d d urt|j}|di |j}t|d
d d urt|j }|di |j!}t|dd d urt"|j#}|di |j$}t%d||	|||||d|j&}t'|sJ |S )Nzutf-8)encodingc                 S   s   g | ]}|  qS rV   )rstrip).0linerV   rV   rW   
<listcomp>  s    z*GANTTSTask.build_model.<locals>.<listcomp>ztoken_list must be str or dictzVocabulary size: )idimodimr4   r6   r8   r9   )r%   r,   r4   r8   r6   r9   r0   rV   )(r	   
isinstancer@   rP   opencopytuplelistRuntimeErrorlenlogginginfor|   feats_extractor_choices	get_classr%   feats_extract_confoutput_sizer,   normalize_choicesnormalize_conftts_choicesr0   tts_confgetattrpitch_extractor_choicesr4   pitch_extract_confenergy_extractor_choicesr6   energy_extract_confpitch_normalize_choicesr8   pitch_normalize_confenergy_normalize_choicesr9   energy_normalize_confr   
model_confr
   )rS   rY   fr@   
vocab_sizefeats_extract_classr%   r|   normalize_classr,   	tts_classr0   r4   r6   r8   r9   pitch_extract_classenergy_extract_classpitch_normalize_classenergy_normalize_classmodelrV   rV   rW   build_model  s   




zGANTTSTask.build_modelr   c           	      C   sd  t |jdsJ t |jdsJ t|j}|d u r'tdtt d|j |jrOzdd l}W n t	y;   t
dw |jjjd|jj |d|j}n||jj fi |j}|g}t|j}|d u rvtdtt d|j |jrzdd l}W n t	y   t
dw |jjjd|jj |d|j}n||jj fi |j}||g7 }|S )	N	generatordiscriminatorzmust be one of z: r   z/Requiring fairscale. Do 'pip install fairscale')paramsoptimrV   )hasattrr0   r   getr   
ValueErrorr   sharded_ddp	fairscaleImportErrorr   ossOSSr   
parameters
optim_confoptim2r   optim2_conf)	rS   rY   r   optim_g_classr   optim_g
optimizersoptim_d_classoptim_drV   rV   rW   build_optimizerso  sZ   






zGANTTSTask.build_optimizersN)TF),__name__
__module____qualname____doc__r<   int__annotations__r   r   r   r   r   r   r   rQ   r   trainerclassmethodargparseArgumentParserrX   	Namespaceboolr   r   r   rP   r   npndarrayr   torchTensorrb   r   arrayrl   rr   ru   r   r   r   	Optimizerr   rV   rV   rV   rW   r:   h   s|   
 J&


Y
r:   )Jr   r   r   typingr   r   r   r   r   r   numpyr   r   	typeguardr	   r
   espnet2.gan_tts.abs_gan_ttsr   espnet2.gan_tts.espnet_modelr   espnet2.gan_tts.jetsr   espnet2.gan_tts.jointr   espnet2.gan_tts.vitsr   espnet2.layers.abs_normalizer   espnet2.layers.global_mvnr   espnet2.layers.utterance_mvnr   espnet2.tasks.abs_taskr   r   espnet2.text.phoneme_tokenizerr   espnet2.train.class_choicesr   espnet2.train.collate_fnr   espnet2.train.gan_trainerr   espnet2.train.preprocessorr   +espnet2.tts.feats_extract.abs_feats_extractr   espnet2.tts.feats_extract.dior    espnet2.tts.feats_extract.energyr   ,espnet2.tts.feats_extract.linear_spectrogramr   'espnet2.tts.feats_extract.log_mel_fbankr   )espnet2.tts.feats_extract.log_spectrogramr    espnet2.utils.get_default_kwargsr     espnet2.utils.nested_dict_actionr!   espnet2.utils.typesr"   r#   r$   dictr   r   r   r   r   r   r   r:   rV   rV   rV   rW   <module>   s    



