o
    i7                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
mZmZ ddlZddlZddlZddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z? ddl@mAZA ddlBmCZC ddlDmEZE ddlFmGZG dd lHmIZI dd!lJmKZKmLZLmMZM e#d"eNe;e=e9d#e3d$d%ZOe#d&eNe5d'e3dd(d)ZPe#d*eNe7d+e3dd(d)ZQe#d,eNed-ed.d(d)ZRe#d/eNed-edd(d)ZSe#d0eNed-edd(d)ZTe#d1eNe?eAe/e1eeed2e+d3d%ZUG d4d5 d5eZVdS )6zText-to-speech task.    N)Path)Callable
CollectionDictListOptionalTupleUnion)check_argument_typescheck_return_type)JETS)JointText2Wav)VITS)AbsNormalize)	GlobalMVN)AbsTask)g2p_choices)ClassChoices)CommonCollateFn)CommonPreprocessor)Trainer)AbsTTS)ESPnetTTSModel)
FastSpeech)FastSpeech2)AbsFeatsExtract)Dio)Energy)LinearSpectrogram)LogMelFbank)LogSpectrogram)	Tacotron2)Transformer) ParallelWaveGANPretrainedVocoder)get_default_kwargs)Spectrogram2Waveform)NestedDictAction)int_or_nonestr2boolstr_or_nonefeats_extract)fbankspectrogramlinear_spectrogramr+   )classes
type_checkdefaultpitch_extract)dioT)r.   r/   r0   optionalenergy_extract)energy	normalize)
global_mvnr7   pitch_normalizeenergy_normalizetts)	tacotron2transformer
fastspeechfastspeech2vitsjoint_text2wavjetsr;   c                   @   sv  e Zd ZU dZeed< eeee	e
eegZeZedejfddZedejdedeeeeeeejf f  geee eeejf f f fd	d
Z edejdede!eeeeej"f geeejf f  fddZ#e	ddededeedf fddZ$e	ddededeedf fddZ%edejde&fddZ'e				d de(e)ef de(e)ef de!e& defddZ*dS )!TTSTask   num_optimizersparserc                 C   s  t  sJ |jdd}|d}|dg7 }|jdtd dd |jdtd d	d |jd
tttdd |jdd}|jdt	ddd |jdt
dg ddd |jdtd dd |jdtdd |jdtg dd dd |jdttd d d | jD ]}|| qwd S )!NzTask related)descriptionrequired
token_listz--token_listzA text mapping int-id to token)typer0   helpz--odimz)The number of dimension of output featurez--model_confz&The keyword arguments for model class.)actionr0   rJ   zPreprocess relatedz--use_preprocessorTz"Apply preprocessing to data or notz--token_typephn)bpecharwordrL   z7The text will be tokenized in the specified level token)rI   r0   choicesrJ   z
--bpemodelzThe model file of sentencepiecez--non_linguistic_symbolsz non_linguistic_symbols file path)rI   rJ   z	--cleaner)Ntacotronjaconv
vietnamesekorean_cleanerzApply text cleaning)rI   rP   r0   rJ   z--g2pz&Specify g2p method if --token_type=phn)r
   add_argument_groupget_defaultadd_argumentr)   r'   r&   r$   r   r(   strr   class_choices_listadd_arguments)clsrE   grouprG   class_choices r^   E/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/tasks/tts.pyadd_task_arguments   s   



zTTSTask.add_task_argumentsargstrainreturnc                 C   s   t  sJ tddg ddS )Ng        r   )spembssidslids)float_pad_valueint_pad_valuenot_sequence)r
   r   )r[   ra   rb   r^   r^   r_   build_collate_fn   s   
zTTSTask.build_collate_fnc              	   C   sH   t  sJ |jrt||j|j|j|j|j|jd}nd }t	|s"J |S )N)rb   
token_typerH   bpemodelnon_linguistic_symbolstext_cleanerg2p_type)
r
   use_preprocessorr   rk   rH   rl   rm   cleanerg2pr   )r[   ra   rb   retvalr^   r^   r_   build_preprocess_fn   s   

zTTSTask.build_preprocess_fnTF	inference.c                 C      |sd}|S d}|S )N)textspeech)rw   r^   r[   rb   ru   rs   r^   r^   r_   required_data_names   s
   zTTSTask.required_data_namesc                 C   rv   )N)rd   	durationspitchr5   re   rf   )rd   rx   r{   r|   r5   re   rf   r^   ry   r^   r^   r_   optional_data_names   s
   	zTTSTask.optional_data_namesc              
   C   s  t  sJ t|jtr/t|jdd}dd |D }W d    n1 s$w   Y  | |_nt|jttfr=|j }ntdt	|}t
d|  |jd u ret|j}|di |j}| }nd |_d |_d }|j}|jd urt|j}|di |j}	nd }	t|j}
|
d||d|j}d }d }d }d }t|dd d urt|j}|jd	d d ur|jd	d |jd	d
ksJ n
|jd	d
|jd	< |di |j}t|dd d ur|jd	d d ur|jd	d |jd	d
ksJ n
|jd	d
|jd	< t|j}|di |j}t|dd d ur+t |j!}|di |j"}t|dd d urBt#|j$}|di |j%}t&d||||	|||d|j'}t(|sYJ |S )Nutf-8encodingc                 S   s   g | ]}|  qS r^   )rstrip).0liner^   r^   r_   
<listcomp>  s    z'TTSTask.build_model.<locals>.<listcomp>ztoken_list must be str or dictzVocabulary size: )idimodimr1   reduction_factorrC   r4   r8   r9   )r*   r1   r4   r6   r8   r9   r:   r^   ))r
   
isinstancerH   rX   opencopytuplelistRuntimeErrorlenlogginginfor   feats_extractor_choices	get_classr*   feats_extract_confoutput_sizer6   normalize_choicesnormalize_conftts_choicesr:   tts_confgetattrpitch_extractor_choicesr1   pitch_extract_confgetenergy_extract_confenergy_extractor_choicesr4   pitch_normalize_choicesr8   pitch_normalize_confenergy_normalize_choicesr9   energy_normalize_confr   
model_confr   )r[   ra   frH   
vocab_sizefeats_extract_classr*   r   normalize_classr6   	tts_classr:   r1   r4   r8   r9   pitch_extract_classenergy_extract_classpitch_normalize_classenergy_normalize_classmodelr^   r^   r_   build_model  s   








zTTSTask.build_modelNcpuvocoder_config_filevocoder_filer   devicec                 C   s   |d u rQi }|d ur*t |}|jddd}t|}W d    n1 s%w   Y  |jd ur7||j  d|v rJd|v rJd|v rJtd
i |S t	d d S t
|drbt||}||S t| d	)Nrr~   r   n_fftn_shiftfsz/Vocoder is not available. Skipped its building.z.pklz is not supported format.r^   )r   r   yaml	safe_loadr*   updateget_parametersr%   r   warningrX   endswithr#   to
ValueError)r[   r   r   r   r   vocoder_confr   vocoderr^   r^   r_   build_vocoder_from_files  s*   	


zTTSTask.build_vocoder_from_file)TF)NNNr   )+__name__
__module____qualname__rD   int__annotations__r   r   r   r   r   r   r   rY   r   trainerclassmethodargparseArgumentParserr`   	Namespaceboolr   r   r   rX   r   npndarrayr   torchTensorrj   r   arrayrt   rz   r}   r   r   r	   r   r   r^   r^   r^   r_   rB   h   s   
 J&


_

rB   )W__doc__r   r   pathlibr   typingr   r   r   r   r   r   r	   numpyr   r   r   	typeguardr
   r   espnet2.gan_tts.jetsr   espnet2.gan_tts.jointr   espnet2.gan_tts.vitsr   espnet2.layers.abs_normalizer   espnet2.layers.global_mvnr   espnet2.tasks.abs_taskr   espnet2.text.phoneme_tokenizerr   espnet2.train.class_choicesr   espnet2.train.collate_fnr   espnet2.train.preprocessorr   espnet2.train.trainerr   espnet2.tts.abs_ttsr   espnet2.tts.espnet_modelr   espnet2.tts.fastspeechr   espnet2.tts.fastspeech2r   +espnet2.tts.feats_extract.abs_feats_extractr   espnet2.tts.feats_extract.dior    espnet2.tts.feats_extract.energyr   ,espnet2.tts.feats_extract.linear_spectrogramr   'espnet2.tts.feats_extract.log_mel_fbankr   )espnet2.tts.feats_extract.log_spectrogramr    espnet2.tts.tacotron2r!   espnet2.tts.transformerr"   espnet2.tts.utilsr#    espnet2.utils.get_default_kwargsr$   espnet2.utils.griffin_limr%    espnet2.utils.nested_dict_actionr&   espnet2.utils.typesr'   r(   r)   dictr   r   r   r   r   r   r   rB   r^   r^   r^   r_   <module>   s    $

