o
    }oi?                     @   sd  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
lmZ d dlmZ d dlmZmZmZmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 dZ4zd dl5Z5W n e6y   dZ4Y nw e3G dd deZ7dS )    N)instantiate)Trainer)WandbLogger)
DictConfig	OmegaConf)autocast)
functional)DistributedBucketSampler)DiscriminatorLossFeatureMatchingLossGeneratorLossKlLoss)TextToWaveform)MultiPeriodDiscriminator)clip_grad_value_g2p_backward_compatible_supportplot_spectrogram_to_numpyslice_segments)	SpeakerID)PretrainedModelInfo	typecheck)AudioSignal	FloatTypeIndexIntType
TokenIndex)
NeuralType)CosineAnnealing)loggingmodel_utils)experimentalTFc                       s0  e Zd Zd2deddf fddZdd Zd3d
edejfddZ	dd Z
eede ede d	dede d	dede d	dede d	dede d	dddd4ddZdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zed5d)d*Zed+ede d	did,ed-e id.dd/d0d1Z  ZS )6	VitsModelNcfgtrainerr   c                    s   t |}t |}d | _d | _i | _| | d | _| | | jd us'J t	| jj
}| jj| _t j||d t|j|jjjd| _t | _t | _t | _t | _t|j||jd d |j|j  | jd| _!t"|j#| _$d| _%d S )N)r"   r#   )highfreq      )n_vocabspec_channelssegment_sizepadding_idxF)&r   #convert_model_config_to_dict_configmaybe_update_config_version
normalizertext_normalizer_calltext_normalizer_call_kwargs_setup_normalizer	tokenizer_setup_tokenizerlentokenspadtokenizer_padsuper__init__r   preprocessortrain_dsdatasetr$   audio_to_melspec_processorr   feat_matching_lossr
   	disc_lossr   gen_lossr   kl_losssynthesizern_fftr)   n_window_stridenet_gr   use_spectral_normnet_dautomatic_optimization)selfr"   r#   
num_tokens	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/vits.pyr8   6   s4   






zVitsModel.__init__c                 C   s   i }d|j v r`|j jd ur`|  r0|j jdd d ur0|j jd dr0t|j jd |j jd< i }d|j jv rC| d|j jj|d< d|j jv rT| d|j jj|d< t	|j jfi ||d< t	|j fi || _
d S )Ng2p_target_znemo_text_processing.g2pphoneme_dictztext_tokenizer.g2p.phoneme_dict
heteronymsztext_tokenizer.g2p.heteronyms)text_tokenizerrN   _is_model_being_restoredget
startswithr   register_artifactrP   rQ   r   r1   )rH   r"   text_tokenizer_kwargs
g2p_kwargsrL   rL   rM   r2   _   s.   
zVitsModel._setup_tokenizerTtextreturnc                 C   s   | j rtd |r| jd ur| j|fi | j}t }t| jdr*| jj	dd}| | j
|}W d    n1 s=w   Y  t| d| jS )Nz+parse() is meant to be called in eval mode.set_phone_prob      ?)probr   )trainingr   warningr.   r/   
contextlibnullcontexthasattrr1   r[   encodetorchtensorlong	unsqueezetodevice)rH   rY   	normalizeeval_phon_moder4   rL   rL   rM   parse~   s   
zVitsModel.parsec           	      C   s   | j j }t|d |dd }t|d t|| j d}t|| j	 d}|d urv|j
dkrHtjjj||jd}tjjj||jd}n|j
dkr`t||j|jd}t||j|jd}ntd	|d
d}|d
d}||g||gfS ||gS )NFschedT)paramsExponentialLR)gammar   )	optimizer	max_stepsmin_lrzUnknown optimizer.step)	schedulerinterval)_cfgoptimcopyr   
set_structpopr   rD   
parametersrF   namerd   lr_schedulerro   lr_decayr   rr   rs   
ValueError)	rH   optim_configsched_configoptim_goptim_dscheduler_dscheduler_gscheduler_g_dictscheduler_d_dictrL   rL   rM   configure_optimizers   s@   



zVitsModel.configure_optimizers)BT_text)r   )optional)r4   speakersnoise_scalelength_scalenoise_scale_wmax_len)input_typesr&   r\     c              	   C   s^   t |dgt|j}| jj|||||||d\}}	}
\}}}}||	|
||||ffS )N)r   r   r   r   r   )rd   re   sizerh   intri   rD   infer)rH   r4   r   r   r   r   r   text_len
audio_predattny_maskzz_pm_plogs_prL   rL   rM   forward   s    	zVitsModel.forwardc           2   
   C   s  d }t | jjjv r|\}}}}}n|\}}}}| j||dd\}}	tdd | ||||	|\}
}}}}}\}}}}}}W d    n1 sGw   Y  |
 }
| j|
d|dd\}}t	|
d|| jj | jj}| j|d|dd\}}tdd | ||
 \}}}}W d    n1 sw   Y  tdd | j||d\}}}|}W d    n1 sw   Y  |  \}} |   | | t| j d }!|   tdd | ||
\}}}"}#W d    n1 sw   Y  tdd@ t| }$t||| jj }%| j|||||d| jj }&| j|"|#d}'| j |d	\}(})|(|' |% |$ |& }*W d    n	1 s>w   Y  |  | |* t| j d }+|  | ! },|,d ur|,\}-}.| j"j#rst$|-tj%j&j'syt$|-t(r|-  |.  |(|'|%|$|&|*||+|!d
	}/t)|)D ]\}0}1|1|/d|0 < qt)|D ]\}0}1|1|/d|0 < qt)|D ]\}0}1|1|/d|0 < q| j*|/ddd d S )NT)linear_spec)enabledr&   F)disc_real_outputsdisc_generated_outputs)r   logs_qr   r   z_mask)fmap_rfmap_g)disc_outputs)	loss_genloss_fmloss_melloss_durloss_klloss_gen_allloss_disc_allgrad_gen	grad_discloss_gen_i_loss_disc_r_loss_disc_g_)on_step	sync_dist)+r   	_train_dlr;   sup_data_types_setr<   r   rD   floatsqueezer   rg   r"   rC   rw   r)   rF   detachr>   
optimizers	zero_gradmanual_backwardr   r|   rt   rd   sumFl1_lossc_melr@   c_klr=   r?   lr_schedulersr#   is_last_batch
isinstancerx   r~   ro   r   	enumeratelog_dict)2rH   batch	batch_idxr   audio	audio_lenrY   r   specspec_lengthsr   l_lengthr   	ids_slice	text_maskr   r   r   r   r   m_qr   audio_pred_mel_	audio_mel	y_d_hat_r	y_d_hat_g	loss_disclosses_disc_rlosses_disc_gr   r   r   norm_dr   r   r   r   r   r   r   
losses_genr   norm_g
schedulerssch1sch2metricsivrL   rL   rM   training_step   s   
 
	

zVitsModel.training_stepc              
   C   s  d }| j jdkr|\}}}}}n|\}}}}| jj|||dd^}}	}
}	| }|
ddg | jjj	j
 }| ||\}}| ||\}}|dkrt| jtrtr| jj}g }g }|tjt|dd d d |d f j  ddtjt|dd d d |d f j  ddg7 }|tj|dd |d f j tj d	| jjd
tj|dd |d f j tj d| jjd
g7 }|||d d S d S d S d S )Nr&   r   )r   r%   r   val_mel_target)captionval_mel_predictedval_wav_target)r   sample_rateval_wav_predicted)specsaudios)r"   
n_speakersrD   r   r   r   rf   rw   validation_dsr;   
hop_lengthr<   r   loggerr   
HAVE_WANDB
experimentwandbImager   datacpunumpyAudiorh   rd   r   r   log)rH   r   r   r   r   r   rY   r   r   r   maskaudio_pred_lenmelmel_lengthsr   audio_pred_mel_lenr   r   r   rL   rL   rM   validation_step&  sJ   &&$$zVitsModel.validation_stepc                 C   sj   z|d d }W n t jjy   td Y d S w t|j| j| j| j	d}t
jjjd||jd|jS )Nr;   manifest_filepathz9manifest_filepath was skipped. No dataset for this model.text_normalizerr/   rR   )r;   
collate_fnrL   )	omegaconferrorsMissingMandatoryValuer   r_   r   r;   r-   r/   r1   rd   utilsr   
DataLoaderr  dataloader_params)rH   r"   r   r;   rL   rL   rM   _loaderU  s$   

zVitsModel._loaderc                 C   sZ   t | jjj| j| j| jd}t|fi | jjj}t	j
jj|f|j|d| jjj}|S )Nr  )r  batch_sampler)r   r"   r:   r;   r-   r/   r1   r	   r  rd   r  r   r	  r  r
  )rH   r;   train_sampler
dataloaderrL   rL   rM   train_dataloaderh  s    zVitsModel.train_dataloaderc                 C      |  || _d S N)r  r   rH   r"   rL   rL   rM   setup_training_data{     zVitsModel.setup_training_datac                 C   r  r  )r  _validation_dlr  rL   rL   rM   setup_validation_data~  r  zVitsModel.setup_validation_datac                 C   s   dS )zOmitted.NrL   r  rL   rL   rM   setup_test_data  s   zVitsModel.setup_test_dataList[PretrainedModelInfo]c                 C   s<   g }t ddd| d}|| t ddd| d}|| |S )Ntts_en_lj_vitszrhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_vits/versions/1.13.0/files/vits_ljspeech_fp16_full.nemozThis model is trained on LJSpeech audio sampled at 22050Hz. This model has been tested on generating female English voices with an American accent.)pretrained_model_namelocationdescriptionclass_tts_en_hifitts_vitszphttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_hifitts_vits/versions/r1.15.0/files/vits_en_hifitts.nemozThis model is trained on HiFITTS sampled at 44100Hz with and can be used to generate male and female English voices with an American accent.)r   append)clslist_of_modelsmodelrL   rL   rM   list_available_models  s    

zVitsModel.list_available_modelsr4   r   )r   T_audio)r   output_types)r   c                C   s   | ||dd  d}|S )N)r4   r   r   r&   )r   )rH   r4   r   r   rL   rL   rM   convert_text_to_waveform  s   z"VitsModel.convert_text_to_waveformr  )T)Nr&   r&   r\   r   )rZ   r  )__name__
__module____qualname__r   r8   r2   strrd   re   rl   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  classmethodr#  r   r&  __classcell__rL   rL   rJ   rM   r!   4   s<    )(

Z/r!   )8r`   r  rd   hydra.utilsr   lightning.pytorchr   lightning.pytorch.loggersr   r   r   torch.cuda.ampr   torch.nnr   r   !nemo.collections.tts.data.datasetr	   'nemo.collections.tts.losses.vits_lossesr
   r   r   r    nemo.collections.tts.models.baser   )nemo.collections.tts.modules.vits_modulesr   (nemo.collections.tts.parts.utils.helpersr   r   r   r   )nemo.collections.tts.torch.tts_data_typesr   nemo.core.classes.commonr   r   nemo.core.neural_types.elementsr   r   r   r   r   "nemo.core.neural_types.neural_typer   nemo.core.optim.lr_schedulerr   
nemo.utilsr   r   "nemo.utils.decorators.experimentalr    r   r   ModuleNotFoundErrorr!   rL   rL   rL   rM   <module>   s:   