o
    }oiMe                     @   s@  d dl Z d dlmZ d dlZd dlm  mZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z. dZ/zd dl0Z0W n e1y   dZ/Y nw G dd dee Z2dS )    N)Path)instantiate)WandbLogger)
DictConfig	OmegaConf	open_dict)DiscriminatorLossFeatureMatchingLossGeneratorLoss)Vocoder)MultiPeriodDiscriminatorMultiScaleDiscriminator)LoggingCallback)get_batch_sizeget_num_workersplot_spectrogram_to_numpy)
Exportable)PretrainedModelInfo	typecheck)AudioSignalMelSpectrogramType)
NeuralType)compute_max_stepsprepare_lr_scheduler)loggingmodel_utilsTFc                       sN  e Zd ZdZdIdeddf fddZedd	 Zed
d Z	dd Z
dJddZe dd Zedede idede iddKddZdd ZdLddZd d! Zd"d# Zd$d% Zd&d' Zd(d) ZdMd,ed-efd.d/Zd0d1 Zd2d3 Zd4d5 Zd6d7 Ze dNd9d:Z!dO fd;d<	Z"d=d> Z#ed?d@ Z$edAdB Z%dPdEdFZ&dGdH Z'  Z(S )QHifiGanModelzp
    HiFi-GAN model (https://arxiv.org/abs/2010.05646) that is used to generate audio from mel spectrogram.
    NcfgtrainerTrainerc                    s
  t |}t |}|jjj| _t j||d t	|j
| _t	|j
d dd| _t	|j| _td|v r5|jndd| _td|v rB|jndd| _t | _t | _t | _|dd| _| jj
j| _d | _d| _| jro| jjj | _|d	d| _!|d
d | _"d | _#d| _$d S )N)r   r   T)highfreq	use_gradsdebugF)r"   l1_loss_factor-   	log_audio
log_config)%r   #convert_model_config_to_dict_configmaybe_update_config_versiontrain_dsdataset_target_ds_classsuper__init__r   preprocessoraudio_to_melspec_precessortrg_melspec_fn	generatorr   r"   mpdr   msdr	   feature_lossr   discriminator_lossr
   generator_lossget	l1_factor_cfgsample_rate	stft_biasinput_as_mel	_train_dlload_precomputed_melr%   r&   lr_schedule_intervalautomatic_optimization)selfr   r   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/hifigan.pyr.   0   s,   


zHifiGanModel.__init__c              	   C   s|   d| j v r| j dS d| j vrtdd| j v r!| j j| j j S t| j j| jj| jjt	| jt
| jjt| j| jjdS )N	max_steps
max_epochsz)Must specify 'max_steps' or 'max_epochs'.steps_per_epoch)rH   accumulate_grad_batcheslimit_train_batchesnum_workersnum_samples
batch_size	drop_last)r:   r8   
ValueErrorrH   rI   r   r   rJ   rK   r   lenr>   r*   r   rO   rB   rE   rE   rF   rG   R   s   



zHifiGanModel.max_stepsc                 C   s    |d ur|S |d ur||  S d S NrE   rG   warmup_stepswarmup_ratiorE   rE   rF   get_warmup_stepsg   s
   zHifiGanModel.get_warmup_stepsc                 C   s  | j j }t|d |dd }t|d | j }t	| j
 | j }t||d}t||d}|d u r=||gS | j}| j||dd |dd d}t|d ||d< |rf||d< |dd  t|d t||| jd	}	t||| jd	}
|	d
 | _||g|	|
gfS )NFschedT)paramsrU   rV   rT   rG   )	optimizerscheduler_configtrain_dataloaderinterval)r:   optimcopyr   
set_structpopr2   
parameters	itertoolschainr4   r3   r   rG   rW   r8   r   r>   r@   )rB   optim_configsched_config
gen_paramsdisc_paramsoptim_goptim_drG   rU   scheduler_gscheduler_drE   rE   rF   configure_optimizersq   s<   



z!HifiGanModel.configure_optimizersstepc                 C   s>   |   }|d ur| j|kr|\}}|  |  d S d S d S rS   )lr_schedulersr@   rn   )rB   r]   
schedulerssch1sch2rE   rE   rF   	update_lr   s   zHifiGanModel.update_lrc                C      | j |dS z^
        Runs the generator, for inputs and outputs see input_types, and output_types
        xr2   rB   specrE   rE   rF   forward   s   zHifiGanModel.forwardrz   )BCTaudio)r|   r~   )input_typesoutput_typestorch.tensorreturnc                 C   s   | |d dS )Nrz      )squeezery   rE   rE   rF   convert_spectrogram_to_audio   s   z)HifiGanModel.convert_spectrogram_to_audioc                 C   s  |  |\}}}}| ||\}}|d}| j|d}| |d|\}	}|  \}
}|  | j|| d\}}}}| j	||d\}}}| j
|| d\}}}}| j	||d\}}}|| }| | |  |
  t|	|}| j||d\}}}}| j
||d\}}}}| j||d}| j||d}| j|d\}}| j|d\}}|| | | || j  }| | |
  |   ||||||||| j|
jd d d	
}| j|d
d
d | jd|d
dd
d d S )Nr   rv   )yy_hat)disc_real_outputsdisc_generated_outputs)fmap_rfmap_g)disc_outputsr   lr)
g_loss_fm_mpdg_loss_fm_msdg_loss_gen_mpdg_loss_gen_msdg_loss
d_loss_mpd
d_loss_msdd_lossglobal_stepr   T)on_step	sync_dist	g_l1_lossF)prog_barloggerr   )_process_batchr1   	unsqueezer2   r   
optimizers	zero_gradr3   detachr6   r4   manual_backwardrn   Fl1_lossr5   r7   r9   rs   r   param_groupslog_dictlog)rB   batch	batch_idxr   	audio_len	audio_mel_audio_trg_mel
audio_predaudio_pred_melri   rj   mpd_score_realmpd_score_genloss_disc_mpdmsd_score_realmsd_score_genloss_disc_msdloss_dloss_melfmap_mpd_realfmap_mpd_genfmap_msd_realfmap_msd_genloss_fm_mpdloss_fm_msdloss_gen_mpdloss_gen_msdloss_gmetricsrE   rE   rF   training_step   sV   


zHifiGanModel.training_stepc                 C   s   |  d d S )Nepoch)rs   rR   rE   rE   rF   on_train_epoch_end   s   zHifiGanModel.on_train_epoch_endc                 C   s  |  |\}}}}| |d}| jr| ||\}}	| |d|\}
}t||
}| jd|iddd | jr=|dkr?t| j	t
rAtrC| ||d}| ||\}}g }g }ttd|jd D ]}|tj||d || f j  d| | jd	tj||dd || f j  d
d| | jd	tj||d || f j  d| | jd	g7 }|tjt||d d d || f j  d| dtjt|
|d d d || f j  d| dtjt||d d d || f j  d| dg7 }| jr0|tjt||d d d || f j  d| dg7 }qf| j	j||d d S d S d S d S d S )Nr   r   val_lossT)on_epochr   r      zreal audio )captionr;   float32zgenerated audio zdenoised audio z
input mel )r   zoutput mel zdenoised mel zgt mel )r   specs)r   r=   r0   r   r   r   r   r%   
isinstancer   r   
HAVE_WANDB_bias_denoiserangeminshapewandbAudiodatacpunumpyr;   astypeImager   
experimentr   )rB   r   r   r   r   r   audio_mel_lenr   gt_mel
gt_mel_lenr   r   r   pred_denoisedpred_denoised_melclipsr   irE   rE   rF   validation_step   sl   
&$&&&&zHifiGanModel.validation_stepc                 C   sv   | j r|\}}}|jd g|jd  }||||fS | jdkr)|d}|d}n|\}}| ||\}}||||fS )Nr   r   8nemo.collections.tts.data.vocoder_dataset.VocoderDatasetr   
audio_lens)r=   r   r,   r8   r0   )rB   r   r   r   r   r   rE   rE   rF   r   (  s   


zHifiGanModel._process_batchc           
      C   s   dd }dd }| j d u s| j jd |jd kr@| tj||jdd}||\| _ }| j d d d d df d d d d d f | _ ||\}}|| jdd	| j   }t|d
}|||d}	|	S )Nc                 S   s`   t j| dddddd}t |}|d |d }}t |d |d  }t ||}||fS )	Nr         T)n_fft
hop_length
win_lengthreturn_complex).r   ).r      )torchstftr   view_as_realsqrtatan2)rw   comprealimagmagsphaserE   rE   rF   r   8  s   
z(HifiGanModel._bias_denoise.<locals>.stftc                 S   sB   t j| t | | t | gdd}t jt |dddd}|S )N)dimr   r   )r   r   r   )r   stackcossinistftview_as_complex)r   r   r   rw   rE   rE   rF   r   @  s   &z)HifiGanModel._bias_denoise.<locals>.istftr   )devicer   denoise_strengthg{Gzd?g        r   )	r<   r   r   
zeros_liker   r   r8   clampr   )
rB   r   melr   r   
audio_biasr   
audio_magsaudio_phaseaudio_denoisedrE   rE   rF   r   7  s    ,zHifiGanModel._bias_denoisec                 C   sD   t |j}|j|jj| jjd}tjj	j
|f|j|d|j}|S )N)
world_size)
collate_fnsampler)r   r*   get_samplerdataloader_paramsrN   r   r  r   utilsr   
DataLoaderr  )rB   r   r*   r  data_loaderrE   rE   rF   _setup_train_dataloaderR  s   
z$HifiGanModel._setup_train_dataloaderc                 C   s,   t |j}tjjj|fd|ji|j}|S )Nr  )r   r*   r   r
  r   r  r  r	  )rB   r   r*   r  rE   rE   rF   _setup_test_dataloaderZ  s   
z#HifiGanModel._setup_test_dataloaderTtrainshuffle_should_benamec                 C   s  d|vs
t |jtstd| d|vst |jts"td| |rad|jvrPtd|  d| d t|d  d	|j_W d    n1 sJw   Y  n#|jjs`t	d
| d|  d n|ss|jjrst	d
| d|  d t
|j}tjjj|fd|ji|jS )Nr*   zNo dataset for r	  zNo dataloader_params for shufflez"Shuffle should be set to True for z's zE dataloader but was not found in its config. Manually setting to TrueTzThe z dataloader for z has shuffle set to False!!!z has shuffle set to True!!!r  )r   r*   r   rP   r	  r   warningr   r  errorr   r   r
  r   r  r  )rB   r   r  r  r*   rE   rE   rF   __setup_dataloader_from_config_  s(   


z+HifiGanModel.__setup_dataloader_from_configc                 C   s*   | j dkr| || _d S | || _d S )Nr   )r,   r  r>   +_HifiGanModel__setup_dataloader_from_configrB   r   rE   rE   rF   setup_training_datat  s   
z HifiGanModel.setup_training_datac                 C   s0   | j dkr| || _d S | j|ddd| _d S )Nr   F
validation)r  r  )r,   r  _validation_dlr  r  rE   rE   rF   setup_validation_dataz  s   
z"HifiGanModel.setup_validation_datac                 C   s   d S rS   rE   r  rE   rE   rF   setup_test_data  s   zHifiGanModel.setup_test_datac              
   C   s   | j sg S | j jj}|dkrtd| | | j }t| j j}| j jr+t| j jnd }t	||| j j
| j j|| jj| j j| j jd}|gS )Nr   z6Sample logging only supported for VocoderDataset, got )
generatorsr  
log_epochsepoch_frequency
output_dirloggerslog_tensorboard	log_wandb)r&   r*   r+   rP   r  r   r  log_dirr   r   r  r  r   r!  r"  r#  )rB   sample_ds_classr  r  r$  log_callbackrE   rE   rF   configure_callbacks  s&   
z HifiGanModel.configure_callbacksOptional[Dict[str, str]]c                 C   s   g }t ddd| d}|| t ddd| d}|| t dd	d
| d}|| t ddd| d}|| t ddd| d}|| t ddd| d}|| t ddd| d}|| t ddd| d}|| t ddd| d}|| |S )Ntts_en_hifiganzehttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_hifigan/versions/1.0.0rc1/files/tts_hifigan.nemozThis model is trained on LJSpeech audio sampled at 22050Hz and mel spectrograms generated from Tacotron2, TalkNet, and FastPitch. This model has been tested on generating female English voices with an American accent.)pretrained_model_namelocationdescriptionclass_tts_en_lj_hifigan_ft_mixerttszzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_hifigan/versions/1.6.0/files/tts_en_lj_hifigan_ft_mixertts.nemozThis model is trained on LJSpeech audio sampled at 22050Hz and mel spectrograms generated from Mixer-TTS. This model has been tested on generating female English voices with an American accent.tts_en_lj_hifigan_ft_mixerttsxz{https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_hifigan/versions/1.6.0/files/tts_en_lj_hifigan_ft_mixerttsx.nemozThis model is trained on LJSpeech audio sampled at 22050Hz and mel spectrograms generated from Mixer-TTS-X. This model has been tested on generating female English voices with an American accent.#tts_en_hifitts_hifigan_ft_fastpitchzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_multispeaker_fastpitchhifigan/versions/1.10.0/files/tts_en_hifitts_hifigan_ft_fastpitch.nemozThis model is trained on HiFiTTS audio sampled at 44100Hz and mel spectrograms generated from FastPitch. This model has been tested on generating male and female English voices with an American accent.1tts_de_hifigan_singleSpeaker_thorstenNeutral_2102zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_hifigan_thorstens2102.nemoa  This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_lj_hifigan_ft_mixerttsx` by the mel-spectrograms generated from the FastPitch checkpoint `tts_de_fastpitch_singleSpeaker_thorstenNeutral_2102`. This model has been tested on generating male German neutral voices.1tts_de_hifigan_singleSpeaker_thorstenNeutral_2210zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_hifigan_thorstens2210.nemoa  This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_lj_hifigan_ft_mixerttsx` by the mel-spectrograms generated from the FastPitch checkpoint `tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210`. This model has been tested on generating male German neutral voices..tts_de_hui_hifigan_ft_fastpitch_multispeaker_5zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_hui_hifigan_ft_fastpitch_multispeaker_5.nemoa  This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_hifitts_hifigan_ft_fastpitch` by the mel-spectrograms generated from the FastPitch checkpoint `tts_de_fastpitch_multispeaker_5`. This model has been tested on generating male and female German voices.(tts_es_hifigan_ft_fastpitch_multispeakerzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.15.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemoa2  This model is trained on the audio from 6 crowdsourced Latin American Spanish OpenSLR datasets and finetuned on the mel-spectrograms generated from the FastPitch checkpoint `tts_es_fastpitch_multispeaker`. This model has been tested on generating male and female Spanish voices with Latin American accents.tts_zh_hifigan_sfspeechzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_hifigan_sfspeech.nemoa  This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_lj_hifigan_ft_mixerttsx` by the mel-spectrograms generated from the FastPitch checkpoint `tts_zh_fastpitch_sfspeech`. This model has been tested on generating female Mandarin Chinese voices.)r   append)clslist_of_modelsmodelrE   rE   rF   list_available_models  s   







	
z"HifiGanModel.list_available_modelsc              	      s   i }t | jd d }| D ]9\}}|}d|v rD|d}t |dkrDt|d }	|	|  d|	|  }
d|
 dd|dd   }|||< qt j||d	 d S )
Nr2   resblock_kernel_sizes	resblocks.   r   zgenerator.resblocks.   )strict)rQ   r   itemssplitintjoinr-   load_state_dict)rB   
state_dictr@  new_state_dictnum_resblockskvnew_kpartslayer	new_layerrC   rE   rF   rE    s   

zHifiGanModel.load_state_dictc                 K   s4   | j d urz| j   W d S  ty   Y d S w d S rS   )r2   remove_weight_normrP   )rB   kwargsrE   rE   rF   _prepare_for_export  s   
z HifiGanModel._prepare_for_exportc                 C   s   dt dt iS )Nrz   )r|   Dr~   )r   r   rR   rE   rE   rF   r     s   zHifiGanModel.input_typesc                 C   s   dt dt| jiS )Nr   )r|   Sr~   )r   r   r;   rR   rE   rE   rF   r     s   zHifiGanModel.output_typesr   r   c                 C   s:   t |  }tj|| jd d |f| j|jd}d|ifS )zs
        Generates input examples for tracing etc.
        Returns:
            A tuple of input examples.
        r/   nfilt)r   dtyperz   )nextrb   r   randnr   r   rU  )rB   	max_batchmax_dimparr   rE   rE   rF   input_example"  s   $
zHifiGanModel.input_examplec                 C   rt   ru   rx   ry   rE   rE   rF   forward_for_export,  s   zHifiGanModel.forward_for_exportrS   )rn   )rz   r   r   r   )r   N)Tr  )r   r(  )T)r   r   ))__name__
__module____qualname____doc__r   r.   propertyrG   staticmethodrW   rm   rs   r   r{   r   r   r   r   r   r   r   r   r   r  r  boolstrr  r  r  r  r'  classmethodr:  rE  rQ  r   r   r[  r\  __classcell__rE   rE   rC   rF   r   +   sL    "

	
)

8>b



r   )3rc   pathlibr   r   torch.nn.functionalnn
functionalr   hydra.utilsr   lightning.pytorch.loggers.wandbr   	omegaconfr   r   r   *nemo.collections.tts.losses.hifigan_lossesr   r	   r
    nemo.collections.tts.models.baser   ,nemo.collections.tts.modules.hifigan_modulesr   r   *nemo.collections.tts.parts.utils.callbacksr   (nemo.collections.tts.parts.utils.helpersr   r   r   nemo.core.classesr   nemo.core.classes.commonr   r   nemo.core.neural_types.elementsr   r   "nemo.core.neural_types.neural_typer   nemo.core.optim.lr_schedulerr   r   
nemo.utilsr   r   r   r   ModuleNotFoundErrorr   rE   rE   rE   rF   <module>   s2   