o
    wiD                     @   s<  d dl Z d dlmZ d dlZd dlm  mZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z- dZ.zd dl/Z/W n e0y   dZ.Y nw G dd deeZ1dS )    N)Dict)instantiate)WandbLogger)
DictConfig	OmegaConf	open_dict)DiscriminatorLossGeneratorLoss)MultiResolutionSTFTLoss)Vocoder)MultiPeriodDiscriminatorMultiResolutionDiscriminator)get_batch_sizeget_num_workersplot_spectrogram_to_numpy)
Exportable)PretrainedModelInfo	typecheck)AudioSignalMelSpectrogramType)
NeuralType)compute_max_stepsprepare_lr_scheduler)loggingmodel_utilsTFc                       s  e Zd ZdZd:deddf fddZdd	 Zed
d Zdd Z	e
 dd Ze
dede idede idd;ddZdd Zdd Zdd Zd<d!ed"efd#d$Zd%d& Zd'd( Zd)d* Zed=d,d-Zd.d/ Zed0d1 Zed2d3 Zd>d6d7Zd8d9 Z   Z!S )?UnivNetModelzeUnivNet model (https://arxiv.org/abs/2106.07889) that is used to generate audio from mel spectrogram.NcfgtrainerTrainerc                    s>  t |}t |}t j||d t|j| _t|jd dd| _t|j	|jj
|jjd| _	t|jjd|v r9|jndd| _t|jjd|v rI|jndd| _t | _t | _|jjj| _dd	 | jD | _d
d	 | jD | _dd	 | jD | _t| j| j| j| _|j| _| jjj| _d | _ d| _!| j"r| j"j#j$| _!d| _%d S )N)r   r   T)highfreq	use_grads)n_mel_channels
hop_lengthdebugF)r#   c                 S      g | ]}|d  qS )r    .0resr%   r%   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/models/univnet.py
<listcomp>C       z)UnivNetModel.__init__.<locals>.<listcomp>c                 S   r$   )   r%   r&   r%   r%   r)   r*   D   r+   c                 S   r$   )   r%   r&   r%   r%   r)   r*   E   r+   )&r   #convert_model_config_to_dict_configmaybe_update_config_versionsuper__init__r   preprocessoraudio_to_melspec_precessortrg_melspec_fn	generatornfiltn_window_strider   discriminatormpdr#   r   mrdr   discriminator_lossr	   generator_lossresolutionsstft_resolutions	fft_sizes	hop_sizeswin_lengthsr
   mrstft_loss	stft_lamb_cfgsample_rate	stft_biasinput_as_mel	_train_dldatasetload_precomputed_melautomatic_optimization)selfr   r   	__class__r%   r)   r1   .   s0   

  
zUnivNetModel.__init__c              	   C   s:   t | jj| jj| jjt| jt| jj	t
| j| jjdS )N)
max_epochsaccumulate_grad_batcheslimit_train_batchesnum_workersnum_samples
batch_size	drop_last)r   rD   rO   r   rP   rQ   r   lenrH   rI   r   rU   rL   r%   r%   r)   _get_max_stepsR   s   
zUnivNetModel._get_max_stepsc                 C   s<   |d ur|d urt d|d ur|S |d ur||  S t d)Nz5Either use warmup_steps or warmup_ratio for schedulerz2Specify warmup_steps or warmup_ratio for scheduler)
ValueError	max_stepswarmup_stepswarmup_ratior%   r%   r)   get_warmup_steps]   s   zUnivNetModel.get_warmup_stepsc           	      C   s6  | j j }t|d |dd }t|d |d u r%d| j v r%| j j}t|| j	 d}t|t
| j	 | j	 d}|d ur| j dd }|d u sQ|dk rU|  }tj||dd |dd d	}t|d ||d< ||d< |dd  t|d t||| jd
}t||| jd
}||g||gfS ||gS )NFschedT)paramsr[   r   r\   r]   rZ   )	optimizerscheduler_configtrain_dataloader)rD   optimcopyr   
set_structpopr_   r   r5   
parameters	itertoolschainr:   r9   getrX   r   r^   r   rH   )	rL   optim_configsched_configoptim_goptim_dr[   r\   scheduler_gscheduler_dr%   r%   r)   configure_optimizersj   sH   

z!UnivNetModel.configure_optimizersc                C      | j |dS z^
        Runs the generator, for inputs and outputs see input_types, and output_types
        xr5   rL   specr%   r%   r)   forward   s   zUnivNetModel.forwardry   )BCTaudio)r{   r}   )input_typesoutput_typestorch.tensorreturnc                 C   s   | |d dS )Nry   r,   )squeezerx   r%   r%   r)   convert_spectrogram_to_audio   s   z)UnivNetModel.convert_spectrogram_to_audioc                 C   s  | j r	|\}}}n|\}}| ||\}}|d}| j|d}| |d|\}}|  \}	}
|
  | j||	 d\}}}}| j
||d\}}}| j||	 d\}}}}| j
||d\}}}|| }| | |
  |	  | j|d|d|d\}}t| }t| }|| | j }| j||d\}}}}| j||d\}}}}| j|d\}}| j|d\}}|| | }| | |	  |||||||||| j|	jd d d	}| j|d
d
d | jd|d
dd
d d S )Nr,   ru   )yy_hat)disc_real_outputsdisc_generated_outputs)rv   r   input_lengths)disc_outputsr   lr)	g_loss_sc
g_loss_magg_loss_mrstftg_loss_gen_mpdg_loss_gen_mrdg_loss
d_loss_mpd
d_loss_mrdd_lossglobal_stepr   T)on_step	sync_distg_mrstft_lossF)prog_barloggerr   )rG   r3   	unsqueezer5   r4   r   
optimizers	zero_gradr9   detachr;   r:   manual_backwardsteprB   torchstackmeanrC   r<   r   param_groupslog_dictlog)rL   batch	batch_idxr~   	audio_len	audio_mel_
audio_predaudio_pred_melrn   ro   mpd_score_realmpd_score_genloss_disc_mpdmrd_score_realmrd_score_genloss_disc_mrdloss_dloss_scloss_magloss_mrstftloss_gen_mpdloss_gen_mrdloss_gmetricsr%   r%   r)   training_step   s\   

 
zUnivNetModel.training_stepc                 C   s  | j r|\}}}|jd g|jd  }n|\}}| ||\}}| |d}| ||d}| ||\}	}
| j rA| ||\}}| |d|\}}
t||}| jd|iddd |dkrPt| j	t
rRtrTg }g }ttd|jd D ]}|tj||d || f j  d| | jd	tj||dd || f j  d
d| | jd	tj||d || f j  d| | jd	g7 }|tjt||d d d || f j  d| dtjt||d d d || f j  d| dtjt|	|d d d || f j  d| dg7 }| j rC|tjt||d d d || f j  d| dg7 }qy| j	j||d d S d S d S d S )Nr,   r   r   val_lossT)on_epochr      zreal audio )captionrE   float32zgenerated audio zdenoised audio z
input mel )r   zoutput mel zdenoised mel zgt mel )r~   specs)rG   shaper3   _bias_denoiser   Fl1_lossr   
isinstancer   r   
HAVE_WANDBrangeminwandbAudiodatacpunumpyrE   astypeImager   
experimentr   )rL   r   r   r~   r   r   audio_mel_lenr   pred_denoisedpred_denoised_melr   gt_mel
gt_mel_lenr   loss_melclipsr   ir%   r%   r)   validation_step   st   

$&&&&zUnivNetModel.validation_stepc           
      C   s   dd }dd }| j d u s| j jd |jd kr@| tj||jdd}||\| _ }| j d d d d df d d d d d f | _ ||\}}|| jdd	| j   }t|d
}|||d}	|	S )Nc                 S   s`   t j| dddddd}t |}|d |d }}t |d |d  }t ||}||fS )	Nr,         T)n_fftr"   
win_lengthreturn_complex).r   ).r,   r-   )r   stftr   view_as_realsqrtatan2)rv   comprealimagmagsphaser%   r%   r)   r   '  s   
z(UnivNetModel._bias_denoise.<locals>.stftc                 S   sB   t j| t | | t | gdd}t jt |dddd}|S )N)dimr   r   )r   r"   r   )r   r   cossinistftview_as_complex)r   r   r   rv   r%   r%   r)   r   /  s   &z)UnivNetModel._bias_denoise.<locals>.istftr   )devicer   denoise_strengthg{Gzd?g        r,   )	rF   r   r   
zeros_liker   r   rk   clampr   )
rL   r~   melr   r   
audio_biasr   
audio_magsaudio_phaseaudio_denoisedr%   r%   r)   r   &  s    ,zUnivNetModel._bias_denoiseTtrainshuffle_should_benamec                 C   s  d|vs
t |jtstd| d|vst |jts"td| |rad|jvrPtd|  d| d t|d  d	|j_W d    n1 sJw   Y  n#|jjs`t	d
| d|  d n|ss|jjrst	d
| d|  d t
|j}tjjj|fd|ji|jS )NrI   zNo dataset for dataloader_paramszNo dataloder_params for shufflez"Shuffle should be set to True for z's zE dataloader but was not found in its config. Manually setting to TrueTzThe z dataloader for z has shuffle set to False!!!z has shuffle set to True!!!
collate_fn)r   rI   r   rY   r   r   warningr   r   errorr   r   utilsr   
DataLoaderr   )rL   r   r   r   rI   r%   r%   r)   __setup_dataloader_from_configA  s(   


z+UnivNetModel.__setup_dataloader_from_configc                 C   s   |  || _d S N)+_UnivNetModel__setup_dataloader_from_configrH   rL   r   r%   r%   r)   setup_training_dataV  s   z UnivNetModel.setup_training_datac                 C   s   | j |ddd| _d S )NF
validation)r   r   )r  _validation_dlr  r%   r%   r)   setup_validation_dataY  s   z"UnivNetModel.setup_validation_datac                 C   s   d S r  r%   r  r%   r%   r)   setup_test_data\  s   zUnivNetModel.setup_test_dataOptional[Dict[str, str]]c                 C   s<   g }t ddd| d}|| t ddd| d}|| |S )Ntts_en_lj_univnetznhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_univnet/versions/1.7.0/files/tts_en_lj_univnet.nemozThis model is trained on LJSpeech sampled at 22050Hz, and has been tested on generating female English voices with an American accent.)pretrained_model_namelocationdescriptionclass_tts_en_libritts_univnetzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_libritts_univnet/versions/1.7.0/files/tts_en_libritts_multispeaker_univnet.nemozThis model is trained on all LibriTTS training data (train-clean-100, train-clean-360, and train-other-500) sampled at 22050Hz, and has been tested on generating English voices.)r   append)clslist_of_modelsmodelr%   r%   r)   list_available_models_  s    

z"UnivNetModel.list_available_modelsc                 K   s4   | j d urz| j   W d S  ty   Y d S w d S r  )r5   remove_weight_normrY   )rL   kwargsr%   r%   r)   _prepare_for_exportu  s   
z UnivNetModel._prepare_for_exportc                 C   s   dt dt iS )Nry   )r{   Dr}   )r   r   rW   r%   r%   r)   r   |  s   zUnivNetModel.input_typesc                 C   s   dt dt| jiS )Nr~   )r{   Sr}   )r   r   rE   rW   r%   r%   r)   r     s   zUnivNetModel.output_typesr,   r   c                 C   s:   t |  }tj|| jd d |f|j|jd}d|ifS )zs
        Generates input examples for tracing etc.
        Returns:
            A tuple of input examples.
        r2   r6   )r   dtypery   )nextrh   r   randnr   r   r  )rL   	max_batchmax_dimparr   r%   r%   r)   input_example  s   $
zUnivNetModel.input_examplec                 C   rs   rt   rw   rx   r%   r%   r)   forward_for_export  s   zUnivNetModel.forward_for_exportr  )ry   r   r   r   )Tr   )r   r  )r,   r   )"__name__
__module____qualname____doc__r   r1   rX   staticmethodr^   rr   r   rz   r   r   r   r   r   r   r   boolstrr  r  r  r  classmethodr  r  propertyr   r   r$  r%  __classcell__r%   r%   rM   r)   r   +   s<    $
1
;B



r   )2ri   typingr   r   torch.nn.functionalnn
functionalr   hydra.utilsr   lightning.pytorch.loggers.wandbr   	omegaconfr   r   r   *nemo.collections.tts.losses.hifigan_lossesr   r	   &nemo.collections.tts.losses.stftlossesr
    nemo.collections.tts.models.baser   ,nemo.collections.tts.modules.univnet_modulesr   r   (nemo.collections.tts.parts.utils.helpersr   r   r   	nemo.corer   nemo.core.classes.commonr   r   nemo.core.neural_types.elementsr   r   "nemo.core.neural_types.neural_typer   nemo.core.optim.lr_schedulerr   r   
nemo.utilsr   r   r   r   ModuleNotFoundErrorr   r%   r%   r%   r)   <module>   s2   