o
    wi M                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZ d dlmZ d dlmZmZmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) e)G dd deeZ*dS )    N)instantiate)Trainer)TensorBoardLogger)
DictConfig	OmegaConf)BaseTokenizer)AttentionBinarizationLoss
RADTTSLoss)SpectrogramGenerator)batch_from_raggedg2p_backward_compatible_supportplot_alignment_to_numpyregulate_lensample_tts_input)
Exportable)	typecheck)IndexMelSpectrogramTypeRegressionValuesTypeTokenDurationType
TokenIndex)
NeuralType)RAdam)logging)experimentalc                
       sF  e Zd ZdAdedef fddZdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Zdd Zeede ddede ddeddddede iddBd"d#d$ed%ed&ejfd'd(Zed)d* Zd+d, ZdCd.ed&ejfd/d0Zed1d2 ZdD fd3d4	Z ed5d6 Z!ed7d8 Z" fd9d:Z#dEd=d>Z$d?d@ Z%  Z&S )FRadTTSModelNcfgtrainerc              	      s  t |tr
t|}d | _d | _i | _| | d | _| 	| | jd us'J | jj
| _| jj| _d | _d | _t j||d d| _|j| _|j| _|j| _t| jj| jj| jj| jj| jj| jj| jjd| _ t! | _"t#|j| _$d | _%d | _&|| _'d| _(d| j$j)j*fdd| jj+d| _,d S )N)r   r         ?)vpred_model_configloss_weightsFr   T)	emb_rangeenable_volumeenable_ragged_batchesnum_speakers)-
isinstancedictr   create
normalizertext_normalizer_calltext_normalizer_call_kwargs_setup_normalizer	tokenizer_setup_tokenizerpadtokenizer_padoovtokenizer_unktext_tokenizer_pad_idtokenssuper__init__feat_loss_weightmodelConfigmodel_configtrainerConfigtrain_configoptimr	   sigman_group_sizedur_model_configf0_model_configenergy_model_configv_model_configr    	criterionr   attention_kl_lossr   model_parser
_tb_loggerr   log_train_images	embeddingnum_embeddings
n_speakersexport_config)selfr   r   	__class__ _/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/models/radtts.pyr5   1   sL   






zRadTTSModel.__init__c                 C   s   t |dk r"tdg|d d  | j}|d }|d }n|d }|d }|d }|d |d |d |d |d	 |d
 |d |d |d |||d |d |d}|S )N   r      	   
                                 )audio
audio_lenstext	text_lenslog_mellog_mel_lensalign_prior_matrixpitch
pitch_lensvoiced_maskp_voicedenergyenergy_lens
speaker_id)lentorchtensorsizecudatodevice)rL   
batch_dataspk_idv_mp_vbatch_data_dictrO   rO   rP   
batch_dicta   s.   &
zRadTTSModel.batch_dictc                 C   s`  |  |}|d }|d }|d }|d }|d }|d }|d }	|d }
|d	 }| jjd
kr9| j| jjkr9d}nd}| j||||||||	||
d
}| |||}d }| D ]\}\}}|d
kro|d u ri|| n|||  }qW|r| j| jjkr| |d |d }||7 }nt	
|}|df|d< | D ]\}\}}| jd| || d
 dd qd|iS )Nrc   rl   ra   rb   rd   re   rf   rh   rj   r   TFbinarize_attention
attn_priorf0
energy_avgrh   attn	attn_softr   binarization_lossztrain/)on_steploss)ry   r:   binarization_start_iterglobal_steprD   rB   itemskl_loss_start_iterrC   rn   
zeros_likelog)rL   batch	batch_idxmelspeaker_idsra   in_lensout_lensr|   r}   rh   r~   binarizeoutputsloss_outputsr   kvwr   rO   rO   rP   training_step|   sP   


zRadTTSModel.training_stepc                 C   sx  |  |}|d }|d }|d }|d }|d }|d }|d }	|d }
|d	 }| jjd
kr9| j| jjkr9d}nd}| j|||||d|||
|	d
}| |||}d }| D ]\}\}}|d
kro|d u ri|| n|||  }qW|r| jjd
kr| j| jjkr| |d |d }||7 }nt	
|}||d< ||d
kr|d nd |d
kr|d nd |d
krdnd d}| j| |S )Nrl   ra   rb   rd   re   rf   rh   rj   rc   r   TFrz   r   r   r   audio_1)r   r   r   
audiopaths)ry   r:   r   r   rD   rB   r   r   rC   rn   r   validation_step_outputsappend)rL   r   r   r   ra   r   r   r|   r}   rh   r~   r   r   r   r   r   r   r   r   r   val_outputsrO   rO   rP   validation_step   s^   


zRadTTSModel.validation_stepc                 C   s   | j d d }| D ]\}}|dkr"| jd| || d ddd q| j d d }| j d d }| jjd	t|d
 j  j	dd| j
dd | jjdt|d
 j  j	dd| j
dd d| _| j   d S )Nr   r   r   zval/T)	sync_diston_epochr   r   attention_weights_mas)r   r   r_   )titleHWC)dataformatsattention_weights)r   r   r   	tb_logger	add_imager   datacpunumpyTr   rG   clear)rL   r   r   r   r   r   rO   rO   rP   on_validation_epoch_end   s*   z#RadTTSModel.on_validation_epoch_endc                    s   t d| jj  t| jjr2t D ]\ }t	 fdd| jjD r.t d  d|_
qd|_
q| jjdkrJtjj| j | jj| jjd}|S | jjd	kr`t| j | jj| jjd}|S t d
| jj  td |S )NzInitializing %s optimizerc                    s   g | ]}| v qS rO   rO   ).0lnamerO   rP   
<listcomp>  s    z4RadTTSModel.configure_optimizers.<locals>.<listcomp>zFine-tuning parameterTFAdam)lrweight_decayr   z<Unrecognized optimizer %s! Please choose the right optimizerrV   )r   infor;   r   rm   r:   finetune_layersrD   named_parametersanyrequires_gradrn   r   
parametersr   r   r   exit)rL   param	optimizerrO   r   rP   configure_optimizers  s$   	z RadTTSModel.configure_optimizersc                 C   sf   z|j j}W n tjjy   td Y d S w t|j | j| j	| j
d}tjjjd||jd|jS )Nz9manifest_filepath was skipped. No dataset for this model.)text_normalizerr*   text_tokenizer)dataset
collate_fnrO   )r   manifest_filepath	omegaconferrorsMissingMandatoryValuer   warningr   r(   r*   r,   rn   utilsr   
DataLoaderr   dataloader_params)rL   r   _r   rO   rO   rP   _loader  s$   

zRadTTSModel._loaderc                 C      |  || _d S N)r   	_train_dlrL   r   rO   rO   rP   setup_training_data0     zRadTTSModel.setup_training_datac                 C   r   r   )r   _validation_dlr   rO   rO   rP   setup_validation_data3  r   z!RadTTSModel.setup_validation_datac                 C   s   dS )zOmitted.NrO   r   rO   rO   rP   setup_test_data6  s   zRadTTSModel.setup_test_dataBT_textToptionalr   )r3   speakerr<   spectr   DT_spec)input_typesoutput_typesr   r   r3   ztorch.tensorr   r<   returnc                 C   sR   |    | jrtd t|g  | j	}| j
j|||d}|d }|S )Nz:generate_spectrogram() is meant to be called in eval mode.)r<   r   )evaltrainingr   r   rn   ro   longrq   rr   rs   rD   infer)rL   r3   r   r<   r   r   rO   rO   rP   generate_spectrogram:  s   
z RadTTSModel.generate_spectrogramc                 C   s   | j d ur| j S | j S r   )rE   rL   rO   rO   rP   parserN  s   
zRadTTSModel.parserc                 C   s  i }d|j v rZ|  r*|j jdd d ur*|j jd dr*t|j jd |j jd< i }d|j jv r=| d|j jj|d< d|j jv rN| d|j jj|d< t	|j jfi ||d< t	|j fi || _
t| j
trv| j
j| _| j
j| _d S td u r~tdtd u rtd	t| _t| _d S )
Ng2p_target_znemo_text_processing.g2pphoneme_dictztext_tokenizer.g2p.phoneme_dict
heteronymsztext_tokenizer.g2p.heteronymszNtext_tokenizer_pad_id must be specified if text_tokenizer is not BaseTokenizerz?tokens must be specified if text_tokenizer is not BaseTokenizer)r   _is_model_being_restoredr   get
startswithr   register_artifactr   r   r   r,   r%   r   r.   r2   r3   
ValueError)rL   r   text_tokenizer_kwargs
g2p_kwargsrO   rO   rP   r-   T  s@   



zRadTTSModel._setup_tokenizerFra   c                 C   s   | j rtd |r| jd ur| j|fi | j}t }t| jdr.| jj	dd}t
d | | j|}W d    n1 sAw   Y  t
d t| d | jS )Nz+parse() is meant to be called in eval mode.set_phone_probrV   )probzchanged to oneztext to token phone_probr   )r   r   r   r)   r*   
contextlibnullcontexthasattrr,   r   printencodern   ro   r   	unsqueezerq   rr   rs   )rL   ra   	normalizeeval_phon_moder3   rO   rO   rP   parse  s   
 zRadTTSModel.parsec                 C   sZ   | j d u r*| jd u r| jjd u rd S | jj}| jjD ]}t|tr&|j} nq|| _ | j S r   )rF   logger
experimentr   loggersr%   r   )rL   r   r   rO   rO   rP   r     s   

zRadTTSModel.tb_loggerc                    sJ   i }|  D ]\}}|dd}|dd}|||< qt j||d d S )Nzprojection_fn.weightzprojection_fn.conv.weightzprojection_fn.biaszprojection_fn.conv.bias)strict)r   replacer4   load_state_dict)rL   
state_dictr  new_state_dictr   r   rM   rO   rP   r    s   
zRadTTSModel.load_state_dictc                 C      | j S r   )_input_typesr   rO   rO   rP   r        zRadTTSModel.input_typesc                 C   r  r   )_output_typesr   rO   rO   rP   r     r	  zRadTTSModel.output_typesc                    s   | j   t jdi | | jd rdnd}t|t tdtdt tdt tdt t|t t|d| _	tdt
 tdt tdt d| _| jd	 rft|d
d| j	d< tdt | jd< d S d S )Nr#   r   )r   r   r   )ra   batch_lengthsrl   speaker_id_textspeaker_id_attributesrf   pacer   r   )r   
num_framesdurs_predictedr"   Tr   volume)r   r   volume_alignedrO   )rD   remove_normsr4   _prepare_for_exportrK   r   r   r   r   r  r   r   r
  )rL   kwargstensor_shaperM   rO   rP   r    s&   










zRadTTSModel._prepare_for_exportrV     c           	   	   C   s   t | j }t| j|j||d}|d}|d }| jj}|dkr&|d n|d |||k< |	|||d ||d ||||d |d	 |d
 d}|fS )N)	max_batchmax_dimr   ra   r   rV   )rl   r  r  r  rf   r  r  )ra   r  rl   r  r  rf   r  r  )
nextrD   r   r   rK   rs   popr,   r.   update)	rL   r  r  parinputsr   inppad_id
new_inputsrO   rO   rP   input_example  s,   
 zRadTTSModel.input_examplec	                 C   s   | j d rt||||| j|d\}}}}	}
|d ur|	}n|jtjd}
| jj||||ddd|
||d
 \}}}}}|	 ||	 f}|d ur||	 }t
|
}t||d d d |f d|d d d |f | jj|
d\}}|d		 }||f }|S )
Nr#   )r  padding_idxr  )dtypegffffff?g        )r  r  r<   f0_meanf0_stdr   pitch_shiftr  )
group_sizedur_lensrW   )rK   r   r/   rr   rn   int64rD   r   valuesfloatmaxr   r   r=   squeeze)rL   ra   r  rl   r  r  rf   r  r  volume_tensorlensr   n_framesdurr   
ret_valuesr  truncated_lengthvolume_extendedrO   rO   rP   forward_for_export  sR   



zRadTTSModel.forward_for_exportr   )r   r   )F)T)rV   r  )'__name__
__module____qualname__r   r   r5   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   intr-  rn   ro   r   propertyr   r-   strTensorr   r   r  r   r   r  r"  r7  __classcell__rO   rO   rM   rP   r   /   sD    05< 


+




r   )+r   rn   hydra.utilsr   lightning.pytorchr   lightning.pytorch.loggersr   r   r   r   @nemo.collections.common.tokenizers.text_to_speech.tts_tokenizersr   &nemo.collections.tts.losses.radttslossr   r	    nemo.collections.tts.models.baser
   (nemo.collections.tts.parts.utils.helpersr   r   r   r   r   nemo.core.classesr   nemo.core.classes.commonr   nemo.core.neural_types.elementsr   r   r   r   r   "nemo.core.neural_types.neural_typer   nemo.core.optim.radamr   
nemo.utilsr   nemo.utils.decoratorsr   r   rO   rO   rO   rP   <module>   s&   