o
    }oix                     @   sb  d dl Z d dlmZmZ d dlZd dlZd dlZd dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZ d dlmZ d dlmZm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9 G dd dee(Z:dS )    N)ListOptional)instantiate)Trainer)WandbLogger)
DictConfig)nn)
functional)AlbertTokenizer)EnglishCharsTokenizerEnglishPhonemesTokenizer)BinLossForwardSumLoss)SpectrogramGenerator)average_featuresregulate_len)binarize_attention_parallelg2p_backward_compatible_supportget_mask_from_lengthsplot_pitch_to_numpyplot_spectrogram_to_numpy)
Exportable)PretrainedModelInfo	typecheck)LengthsTypeLogprobsTypeMelSpectrogramType	ProbsTypeRegressionValuesTypeTokenDurationType
TokenIndexTokenLogDurationType)
NeuralType)loggingmodel_utilsc                       s  e Zd ZdZdOdeddf fddZdd	 ZdPddZdPddZdPddZ								dQddZ
ejjdd Zeede ede ede ddede ddede ddede ddede dddede ede ede ede ede ede ede ede dd dRd!d"Z						#		dSd$d%Zd&d' Zd(d) Zd*d+ Zeede dded,e ddede ddeddgeddd-d.ede id 						
dTd/eej d0eej d1eej d2ee e!  d3e"d4e!fd5d6Z#dUd7e!d8ejfd9d:Z$d;d< Z%d=d> Z&d?d@ Z'dAdB Z(e)dVdDdEZ*e+dFdG Z,e+dHdI Z-dWdKdLZ.dOdMdNZ/  Z0S )XMixerTTSModelzwMixer-TTS and Mixer-TTS-X models (https://arxiv.org/abs/2110.03584) that is used to generate mel spectrogram from text.Ncfgtrainerr   c                    s  t |}t |}d | _d | _i | _| | d | _| | | jd us'J t	| jj
}| jj| _| jj| _t j||d |j| _|j| _|j| _t|j| _t | _t | _d| _d| _|j| _|j| _|dd| _ | j r| j!d ur{| j!j"j#n| $|j%| _#| &|j%| _'d| j'j(_)t|j*| j'j(j+d d| _*t|j,|| jd| _,| j,j-| _.t|j/| _/t0|j1t0|j2| _1| _2t|j3| _3t|j4| _4t|j5| _5t|j6| _6t78| j6j9|j:| _;d S )N)r&   r'   F        cond_on_lm_embeddings   )n_lm_tokens_channels)
num_tokenspadding_idx)<r$   #convert_model_config_to_dict_configmaybe_update_config_version
normalizertext_normalizer_calltext_normalizer_call_kwargs_setup_normalizer	tokenizer_setup_tokenizerlentokenspadtokenizer_padoovtokenizer_unksuper__init__pitch_loss_scaledurs_loss_scalemel_loss_scaler   alignment_modulealignerr   forward_sum_lossr   bin_lossadd_bin_lossbin_loss_scalebin_loss_start_ratiobin_loss_warmup_epochsgetr)   	_train_dldatasetlm_padding_value_get_lm_padding_valuelm_model_get_lm_embeddingslm_embeddingsweightrequires_gradself_attention_moduleshapeencoderto_embed
symbol_embduration_predictorfloat
pitch_mean	pitch_stdpitch_predictor	pitch_embpreprocessordecoderr   Lineard_modeln_mel_channelsproj)selfr&   r'   r,   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/mixer_tts.pyr=   ?   sV   










zMixerTTSModel.__init__c                 C   s   i }d|j v rZ|  r*|j jdd d ur*|j jd dr*t|j jd |j jd< i }d|j jv r=| d|j jj|d< d|j jv rN| d|j jj|d< t	|j jfi ||d< t	|j fi || _
d S )Ng2p_target_znemo_text_processing.g2pphoneme_dictztext_tokenizer.g2p.phoneme_dict
heteronymsztext_tokenizer.g2p.heteronyms)text_tokenizer_is_model_being_restoredri   rI   
startswithr   register_artifactrk   rl   r   r4   )rd   r&   text_tokenizer_kwargs
g2p_kwargsrg   rg   rh   r5   ~   s.   

zMixerTTSModel._setup_tokenizeralbertc                 C   s`   t | dd d ur| jS | jd ur| jjd ur| jjj| _|dkr)td| _| jS t| d)N_lm_model_tokenizerrs   albert-base-v2D lm model is not supported. Only albert is supported at this moment.)getattrrt   rJ   rK   lm_model_tokenizerr
   from_pretrainedNotImplementedErrorrd   rN   rg   rg   rh   _get_lm_model_tokenizer   s   z%MixerTTSModel._get_lm_model_tokenizerc                 C   s&   |dkrt jdjjS t| d)Nrs   ru   rv   )transformersAlbertModelry   
embeddingsword_embeddingsrz   r{   rg   rg   rh   rO      s
   z MixerTTSModel._get_lm_embeddingsc                 C   s(   |dkrt jddS t| d)Nrs   ru   <pad>rv   )r}   r
   ry   _convert_token_to_idrz   r{   rg   rg   rh   rM      s
   z#MixerTTSModel._get_lm_padding_valuec              	   C   s  t |}t |}d}tj||d   dd}||  }| |  }| d }tj|dd}|	 
 }||k|   |  d }||  dk|   |  d }||  dk|   |  d }|dd	}tj||ddjd
d}||  }| |  }|| j|  | j|  }d\}}| j|	||d}|| }| jr| j||
d}|| j|  }t|d|d}tj||dd}||  |  }|| j|  }|||||||||f	S )Nr(   r*   none)	reductionr   )mind         dimNN)attn_logprobin_lensout_lens)hard_attentionsoft_attention)r   Fmse_lossrY   logsumexptorch	clamp_minroundlongabs	transposemeanr?   r@   rC   rE   rD   rF   r   	unsqueezesqueezer>   )rd   	true_durstrue_text_len	pred_durs
true_pitch
pred_pitch
true_spect
pred_specttrue_spect_lenr   	attn_soft	attn_hardattn_hard_dur	text_maskmel_maskloss	durs_loss	durs_predacc
acc_dist_1
acc_dist_3mel_lossrD   ctc_losstrue_avg_pitch
pitch_lossrg   rg   rh   _metrics   s8    ((zMixerTTSModel._metricsc                 C   s   |  |}| j||ddd|dk|d\}}	t|||}
|
dd d dd d f }tt|jdd|s:J ||	|
|fS )Nr   r   r*   )mask
attn_priorr   )rW   rB   permuter   r   r   alleq)rd   texttext_lenr   spect	spect_lenr   text_embr   r   r   r   rg   rg   rh   run_aligner   s   

zMixerTTSModel.run_alignerBT_text)r   )r   T_audioToptionalr   DT_spec)r   r   r   r   T_lm_tokensr   r   pitchr   r   r   	lm_tokens)r   Sr   r   )r   durs_predictedlog_durs_predictedpitch_predictedr   r   r   r   )input_typesoutput_typesc                 C   s  | j r	|d us	J t|d}| ||\}	}
d\}}}}|d ur0| ||||||\}}}}| jrH| |}| j|	|||
d|| j	kd}| 
|	|
}t| d d}| |	|
}| j s|d uryt|d|d}| |d}n| |d}nt|d|d}| |d}|	|dd }	| jr|	| }	t||	\}}| |t|d\}}| |}||||||||fS )Nr   )NNNNq_maskkv_maskr*   r   )trainingr   r   rU   r   r)   rP   rS   r   rL   rX   r   clampr   r\   r   r]   r   r   r_   rc   )rd   r   r   r   r   r   r   r   r   enc_outenc_maskr   r   r   r   lm_emblm_featuresr   r   r   r]   len_regulated_enc_outdec_lensdec_outr   rg   rg   rh   forward  sN   

zMixerTTSModel.forwardFc
                 C   s^  |d u rt |d}| ||\}
}d }|r%| ||||||\}}}}| jr=| |}| j|
|||d|| jkd}| 	|
|}t
| d d}|rg|	d urgt|	d|d}	| |	d}n| |
|}| |d}|
|dd }
| jr|
| }
|r|d urt||
\}}n	tt||
\}}| |t |d\}}| |}|S )Nr   r   r*   r   )r   r   rU   r   r)   rP   rS   r   rL   rX   r   r   r   r   r]   r\   r   r   rz   r_   rc   )rd   r   r   r   r   r   r   use_gt_dursr   r   r   r   r   r   r   r   r   r   r   r   r]   r   r   r   r   _r   rg   rg   rh   inferW  s>   

zMixerTTSModel.inferc                 C   sb   t | j| jj }| js| j|krtd| j  d| _| jr/t	| j| | j
 d| _d S d S )Nz#Using hard attentions after epoch: T      ?)npceilrG   _trainer
max_epochsrE   current_epochr#   infor   rH   rF   )rd   bin_loss_start_epochrg   rg   rh   on_train_epoch_start  s   z"MixerTTSModel.on_train_epoch_startc                 C   s@  d\}}| j r|\}}}}}}	}
}n	|\}}}}}}	}
| j||d\}}|	dk}|	| j | j }	d|	|< | |||	||||d\}}
}}}}}}| j|||||	|||||||d\	}}}}}}}}}|||d u rptd|jn|||||d u rtd|jn||d u rtd|jn|d}|d	d
 |	 D |dS )Nr   input_signallengthr   r(   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )
train_losstrain_durs_losstrain_pitch_losstrain_mel_losstrain_durs_acctrain_durs_acc_dist_3train_ctc_losstrain_bin_lossc                 S   s   i | ]	\}}||  qS rg   )detach).0kvrg   rg   rh   
<dictcomp>  s    z/MixerTTSModel.training_step.<locals>.<dictcomp>)r   progress_barr   )
r)   r^   rZ   r[   r   r   tensortodeviceitems)rd   batch	batch_idxr   r   audio	audio_lenr   r   r   r   r   r   zero_pitch_idxr   pred_log_dursr   r   r   r   r   r   r   r   r   r   r   r   r   rD   	train_logrg   rg   rh   training_step  sx   zMixerTTSModel.training_stepc           #      C   s
  d\}}| j r|\}}}}}}	}
}n	|\}}}}}}	}
| j||d\}}|	dk}|	| j | j }	d|	|< | |||	||||d\}}
}}}}}}| j|||||	|||||||d\	}}}}}}}}}| ||d ||||d\}}
}}}}}}| j|||||	|||||||d^ }
}}
}
|||d u rtd|jn||||||d u rtd|jn||d u rtd|jn|d	}| j	|d	d
d
d
d |dkr| j
d dkrt| jtrg } g }!ttd|jd D ]}"| tjt||"d d d ||" f j  d|" dtjt|dd|"d d d ||" f j  d|" dg7 } |!tjtt|	d|d|"d ||" f j  ddgdd|" dg7 }!|!tjt||"d ||" f j  ddgdd|" dg7 }!q| jj| |!d d S d S d S d S )Nr   r   r   r(   r   r   r   )	val_lossval_durs_lossval_pitch_lossval_mel_lossval_with_pred_features_mel_lossval_durs_accval_durs_acc_dist_3val_ctc_lossval_bin_lossFT)prog_baron_epochlogger	sync_dist   r   zgt mel )captionr*   r   z	pred mel g      g      @)
ylim_rangez	gt pitch zpred pitch )specspitches)r)   r^   rZ   r[   r   r   r  r  r  log_dictr   
isinstancer  r   ranger   rT   wandbImager   datacpunumpyr   r   r   r   r   
experimentr   )#rd   r  r  r   r   r  r	  r   r   r   r   r   r   r
  r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   rD   with_pred_features_mel_lossval_logr  r  irg   rg   rh   validation_step  s   
(&.(zMixerTTSModel.validation_stepr   )r7   
tokens_lenr   	raw_textsrN   r   r7   r-  r   r.  norm_text_for_lm_modelrN   c                    s  |d ur|d u r|j jkjdd}n4|d u rtdfdd|D }tjjjjfdd|D dj jd}tj	d	d |D tj
|jd
}jr|d u r|d u rVtd|  d} dtj tsstj tssJ |rjd urfdd|D }fdd|D }	 fdd|	D }
j jrfdd|
D }
tjt|
tdd |
D f||jd}t|
D ]\}}tj	||jd||d t|f< qj|||ddd}|S )Nr   z-raw_texts must be specified if tokens is Nonec                    s   g | ]}  |qS rg   )r4   r   trd   rg   rh   
<listcomp>  s    z6MixerTTSModel.generate_spectrogram.<locals>.<listcomp>c                    s    g | ]}t j|t j jd qS )dtyper  )r   r  r   r  r1  r3  rg   rh   r4         T)	sequencesbatch_firstpadding_valuec                 S      g | ]}t |qS rg   r6   r1  rg   rg   rh   r4        r5  z0raw_texts must be specified if lm_tokens is Noner   u   ▁c                    s    g | ]} j |fi  jqS rg   )r1   r2   r1  r3  rg   rh   r4    r7  c                    s   g | ]} j |qS rg   )r4   text_preprocessing_funcr1  r3  rg   rh   r4    s    c                    s   g | ]	} j |d dqS )F)add_special_tokens)encoder1  )rx   rg   rh   r4    s    c                    s   g | ]
} g|  g qS rg   rg   r1  )lm_space_valuerg   rh   r4    s    c                 S   r;  rg   r<  r1  rg   rg   rh   r4    r=  )
fill_valuer  )r  )r   r*   r   )r4   r8   r   
ValueErrorr   r   utilsrnnpad_sequencer  r   r  r)   r|   r   r!  r   r   r1   pad_with_spacefullr6   max	enumerater   r   )rd   r7   r-  r   r.  r/  rN   t_seqsrL   preprocess_texts_as_tts_inputlm_tokens_as_ids_listr+  lm_tokens_ir   rg   )rx   rA  rd   rh   generate_spectrograml  sN   




"z"MixerTTSModel.generate_spectrogramr   returnc                 C   s   | j rtd |r| jd ur| j|fi | j}t }t| jdr*| jj	dd}| | j
|}W d    n1 s=w   Y  t| d| jS )Nz+parse() is meant to be called in eval mode.set_phone_probr   )probr   )r   r#   warningr1   r2   
contextlibnullcontexthasattrr4   rQ  r@  r   r  r   r   r  r  )rd   r   	normalizeeval_phon_moder7   rg   rg   rh   parse  s   
zMixerTTSModel.parsec                 C   sf   z|j j}W n tjjy   td Y d S w t|j | j| j	| j
d}tjjjd||jd|jS )Nz9manifest_filepath was skipped. No dataset for this model.)text_normalizerr2   rm   )rK   
collate_fnrg   )rK   manifest_filepath	omegaconferrorsMissingMandatoryValuer#   rS  r   r0   r2   r4   r   rD  r%  
DataLoaderr[  dataloader_params)rd   r&   r   rK   rg   rg   rh   _loader  s$   

zMixerTTSModel._loaderc                 C      |  || _d S N)rb  rJ   rd   r&   rg   rg   rh   setup_training_data     z!MixerTTSModel.setup_training_datac                 C   rc  rd  )rb  _validation_dlre  rg   rg   rh   setup_validation_data  rg  z#MixerTTSModel.setup_validation_datac                 C   s   dS )zOmitted.Nrg   re  rg   rg   rh   setup_test_data  s   zMixerTTSModel.setup_test_dataList[PretrainedModelInfo]c                 C   s<   g }t ddd| d}|| t ddd| d}|| |S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        tts_en_lj_mixerttszphttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_mixertts/versions/1.6.0/files/tts_en_lj_mixertts.nemozThis model is trained on LJSpeech sampled at 22050Hz with and can be used to generate female English voices with an American accent.)pretrained_model_namelocationdescriptionclass_tts_en_lj_mixerttsxzrhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_mixerttsx/versions/1.6.0/files/tts_en_lj_mixerttsx.nemo)r   append)clslist_of_modelsmodelrg   rg   rh   list_available_models  s    

z#MixerTTSModel.list_available_modelsc                 C   s   t dt t dt dddS )Nr   r   Tr   )r   r   )r"   r    r3  rg   rg   rh   r     s   
zMixerTTSModel.input_typesc                 C   s   dt dt iS )Nr   r   )r"   r   r3  rg   rg   rh   r      s   zMixerTTSModel.output_types
   c                 C   sb   t jdt| jjd|f| jt jd}d|i}| jr.t jd| jj	j
d d|f| jt jd|d< |fS )Nr   r*   )lowhighsizer  r6  r   r   )r   randintr6   r4   r7   r  r   r)   rP   rQ   rT   )rd   max_text_lenmax_lm_tokens_lenr   inputsrg   rg   rh   input_example  s"   

zMixerTTSModel.input_examplec                 C   s4   || j kd}| j|||ddd}|tjS )Nr   )r   r   r   r*   )r9   r   r   r   r  r   rY   )rd   r   r   r   r   rg   rg   rh   forward_for_export  s   z MixerTTSModel.forward_for_exportrd  )rs   )NNNNNNN)NNNNN)NNNNNFNN)NNNNTrs   )T)rP  rk  )rw  rw  )1__name__
__module____qualname____doc__r   r=   r5   r|   rO   rM   r   r   jitunusedr   r   r"   r    r   r   r   r   r   r!   r   r   r   r   r  r,  r   Tensorr   strboolrO  rY  rb  rf  ri  rj  classmethodrv  propertyr   r   r  r  __classcell__rg   rg   re   rh   r%   <   s    ?



:











@
=G 

;


r%   );rT  typingr   r   r'  r   r]  r   r}   r#  hydra.utilsr   lightning.pytorchr   lightning.pytorch.loggersr   r   r   torch.nnr	   r   r
   @nemo.collections.common.tokenizers.text_to_speech.tts_tokenizersr   r   (nemo.collections.tts.losses.aligner_lossr   r    nemo.collections.tts.models.baser   &nemo.collections.tts.modules.fastpitchr   r   (nemo.collections.tts.parts.utils.helpersr   r   r   r   r   	nemo.corer   nemo.core.classes.commonr   r   nemo.core.neural_types.elementsr   r   r   r   r   r   r    r!   "nemo.core.neural_types.neural_typer"   
nemo.utilsr#   r$   r%   rg   rg   rg   rh   <module>   s2   (
