o
    wiA                     @   sD  d dl Z d dlmZmZ d dlmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZ d dlmZmZmZmZ d dlmZ d dl	mZ d d	lmZ d d
lmZ d dlmZ d dlmZmZm Z m!Z! d dl"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 eG dd dZ1eG dd dZ2G dd deZ3dS )    N)	dataclassfield)AnyDictListOptional)instantiate)TensorBoardLoggerWandbLogger)MISSING
DictConfig	OmegaConf	open_dict)ConfigAttributeError)nn)parsers)Tacotron2Loss)SpectrogramGenerator)g2p_backward_compatible_supportget_mask_from_lengthstacotron2_log_to_tb_functacotron2_log_to_wandb_func)PretrainedModelInfo	typecheck)AudioSignalEmbeddedTextTypeLengthsType
LogitsTypeMelSpectrogramTypeSequenceToSequenceAlignmentType)
NeuralType)loggingmodel_utilsc                   @   s&   e Zd ZU eZeed< eZeed< dS )Preprocessor_target_	pad_valueN)	__name__
__module____qualname__r   r$   str__annotations__r%   float r,   r,   b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/models/tacotron2.pyr#   0   s   
 r#   c                   @   s   e Zd ZU edd dZeed< eZe	e
e
f ed< eZe	e
e
f ed< eZe	e
e
f ed< eZeed< d	Zee	e
e
f  ed
< d	Zee	e
e
f  ed< d	S )Tacotron2Configc                   C   s   t  S N)r#   r,   r,   r,   r-   <lambda>8   s    zTacotron2Config.<lambda>)default_factorypreprocessorencoderdecoderpostnetlabelsNtrain_dsvalidation_ds)r&   r'   r(   r   r2   r#   r*   r   r3   r   r   r4   r5   r6   r   r7   r   r8   r,   r,   r,   r-   r.   6   s   
 r.   c                       s   e Zd ZdZd1deddf fddZedd	 Zd2dede	j
fddZedd Zedd Ze dddddZedede idede iddd Zdd Zdd  Zd!d" Zd#d$ Zd3d&ed'efd(d)Zd*d+ Zd,d- Zed4d/d0Z  ZS )5Tacotron2ModelzDTacotron 2 Model that is used to generate mel spectrograms from textNcfgtrainerTrainerc                    sn  t |}t |}d | _d | _i | _| | d | _t|dr7| 	| t
| jj| _| jj| _| jj| _nt
|jd | _t j||d tt}t|trWt|}nt|tsftdt| dzt|| |jj| _W n t y   |jj!j| _t"#d Y nw d | _$t%|j| _&t'(| jd| _)t%| j*j+| _+t%| j*j,| _,t%| j*j-| _-t. | _/d| _0d S )	Ntext_tokenizer   )r:   r;   zcfg was type: z(. Expected either a dict or a DictConfigzYour config is using an old NeMo yaml configuration. Please ensure that the yaml matches the current version in the main branch for future compatibility.i   T)1r"   #convert_model_config_to_dict_configmaybe_update_config_version
normalizertext_normalizer_calltext_normalizer_call_kwargs_setup_normalizer	tokenizerhasattr_setup_tokenizerlentokens
num_tokenspadtokenizer_padoovtokenizer_unkr6   super__init__r   
structuredr.   
isinstancedictcreater   
ValueErrortypemerger2   r%   r   paramsr!   warning_parserr   audio_to_melspec_precessorr   	Embeddingtext_embedding_cfgr3   r4   r5   r   losscalculate_loss)selfr:   r;   schema	__class__r,   r-   rP   D   sH   









zTacotron2Model.__init__c              	   C   sr   | j d ur| j S | jjjjdd }|dkrd | _ | j S t| jdr5tj| jj	ddddddd	| _ | j S t
d
)N.
TTSDatasetr6   enT	fastpitchF)r6   nameunk_idblank_iddo_normalizeabbreviation_version
make_tablezDWanted to setup parser, but model does not have necessary paramaters)rZ   r^   r7   datasetr$   splitrF   r   make_parserr6   rU   )ra   ds_class_namer,   r,   r-   parserw   s$   
zTacotron2Model.parserTtextreturnc                 C   s   | j rtd |r| jd ur| j|fi | j}t }t| jdr*| jj	dd}|+ | jd ur9| j
|}n| |}t| jjg| t| jjd g }W d    n1 sZw   Y  t|d| j}|S )Nz+parse() is meant to be called in eval mode.set_phone_probg      ?)prob   r   )trainingr!   rY   rB   rC   
contextlibnullcontextrF   rE   rw   encodert   rH   r^   r6   torchtensor
unsqueeze_todevice)ra   ru   	normalizeeval_phon_moderI   tokens_tensorr,   r,   r-   parse   s    


$zTacotron2Model.parsec                 C   sj   | j rtdt tdt tdt tdt dS tdt tdt tdt ddtdt dddS )NBTr   )rI   	token_lenaudio	audio_lenT)optional)rz   r    r   r   r   ra   r,   r,   r-   input_types   s   





zTacotron2Model.input_typesc                 C   s   | j s"| js"tdt tdt tdt tdt tdt dS tdt tdt tdt tdt tdt tdt dS )Nr   Dr   r   )r   r   r   r   )spec_pred_decspec_pred_postnet	gate_pred
alignmentspred_length)r   r   r   spec_targetspec_target_lenr   )r`   rz   r    r   r   r   r   r   r,   r,   r-   output_types   s   










zTacotron2Model.output_types)r   r   c                C   s   |d ur|d ur|  ||\}}n
| js| jrtd| |dd}| j||d}| jr:| j|||d\}	}
}n| j||d\}	}
}}| j|	d}| jsX| jsX|	||
||fS |	||
|||fS )Nze'audio' and 'audio_len' can not be None when either 'self.training' or 'self.calculate_loss' is True.ry      )token_embeddingr   )memorydecoder_inputsmemory_lengths)r   r   )mel_spec)	r[   rz   r`   rU   r]   	transposer3   r4   r5   )ra   rI   r   r   r   r   r   r   encoder_embeddingr   r   r   r   r   r,   r,   r-   forward   s&   zTacotron2Model.forwardrI   r   specr   )r   r   c                C   s   |    d| _tdd |D | j}| ||d}|d }|jd dkrLt|d  }||jd |	d|	d}|
ddd}|j|| j |S )	NFc                 S   s   g | ]}t |qS r,   )rH   ).0ir,   r,   r-   
<listcomp>       z7Tacotron2Model.generate_spectrogram.<locals>.<listcomp>)rI   r   ry   r   rf   r   )evalr`   r~   r   r   r   shaper   expandsizepermutedatamasked_fill_r%   )ra   rI   r   tensorsspectrogram_predmaskr,   r,   r-   generate_spectrogram   s    z#Tacotron2Model.generate_spectrogramc                 C   s^   |\}}}}| j ||||d\}}}	}
}}| j|||	|
|| jd\}}|d|id|id}|S )Nr   r   rI   r   r   r   r   r   r   r%   training_lossr_   )r_   progress_barlog)r   r_   r%   )ra   batch	batch_idxr   r   rI   r   r   r   r   r   r   _r_   outputr,   r,   r-   training_step   s"   

zTacotron2Model.training_stepc                 C   sh   |\}}}}| j ||||d\}}}	}
}}| j|||	|
|| jd\}}||
||	||d}| j| |S )Nr   r   )val_loss
mel_targetmel_postnetgategate_targetr   )r   r_   r%   validation_step_outputsappend)ra   r   r   r   r   rI   r   r   r   r   r   r   r   r_   r   r,   r,   r-   validation_step  s*   
	zTacotron2Model.validation_stepc                 C   s   | j d urK| j jd urK| j j}| jjD ]}t|tr|j} nqt|tr6t|| jd  | j	dddd nt|t
rKt|| jd  | j	dddd tdd | jD  }| d| | j  d S )	Nr   valTF)tag
log_images	add_audioc                 S   s   g | ]}|d  qS )r   r,   )r   xr,   r,   r-   r   @  r   z:Tacotron2Model.on_validation_epoch_end.<locals>.<listcomp>r   )logger
experimentr;   loggersrR   r	   r   r   valuesglobal_stepr
   r   r~   stackmeanr   clear)ra   r   avg_lossr,   r,   r-   on_validation_epoch_end&  s@   


z&Tacotron2Model.on_validation_epoch_endc                 C   s   i }d|j v r`|j jd ur`|  r0|j jdd d ur0|j jd dr0t|j jd |j jd< i }d|j jv rC| d|j jj|d< d|j jv rT| d|j jj|d< t	|j jfi ||d< t	|j fi || _
d S )Ng2pr$   znemo_text_processing.g2pphoneme_dictztext_tokenizer.g2p.phoneme_dict
heteronymsztext_tokenizer.g2p.heteronyms)r=   r   _is_model_being_restoredget
startswithr   register_artifactr   r   r   rE   )ra   r:   text_tokenizer_kwargs
g2p_kwargsr,   r,   r-   rG   E  s.   
zTacotron2Model._setup_tokenizertrainshuffle_should_berj   c                 C   s  d|vs
t |jtstd| d|vst |jts"td| |r`d|jvrOtd|  d| d t|j d	|j_W d    n1 sIw   Y  n#|jjs_t	d
| d|  d n|sr|jjrrt	d
| d|  d t
|j| j| j| jd}tjjj|fd|ji|jS )Nrp   zNo dataset for dataloader_paramszNo dataloder_params for shufflez"Shuffle should be set to True for z's zE dataloader but was not found in its config. Manually setting to TrueTzThe z dataloader for z has shuffle set to False!!!z has shuffle set to True!!!)text_normalizerrC   r=   
collate_fn)rR   rp   r   rU   r   r!   rY   r   r   errorr   rA   rC   rE   r~   utilsr   
DataLoaderr   )ra   r:   r   rj   rp   r,   r,   r-   __setup_dataloader_from_configd  s2   

z-Tacotron2Model.__setup_dataloader_from_configc                 C   s   |  || _d S r/   )-_Tacotron2Model__setup_dataloader_from_config	_train_dlra   r:   r,   r,   r-   setup_training_data  s   z"Tacotron2Model.setup_training_datac                 C   s   | j |ddd| _d S )NF
validation)r   rj   )r   _validation_dlr   r,   r,   r-   setup_validation_data  s   z$Tacotron2Model.setup_validation_dataList[PretrainedModelInfo]c                 C   s&   g }t ddd| dgd}|| |S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        tts_en_tacotron2zmhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_tacotron2/versions/1.10.0/files/tts_en_tacotron2.nemozThis model is trained on LJSpeech sampled at 22050Hz, and can be used to generate female English voices with an American accent.zTacotron2-22050Hz)pretrained_model_namelocationdescriptionclass_aliases)r   r   )clslist_of_modelsmodelr,   r,   r-   list_available_models  s   
z$Tacotron2Model.list_available_modelsr/   )T)Tr   )rv   r   )r&   r'   r(   __doc__r   rP   propertyrt   r)   r~   Tensorr   r   r   r   r   r    r   r   r   r   r   r   rG   boolr   r   r   classmethodr   __classcell__r,   r,   rc   r-   r9   A   s4    3



r9   )4r{   dataclassesr   r   typingr   r   r   r   r~   hydra.utilsr   lightning.pytorch.loggersr	   r
   	omegaconfr   r   r   r   omegaconf.errorsr   r   +nemo.collections.common.parts.preprocessingr   )nemo.collections.tts.losses.tacotron2lossr    nemo.collections.tts.models.baser   (nemo.collections.tts.parts.utils.helpersr   r   r   r   nemo.core.classes.commonr   r   nemo.core.neural_types.elementsr   r   r   r   r   r   "nemo.core.neural_types.neural_typer    
nemo.utilsr!   r"   r#   r.   r9   r,   r,   r,   r-   <module>   s,    
