o
    }oi(                     @   s   d dl mZ d dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZmZ d dlmZ d dlmZ d dlmZmZ dZzd dl Z W n e!yq   dZY nw G dd deeZ"dS )    )ListN)instantiate)Trainer)WandbLogger)
DictConfig)nn)BinLossForwardSumLoss)NeedsNormalizer)binarize_attentiong2p_backward_compatible_supportget_mask_from_lengthsplot_alignment_to_numpy)ModelPT)PretrainedModelInfo)loggingmodel_utilsTFc                       s   e Zd ZdZd deddf fddZdd	 Zdd
ddZdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zedee fddZ  ZS )!AlignerModelzSpeech-to-text alignment model (https://arxiv.org/pdf/2108.10447.pdf) that is used to learn alignments between mel spectrogram and text.Ncfgtrainerr   c                    s   t |}t |}d | _d | _i | _| | d | _| | | jd us'J t	| jj
}| jj| _| jj| _t j||d t||j| _t|j| _t|j| _t | _t | _d| _d| _|j| _|j| _d S )N)r   r   F        ) r   #convert_model_config_to_dict_configmaybe_update_config_version
normalizertext_normalizer_calltext_normalizer_call_kwargs_setup_normalizer	tokenizer_setup_tokenizerlentokenspadtokenizer_padoovtokenizer_unksuper__init__r   	Embeddingsymbols_embedding_dimembedr   preprocessoralignment_encoderr	   forward_sum_lossr   bin_lossadd_bin_lossbin_loss_scalebin_loss_start_ratiobin_loss_warmup_epochs)selfr   r   
num_tokens	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/aligner.pyr&   0   s,   





zAlignerModel.__init__c                 C   s   i }d|j v rZ|  r*|j jdd d ur*|j jd dr*t|j jd |j jd< i }d|j jv r=| d|j jj|d< d|j jv rN| d|j jj|d< t	|j jfi ||d< t	|j fi || _
d S )Ng2p_target_znemo_text_processing.g2pphoneme_dictztext_tokenizer.g2p.phoneme_dict
heteronymsztext_tokenizer.g2p.heteronyms)text_tokenizer_is_model_being_restoredr8   get
startswithr   register_artifactr:   r;   r   r   )r2   r   text_tokenizer_kwargs
g2p_kwargsr6   r6   r7   r   Q   s.   

zAlignerModel._setup_tokenizer)
attn_priorc                C   st   t jj| jjdd$ | j|| |ddt|	ddk|d\}}W d    ||fS 1 s1w   Y  ||fS )NF)enabled      r   )querieskeysmaskrC   )
torchampautocastdevicetyper+   r)   	transposer   	unsqueeze)r2   specspec_lentexttext_lenrC   	attn_softattn_logprobr6   r6   r7   forwardp   s   
zAlignerModel.forwardc           	      C   sV   d\}}}| j |||d}||7 }| jr%t|||}| j||d}||7 }||||fS )N)r   NN)rW   in_lensout_lens)hard_attentionsoft_attention)r,   r.   r   r-   )	r2   rV   rW   rS   rU   lossr-   	attn_hardr,   r6   r6   r7   _metrics{   s   
zAlignerModel._metricsc                 C   sb   t | j| jj }| js| j|krtd| j  d| _| jr/t	| j| | j
 d| _d S d S )Nz#Using hard attentions after epoch: T      ?)npceilr0   _trainer
max_epochsr.   current_epochr   infominr1   r/   )r2   bin_loss_start_epochr6   r6   r7   on_train_epoch_start   s   z!AlignerModel.on_train_epoch_startc                 C   s   |\}}}}}| j ||d\}}	| ||	|||d\}
}| |
||	|\}}}}||d u r5td|jn|d}|dd | D |dS )Ninput_signallengthrR   rS   rT   rU   rC   r`   )train_forward_sum_losstrain_bin_lossc                 S   s   i | ]	\}}||  qS r6   )detach).0kvr6   r6   r7   
<dictcomp>   s    z.AlignerModel.training_step.<locals>.<dictcomp>)r]   progress_barlog)r*   r_   rK   tensortorN   items)r2   batch	batch_idxaudio	audio_lenrT   rU   rC   rR   rS   rV   rW   r]   r,   r-   _	train_logr6   r6   r7   training_step   s   

zAlignerModel.training_stepc                 C   s|  |\}}}}}| j ||d\}}	| ||	|||d\}
}| |
||	|\}}}}|dkrt| jtrtr|d u r=t|
||	}g }ttd|j	d D ]L}|
tjttt|
|dd |	| d || f j  dd |
tjttt||dd |	| d || f j  dd qI| jjd|i |||d u rtd	|jn|d
}| j|ddddd d S )Nrj   rm   r      z	attn soft)captionz	attn hardattn_matricesr`   )val_lossval_forward_sum_lossval_bin_lossFT)prog_baron_epochlogger	sync_dist)r*   r_   
isinstancer   r   
HAVE_WANDBr   rangerg   shapeappendwandbImager   ra   fliplrrot90datacpunumpy
experimentrv   rK   rw   rx   rN   log_dict)r2   rz   r{   r|   r}   rT   rU   rC   rR   rS   rV   rW   r]   r,   r-   r^   r   ival_logr6   r6   r7   validation_step   sB   

4	4	zAlignerModel.validation_stepc                 C   sf   z|j j}W n tjjy   td Y d S w t|j | j| j	| j
d}tjjjd||jd|jS )Nz9manifest_filepath was skipped. No dataset for this model.)text_normalizerr   r<   )dataset
collate_fnr6   )r   manifest_filepath	omegaconferrorsMissingMandatoryValuer   warningr   r   r   r   rK   utilsr   
DataLoaderr   dataloader_params)r2   r   r~   r   r6   r6   r7   _loader   s$   

zAlignerModel._loaderc                 C      |  || _d S N)r   	_train_dlr2   r   r6   r6   r7   setup_training_data      z AlignerModel.setup_training_datac                 C   r   r   )r   _validation_dlr   r6   r6   r7   setup_validation_data   r   z"AlignerModel.setup_validation_datac                 C   s   dS )zOmitted.Nr6   r   r6   r6   r7   setup_test_data   s   zAlignerModel.setup_test_datareturnc                 C   s<   g }t ddd| d}|| t ddd| d}|| |S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        tts_en_radtts_alignerzqhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/ARPABET_1.11.0/files/Aligner.nemozbThis model is trained on LJSpeech sampled at 22050Hz with and can be used to align text and audio.)pretrained_model_namelocationdescriptionclass_tts_en_radtts_aligner_ipazmhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_radtts_aligner/versions/IPA_1.13.0/files/Aligner.nemo)r   r   )clslist_of_modelsmodelr6   r6   r7   list_available_models   s    

z"AlignerModel.list_available_modelsr   )__name__
__module____qualname____doc__r   r&   r   rX   r_   ri   r   r   r   r   r   r   classmethodr   r   r   __classcell__r6   r6   r4   r7   r   -   s    !+r   )#typingr   r   ra   r   rK   hydra.utilsr   lightning.pytorchr   lightning.pytorch.loggersr   r   r   (nemo.collections.tts.losses.aligner_lossr   r	    nemo.collections.tts.models.baser
   (nemo.collections.tts.parts.utils.helpersr   r   r   r   nemo.core.classesr   nemo.core.classes.commonr   
nemo.utilsr   r   r   r   ModuleNotFoundErrorr   r6   r6   r6   r7   <module>   s,   