o
    }oi                     @   s  d dl Z d dlmZmZ d dlmZ d dlmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@mAZA eG dd dZBeG dd dZCeG dd dZDG dd de!e0e%ZEdS )    N)	dataclassfield)Path)ListOptional)instantiate)Trainer)TensorBoardLogger)
DictConfig	OmegaConf	open_dict)
Hypothesis)parsers)BinLossForwardSumLoss)DurationLoss
EnergyLossMelLoss	PitchLoss)SpectrogramGenerator)FastPitchModule)FastPitchAdapterModelMixin)LoggingCallback)batch_from_raggedg2p_backward_compatible_supportplot_alignment_to_numpyplot_spectrogram_to_numpyprocess_batchsample_tts_input)
Exportable)PretrainedModelInfo	typecheck)IndexLengthsTypeMelSpectrogramType	ProbsTypeRegressionValuesTypeTokenDurationType
TokenIndexTokenLogDurationType)
NeuralType)loggingmodel_utilsc                   @   s>   e Zd ZU dZeed< dZeed< dZeed< dZe	ed< d	S )
	G2PConfigz8nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p_target_z.scripts/tts_dataset_files/cmudict-0.7b_nv22.10phoneme_dictz+scripts/tts_dataset_files/heteronyms-052722
heteronymsg      ?phoneme_probabilityN)
__name__
__module____qualname__r.   str__annotations__r/   r0   r1   float r8   r8   Y/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/fastpitch.pyr-   9   s
   
 r-   c                   @   sx   e Zd ZU dZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< ed
d dZeed< dS )TextTokenizerzYnemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizerr.   Tpunctstresseschars
apostrophepad_with_spaceadd_blank_atc                   C      t  S N)r-   r8   r8   r8   r9   <lambda>J       zTextTokenizer.<lambda>default_factoryg2pN)r2   r3   r4   r.   r5   r6   r;   boolr<   r=   r>   r?   r@   r   rG   r-   r8   r8   r8   r9   r:   A   s   
 r:   c                   @   s$   e Zd ZU edd dZeed< dS )TextTokenizerConfigc                   C   rA   rB   )r:   r8   r8   r8   r9   rC   O   rD   zTextTokenizerConfig.<lambda>rE   text_tokenizerN)r2   r3   r4   r   rJ   r:   r6   r8   r8   r8   r9   rI   M   s   
 rI   c                       s<  e Zd ZdZdVdedef fddZdd Zd	d
 Ze	dd Z
e	dd ZdWdedejfddZeede ede ede ede ddede ddeddede ddede ddede ddede ddede ddede ddddddddddddddddddZed ede id!				dXd"d#d$ee d%ed&ed# d'ed# dejfd(d)Zd*d+ Zd,d- Zd.d/ Z d0d1 Z!d2d3 Z"dYd5e#d6efd7d8Z$d9d: Z%d;d< Z&d=d> Z'd?d@ Z(e)dZdBdCZ* fdDdEZ+dFdG Z,e	dHdI Z-e	dJdK Z.e	dLdM Z/d[dPdQZ0d\dRdSZ1dTdU Z2  Z3S )]FastPitchModelzfFastPitch model (https://arxiv.org/abs/2006.06873) that is used to generate mel spectrogram from text.Ncfgtrainerc                    s\  t |}t |}d | _d | _i | _| | |dd| _i }| jr^d | _	|j
jj| _| jdd | _| jdvrDtd| j d| | | j	d usPJ t| j	j|d< | j	j|d< d | _d | _t j||d	 |d
d| _|dd| _d| _| jrdnd}|d|}|d|}|d|}t | _t|d| _t |d| _!t"|d| _#d | _$| jr|dd}t%| j&j'| _$t(|d| _)t*|d| _+t%| j&j,| _,t%| j&j-fi |}	t%| j&j.}
t%| j&j/}t%| j&j0}t%| j&dd }|dd}t%| j&dd }|dd}|dd}|dd}|dd}|dd}|dd}|dkr=d |	j1j2vr=|	j1j23d  |rN|j1j23d  |j1j23d  |rX|
j1j23d  |ri| j$d uri| j$j1j23d  t4|	|
|||| j$|||j5|j6||j7||j8|| _9d  | _:| _;d| j9j<j=j>fddd!| _?| j9j@d ur|jA| j?d"< |d#d | _B| C  d S )$Nlearn_alignmentF.),nemo.collections.tts.data.dataset.TTSDatasetDnemo.collections.tts.data.text_to_speech_dataset.TextToSpeechDatasetz*nemo.collections.tts.torch.data.TTSDatasetzUnknown dataset class: n_embedpadding_idx)rL   rM   bin_loss_warmup_epochsd   
log_imagesg?      ?dur_loss_scalepitch_loss_scaleenergy_loss_scale)
loss_scalealigner_loss_scalespeaker_encoderenergy_embedding_kernel_sizer   energy_predictor
n_speakersspeaker_emb_condition_prosodyspeaker_emb_condition_decoderspeaker_emb_condition_alignermin_token_durationuse_log_energyT   add)	emb_rangeenable_volumeenable_ragged_batchesnum_speakers
log_config)Dr,   #convert_model_config_to_dict_configmaybe_update_config_version
normalizertext_normalizer_calltext_normalizer_call_kwargs_setup_normalizergetrN   vocabtrain_dsdatasetr.   ds_classsplitds_class_name
ValueError_setup_tokenizerlentokenspad_parser
_tb_loggersuper__init__rU   rW   log_train_imagesr   mel_loss_fnr   pitch_loss_fnr   duration_loss_fnr   energy_loss_fnalignerr   _cfgalignment_moduler   forward_sum_loss_fnr   bin_loss_fnpreprocessor	input_fft
output_fftduration_predictorpitch_predictor
cond_inputcondition_typesappendr   symbols_embedding_dimpitch_embedding_kernel_sizen_mel_channelsmax_token_duration	fastpitch_input_types_output_typesencoderword_embnum_embeddingsexport_configspeaker_embra   rm   setup_adapters)selfrL   rM   input_fft_kwargsdefault_prosody_loss_scalerY   rZ   r[   r]   r   r   r   r   r^   r_   r`   ra   rb   rc   rd   re   rf   	__class__r8   r9   r   U   s   




zFastPitchModel.__init__c                 C   s   t  }tt|S rB   )rI   r   createto_yaml)r   rJ   r8   r8   r9    _get_default_text_tokenizer_conf   s   z/FastPitchModel._get_default_text_tokenizer_confc                 C   s   i }d|j v rZ|  r*|j jdd d ur*|j jd dr*t|j jd |j jd< i }d|j jv r=| d|j jj|d< d|j jv rN| d|j jj|d< t	|j jfi ||d< t	|j fi || _
d S )NrG   r.   znemo_text_processing.g2pr/   ztext_tokenizer.g2p.phoneme_dictr0   ztext_tokenizer.g2p.heteronyms)rJ   _is_model_being_restoredrG   rt   
startswithr   register_artifactr/   r0   r   ru   )r   rL   text_tokenizer_kwargs
g2p_kwargsr8   r8   r9   r|      s.   

zFastPitchModel._setup_tokenizerc                 C   sZ   | j d u r*| jd u r| jjd u rd S | jj}| jjD ]}t|tr&|j} nq|| _ | j S rB   )r   logger
experimentrM   loggers
isinstancer	   )r   	tb_loggerr   r8   r8   r9   r      s   

zFastPitchModel.tb_loggerc              	   C   sJ   | j d ur| j S | jr| jj| _ | j S tj| jjddddddd| _ | j S )NenrP   Tr   F)labelsnameunk_idblank_iddo_normalizeabbreviation_version
make_table)r   rN   ru   encoder   make_parserr   r   r   r8   r8   r9   parser   s   

	zFastPitchModel.parserT	str_inputreturnc                 C   s   | j rtd t|tr|j}|r!| jd ur!| j|fi | j}| jrMt	
 }t| jdr5| jjdd}| | |}W d    n1 sGw   Y  n| |}t|d | j}|S )Nz+parse() is meant to be called in eval mode.set_phone_probrX   probr   )trainingr+   warningr   r   textrq   rr   rN   
contextlibnullcontexthasattrru   r   r   torchtensor
unsqueeze_longtodevice)r   r   	normalizeeval_phon_moder~   xr8   r8   r9   parse  s"   


zFastPitchModel.parse)BT_text)r   T_audiooptionalr   )r   DT_spec)r   r   r   r   durspitchenergyspeakerpacespec
attn_priormel_lens
input_lensreference_specreference_spec_lens)input_typesrX   )r   r   r   r   r   r   r   r   r   r   r   c                C   s"   | j |||||||||	|
||dS )Nr   )r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r8   r8   r9   forward$  s   !zFastPitchModel.forwardspect)output_typesr~   ztorch.tensorr   r   r   r   c              	   C   sL   | j rtd t|trt|g| j}| |d d ||||d^}}|S )Nz:generate_spectrogram() is meant to be called in eval mode.)r   r   r   r   r   r   r   )	r   r+   r   r   intr   r   r   r   )r   r~   r   r   r   r   r   _r8   r8   r9   generate_spectrogramT  s   	


	z#FastPitchModel.generate_spectrogramc           (      C   s6  d\}}}}}}| j rT| jdkr|}	nt|| jjj}	|	d}
|	d}|	d}|	d}|	dd }|	dd }|	d	d }|	d
d }|	dd }|	dd }n	|\}
}}}}}}| j|
|d\}}d\}}|d urw| j||d\}}| |||||d| j r|nd |||||d\}}}}}}}}}}}}|d u r|}| j||d}| j	|||d}|| }| j r| j
|||d} t| j| j dd }!| j||d|! }"|| |" 7 }| j|||d}#| j|||d}$||#|$ 7 }| d| | d| | d| | d|# |d ur| d|$ | j r| d|  | d|" | jr| jrt| jtrd| _| jjdt|d  j   | jd!d" |d  j   }%| jjd#t|%| jd!d" | j r|d  j    }&| jjd$t |&j!| jd!d" |d  j    }'| jjd%t |'j!| jd!d" |S )&NNNNNNNrR   audio
audio_lensr   	text_lensalign_prior_matrixr   r   
speaker_idreference_audioreference_audio_lensinput_signallengthNNrX   r   r   r   r   r   r   r   r   r   r   r   r   spect_predicted	spect_tgtlog_durs_predicteddurs_tgtr}   )attn_logprobin_lensout_lens)hard_attentionsoft_attentionpitch_predicted	pitch_tgtr}   energy_predicted
energy_tgtr   t_loss
t_mel_loss
t_dur_losst_pitch_losst_energy_loss
t_ctc_loss
t_bin_lossFtrain_mel_targetr   HWCdataformatstrain_mel_predicted
train_attntrain_soft_attn)"rN   rx   r   	_train_dlrw   sup_data_types_setrt   r   r   r   r   mincurrent_epochrU   r   r   r   logrW   r   r   r   r	   r   	add_imager   datacpur7   numpyglobal_stepsqueezer   T)(r   batch	batch_idxr   r   r   r   r   reference_audio_len
batch_dictr   r   r   r   r   melsspec_lenr   reference_spec_len	mels_predr   log_durs_pred
pitch_pred	attn_softr  	attn_hardattn_hard_durenergy_predr  mel_lossdur_losslossctc_lossbin_loss_weightbin_loss
pitch_lossenergy_lossspec_predictattn	soft_attnr8   r8   r9   training_stepl  s   






zFastPitchModel.training_stepc                  C   s  d\}}}}}}| j rT| jdkr|}	nt|| jjj}	|	d}
|	d}|	d}|	d}|	dd }|	dd }|	d	d }|	d
d }|	dd }|	dd }n	|\}
}}}}}}| j|
|d\}}d\}}|d urw| j||d\}}| |||||d| j r|nd |||||d\}}}}}}}}}}}}|d u r|}| j||d}| j	|||d}| j
|||d}| j|||d}|| | | }|||||d ur|nd |dkr|nd |dkr|nd d}| j| |S )Nr   rR   r   r   r   r   r   r   r   r   r   r   r   r   rX   r   r   r   r  r  r   )val_lossr6  r7  r<  r=  
mel_targetmel_pred)rN   rx   r   r  rw   r  rt   r   r   r   r   r   validation_step_outputsr   ) r   r(  r)  r   r   r   r   r   r*  r+  r   r   r   r   r   r,  r   r   r.  r/  r   r0  r1  r4  r5  r  r6  r7  r<  r=  r8  val_outputsr8   r8   r9   validation_step  s   





	zFastPitchModel.validation_stepc           
         s4   fdd}|d}|d}|d}|d} j d|dd  j d	|dd  j d
|dd  j d|dd  jd d d urK|d} j d|dd  jd  \}}}}}}}	 jrt jtr jjdt	|d j
    jdd |	d j
   }	 jjdt	|	 jdd d _ j  d S )Nc                    s   t  fddjD  S )Nc                    s   g | ]}|  qS r8   r8   ).0r   keyr8   r9   
<listcomp>:  s    zLFastPitchModel.on_validation_epoch_end.<locals>.<lambda>.<locals>.<listcomp>)r   stackrE  meanrI  r   rI  r9   rC   :  s    z8FastPitchModel.on_validation_epoch_end.<locals>.<lambda>rB  r6  r7  r<  T)	sync_distval_mel_lossval_dur_lossval_pitch_lossr   r=  val_energy_lossval_mel_targetr  r  val_mel_predicted)r   rE  valuesrW   r   r   r	   r   r!  r   r"  r#  r7   r$  r%  r   clear)
r   collectrB  r6  r7  r<  r=  r   spec_targetr>  r8   r   r9   on_validation_epoch_end9  s:   z&FastPitchModel.on_validation_epoch_endc                 C   s   t  }t| jdr| j| jj}| t|j| jd}W d    n1 s'w   Y  |j|j	j
| jjd}tjjj|f|j|d|j	S )Nr   rJ   )
world_size)
collate_fnsampler)r   r   r   ru   r   r1   r   rw   get_samplerdataloader_params
batch_sizerM   r[  r   utilsr"  
DataLoaderr\  )r   rL   	phon_moderw   r]  r8   r8   r9   _setup_train_dataloaderZ  s"   z&FastPitchModel._setup_train_dataloaderc                 C   sr   t  }t| jdr| jd}| t|j| jd}W d    n1 s%w   Y  tjj	j
|fd|ji|jS )Nr   g        rZ  r\  )r   r   r   ru   r   r   rw   r   ra  r"  rb  r\  r_  )r   rL   rc  rw   r8   r8   r9   _setup_test_dataloaderj  s   z%FastPitchModel._setup_test_dataloadertrainshuffle_should_ber   c                 C   s  d|vs
t |jtstd| d|vst |jts"td| |r`d|jvrOtd|  d| d t|j d	|j_W d    n1 sIw   Y  n!|jjs_t	d
| d|  d n|jjrpt	d
| d|  d | j
dkrt }t| jdr| jj|dkrd n| jjd}| t|j| j| j| jd}W d    n1 sw   Y  nt|j}tjjj|fd|ji|jS )Nrw   zNo dataset for r_  zNo dataloader_params for shufflez"Shuffle should be set to True for z's zE dataloader but was not found in its config. Manually setting to TrueTzThe z dataloader for z has shuffle set to False!!!z has shuffle set to True!!!rQ   r   valr   )text_normalizerrr   rJ   r\  )r   rw   r
   r{   r_  r+   r   r   rh  errorrx   r   r   r   ru   r   r1   r   rp   rr   r   ra  r"  rb  r\  )r   rL   rg  r   rc  rw   r8   r8   r9   __setup_dataloader_from_configw  sB   



z-FastPitchModel.__setup_dataloader_from_configc                 C   s*   | j dkr| || _d S | || _d S )NrR   )rx   rd  r  -_FastPitchModel__setup_dataloader_from_configr   rL   r8   r8   r9   setup_training_data  s   
z"FastPitchModel.setup_training_datac                 C   s0   | j dkr| || _d S | j|ddd| _d S )NrR   Fri  )rg  r   )rx   re  _validation_dlrm  rn  r8   r8   r9   setup_validation_data  s   
z$FastPitchModel.setup_validation_datac                 C   s   dS )zOmitted.Nr8   rn  r8   r8   r9   setup_test_data  s   zFastPitchModel.setup_test_datac              
   C   s   | j sg S | j jj}|dkrtd| | | j }t| j j}| j jr+t| j jnd }t	||| j j
| j j|| jj| j j| j jd}|gS )NrR   z=Logging callback only supported for TextToSpeechDataset, got )
generatorsdata_loader
log_epochsepoch_frequency
output_dirr   log_tensorboard	log_wandb)rm   rw   r.   r{   re  r   rs  log_dirr   r   ru  rv  rM   r   rx  ry  )r   sample_ds_classrt  rs  rz  log_callbackr8   r8   r9   configure_callbacks  s&   
z"FastPitchModel.configure_callbacksList[PretrainedModelInfo]c                 C   s   g }t ddd| d}|| t ddd| d}|| t dd	d
| d}|| t ddd| d}|| t ddd| d}|| t ddd| d}|| t ddd| d}|| t ddd| d}|| t ddd| d}|| |S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        tts_en_fastpitchzrhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/1.8.1/files/tts_en_fastpitch_align.nemozThis model is trained on LJSpeech sampled at 22050Hz with and can be used to generate female English voices with an American accent. It is ARPABET-based.)pretrained_model_namelocationdescriptionclass_tts_en_fastpitch_ipaz{https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/IPA_1.13.0/files/tts_en_fastpitch_align_ipa.nemozThis model is trained on LJSpeech sampled at 22050Hz with and can be used to generate female English voices with an American accent. It is IPA-based.tts_en_fastpitch_multispeakerzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_multispeaker_fastpitchhifigan/versions/1.10.0/files/tts_en_fastpitch_multispeaker.nemozThis model is trained on HiFITTS sampled at 44100Hz with and can be used to generate male and female English voices with an American accent.3tts_de_fastpitch_singleSpeaker_thorstenNeutral_2102zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_fastpitch_thorstens2102.nemou   This model is trained on a single male speaker data in Thorsten Müller’s German Neutral 21.02 Dataset sampled at 22050Hz and can be used to generate male German voices.3tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_fastpitch_thorstens2210.nemou   This model is trained on a single male speaker data in Thorsten Müller’s German Neutral 22.10 Dataset sampled at 22050Hz and can be used to generate male German voices.tts_de_fastpitch_multispeaker_5zhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_fastpitch_multispeaker_5.nemozThis model is trained on 5 speakers in HUI-Audio-Corpus-German clean subset sampled at 44100Hz with and can be used to generate male and female German voices.tts_es_fastpitch_multispeakerzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.15.0/files/tts_es_fastpitch_multispeaker.nemozThis model is trained on 174 speakers in 6 crowdsourced Latin American Spanish OpenSLR datasets sampled at 44100Hz and can be used to generate male and female Spanish voices with Latin American accents.tts_zh_fastpitch_sfspeechzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_fastpitch_sfspeech.nemoa  This model is trained on a single female speaker in SFSpeech Bilingual Chinese/English dataset sampled at 22050Hz and can be used to generate female Mandarin Chinese voices. It is improved using richer dict and jieba word segmenter for polyphone disambiguation.#tts_en_fastpitch_for_asr_finetuningzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch_spectrogram_enhancer_for_asr_finetuning/versions/1.20.0/files/tts_en_fastpitch_for_asr_finetuning.nemoa  This model is trained on LibriSpeech, train-960 subset. STFT parameters follow those commonly used in ASR: 25 ms window, 10 ms hop. This model is supposed to be used with its companion SpetrogramEnhancer for  ASR fine-tuning. Usage for regular TTS tasks is not advised.)r    r   )clslist_of_modelsmodelr8   r8   r9   list_available_models  s   








	z$FastPitchModel.list_available_modelsc              
      s   t  jdi | | jd rdnd}t|t t|t t|t|ddtdddtdt ddd| _tdt tdt	 tdt	 tdt
 tdt d	| _| jd
 ratdt | jd< d S d S )Nrk   r'  )r   r'  Tr   r   )r   r   r   volumebatch_lengthsr   )r   r   r'  )r   
num_framesdurs_predictedr  r	  rj   volume_alignedr8   )r   _prepare_for_exportr   r*   r(   r&   r"   r   r$   r'   r)   r   )r   kwargstensor_shaper   r8   r9   r  (  s$   




	




z"FastPitchModel._prepare_for_exportc                 C   s   d  | _ | _d S rB   )r   r   r   r8   r8   r9   _export_teardown@  s   zFastPitchModel._export_teardownc                 C   sH   t  }| jjdu r|d | jd s|d | jd s"|d |S )zHImplement this method to return a set of input names disabled for exportNr   rk   r  rj   r  )setr   r   rh   r   )r   disabled_inputsr8   r8   r9   disabled_deployment_input_namesC  s   




z.FastPitchModel.disabled_deployment_input_namesc                 C      | j S rB   )r   r   r8   r8   r9   r   O     zFastPitchModel.input_typesc                 C   r  rB   )r   r   r8   r8   r9   r   S  r  zFastPitchModel.output_typesrg   ,   c                 C   s>   t | j }t| j|j||d}d| jvr|dd |fS )zs
        Generates input examples for tracing etc.
        Returns:
            A tuple of input examples.
        )	max_batchmax_dimrk   r  N)nextr   
parametersr   r   r   pop)r   r  r  parinputsr8   r8   r9   input_exampleW  s
   
zFastPitchModel.input_examplec           	      C   sP   | j d rt||||| jjj|d\}}}}}|d ur|}| jj|||||dS )Nrk   )rT   r  )r   r   r   r  r   )r   r   r   r   rT   infer)	r   r   r   r   r  r  r   volume_tensorlensr8   r8   r9   forward_for_exportc  s   
z!FastPitchModel.forward_for_exportc           
      C   s   | j jdu r
td| j jjj d }||ks ||ks ||kr(td| d| j tj|tjd	 
  }| j tj|tjd	 
  }|| ||  }	|	| j jjj|< dS )al  
        This method performs speaker interpolation between two original speakers the model is trained on.

        Inputs:
            original_speaker_1: Integer speaker ID of first existing speaker in the model
            original_speaker_2: Integer speaker ID of second existing speaker in the model
            weight_speaker_1: Floating point weight associated in to first speaker during weight combination
            weight_speaker_2: Floating point weight associated in to second speaker during weight combination
            new_speaker_id: Integer speaker ID of new interpolated speaker in the model
        NzCurrent FastPitch model is not a multi-speaker FastPitch model. Speaker interpolation can only                 be performed with a multi-speaker modelr   zParameters original_speaker_1, original_speaker_2, new_speaker_id should be less than the total                 total number of speakers FastPitch was trained on (n_speakers = z).)dtype)r   r   	Exceptionweightr"  sizer   r   int32cudaclonedetach)
r   original_speaker_1original_speaker_2weight_speaker_1weight_speaker_2new_speaker_idra   speaker_emb_1speaker_emb_2new_speaker_embr8   r8   r9   interpolate_speakerl  s"   ""z"FastPitchModel.interpolate_speakerrB   )T)NrX   NN)Trf  )r   r~  )rg   r  )NNN)4r2   r3   r4   __doc__r
   r   r   r   r|   propertyr   r   r5   r   r   r   r!   r*   r(   r'   r&   r"   r$   r%   r#   r   r   r   r7   r   rA  rG  rY  rd  re  rH   rm  ro  rq  rr  r}  classmethodr  r  r  r  r   r   r  r  r  __classcell__r8   r8   r   r9   rK   R   s    s"




zS!#d




	rK   )Fr   dataclassesr   r   pathlibr   typingr   r   r   hydra.utilsr   lightning.pytorchr   lightning.pytorch.loggersr	   	omegaconfr
   r   r   +nemo.collections.asr.parts.utils.rnnt_utilsr   +nemo.collections.common.parts.preprocessingr   (nemo.collections.tts.losses.aligner_lossr   r   )nemo.collections.tts.losses.fastpitchlossr   r   r   r    nemo.collections.tts.models.baser   &nemo.collections.tts.modules.fastpitchr   !nemo.collections.tts.parts.mixinsr   *nemo.collections.tts.parts.utils.callbacksr   (nemo.collections.tts.parts.utils.helpersr   r   r   r   r   r   nemo.core.classesr   nemo.core.classes.commonr    r!   nemo.core.neural_types.elementsr"   r#   r$   r%   r&   r'   r(   r)   "nemo.core.neural_types.neural_typer*   
nemo.utilsr+   r,   r-   r:   rI   rK   r8   r8   r8   r9   <module>   s<    (
