o
    }o™iNA  ã                   @   sØ   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZmZ d dlm Z  e G dd„ deƒƒZ!dS )é    N)ÚIterable)Úinstantiate)ÚTrainer)ÚTensorBoardLogger)Ú
DictConfig)ÚDurationLossÚMelLossÚ	PitchLoss)ÚFastPitchSSLModuleÚaverage_features)Úmask_from_lens)Úplot_multipitch_to_numpyÚplot_spectrogram_to_numpy)ÚModelPT)ÚPretrainedModelInfo)ÚloggingÚmodel_utils)Úexperimentalc                       s¼   e Zd ZdZd'dedef‡ fdd„Zdd„ Zed	d
„ ƒZ	ddddddœdd„Z
d(dd„Zdd„ Zdd„ Zdd„ Z						d)dd„Zdd„ Zdd„ Zdd „ Zd!d"„ Zed*d%d&„ƒZ‡  ZS )+ÚFastPitchModel_SSLa*  
    FastPitch based model that can synthesize mel spectrograms from content and speaker embeddings
    obtained from SSLDisentangler. This model can be used for voice conversion by swapping the speaker embedding
    of a given source utterance, with the speaker embedding of a target speaker.
    NÚcfgÚtrainerc                    s¼  t  |¡}t  |¡}d| _d | _d | _tƒ j||d | dd¡| _	d| _
| jr*dnd}|}|}d|v r7|j}d|v r>|j}tƒ | _t|d	| _t|d	| _d }| d
d¡ | _}|rbt| jjƒ| _t| jjƒ}	d }
| dd¡| _| jrƒ| jd us}J dƒ‚t| jjƒ}
| dd¡ | _}|r•t| jjƒ}nd }tj | jj| jj ¡| _!tj | jj"| jj#¡| _$| dd¡| _%| j%dkrÆtj &| j%| jj'¡| _(t)||	|
||j'|j*|j+ƒ| _,i | _-|| j-d< d S )NF)r   r   Úbin_loss_warmup_epochséd   gš™™™™™¹?ç      ð?Údur_loss_scaleÚpitch_loss_scale)Ú
loss_scaleÚuse_encoderÚuse_duration_predictorz:use_encoder must be True if use_duration_predictor is TrueÚpitch_conditioningTÚ
n_datasetsé   Úvocoder).r   Ú#convert_model_config_to_dict_configÚmaybe_update_config_versionÚlearn_alignmentÚ_parserÚ
_tb_loggerÚsuperÚ__init__Úgetr   Úlog_train_imagesr   r   r   Úmel_lossr	   Ú
pitch_lossr   Úduration_lossr   r   Ú_cfgÚencoderÚ
output_fftr   Úduration_predictorr   Úpitch_predictorÚtorchÚnnÚLinearÚcontent_emb_indimÚcontent_emb_outdimÚcontent_projection_layerÚspeaker_emb_indimÚspeaker_emb_outdimÚspeaker_projection_layerÚnum_datasetsÚ	EmbeddingÚsymbols_embedding_dimÚdataset_embedding_layerr
   Úpitch_embedding_kernel_sizeÚn_mel_channelsÚ	fastpitchÚnon_trainable_models)Úselfr   r   r"   r   r   r   Ú	input_fftr   r1   r2   r   r3   ©Ú	__class__© ú]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/fastpitch_ssl.pyr)   )   s`   


ù
zFastPitchModel_SSL.__init__c                 C   s   | j d d u rt d¡ t ¡ . | j d j}t |¡ d¡ tj	¡ |¡}| j d j
|dd }| ¡  ¡ W  d   ƒ S 1 sAw   Y  d S )Nr"   z<Vocoder is none, should be instantiated as a HiFiGAN vocoderr   )Úx)rD   r   Úerrorr4   Úno_gradÚdeviceÚ
from_numpyÚ	unsqueezeÚtoÚfloat32Ú	generatorÚcpuÚnumpy)rE   ÚspectrogramÚvocoder_deviceÚ_specÚwav_generatedrI   rI   rJ   Úvocode_spectrogramn   s   


$üz%FastPitchModel_SSL.vocode_spectrogramc                 C   sd   | j d u r/| jd u r| jjd u rd S | jj}t| jtƒr,| jD ]}t|tƒr+|j} nq|| _ | j S ©N)r'   ÚloggerÚ
experimentÚ
isinstancer   )rE   Ú	tb_loggerr\   rI   rI   rJ   r_   y   s   


þzFastPitchModel_SSL.tb_loggerr   ©Úenc_outÚenc_maskÚdursÚpitchÚpacec                C   s   | j |||||dS )Nr`   )rC   )rE   ra   rb   rc   rd   re   rI   rI   rJ   Úforward‡   s   	ûzFastPitchModel_SSL.forwardc           
      C   s¼   |  ddd¡}|  |¡}|  ddd¡}|  |¡}|d d …d d …d f  dd|jd ¡}tj||gdd}|  ddd¡}| jdkr\|  |¡}|d d …d d d …f  d|jd d¡}	||	 }|S )Nr   é   r!   )Údim)	Úpermuter9   r<   ÚrepeatÚshaper4   Úcatr=   r@   )
rE   Úcontent_embeddingÚspeaker_embeddingÚ
dataset_idÚcontent_embedding_projectedÚspeaker_embedding_projectedÚspeaker_embedding_repeatedÚencodedÚdataset_embeddingÚdataset_embedding_repeatedrI   rI   rJ   Úcompute_encoding˜   s   

ÿ

&z#FastPitchModel_SSL.compute_encodingc                 C   s˜  |d }|d }|d }|d }|d }|d }|d }	|   |||¡}
| jr/| j|
|d\}
}t|ƒ}|d d …d d …d f }| |
||	|d	d
\}}}}}}d}| j||d}||7 }| jro| j||	|d}|  d|¡ ||7 }| jr„| j	|||d}||7 }|  d|¡ |  d|¡ |  d|¡ | j
rÊt| jtƒrÊd| _
| jjdt|d j ¡  ¡  ¡ ƒ| jdd |d j ¡  ¡  ¡ }| jjdt|ƒ| jdd |S )Nrm   Úencoded_lenrn   Úmel_spectrogramÚpitch_contourro   Úduration©ÚinputÚseq_lensr   r`   r   ©Úspect_predictedÚ	spect_tgt©Úlog_durs_predictedÚdurs_tgtÚlenÚ
t_dur_loss©Úpitch_predictedÚ	pitch_tgtr„   Út_pitch_lossÚt_lossÚ
t_mel_lossFÚtrain_mel_targetÚHWC©ÚdataformatsÚtrain_mel_predicted)rv   r   r0   r   r,   r   r.   Úlogr   r-   r+   r^   r\   r   r_   Ú	add_imager   ÚdatarT   ÚfloatrU   Úglobal_step)rE   ÚbatchÚ	batch_idxrm   rw   rn   Úmelsrd   ro   rc   ra   Ú_rb   Ú	mels_predÚlog_durs_predÚ
pitch_predÚlossr,   Údur_lossr-   Úspec_predictrI   rI   rJ   Útraining_step¯   s`   ûüüz FastPitchModel_SSL.training_stepc                 C   sL  |d }|d }|d }|d }|d }|d }|d }	|d }
|   |||	¡}| jr3| j||d	\}}t|ƒ}|d d …d d …d f }| |||
|d
d\}}}}}}| j||d}|||dkr`|nd |dkrg|nd |dkrn|nd |dkru|nd |dkr||nd dœ}| jr| j||
|d}||d< | jr¤| j|||d}||d< || |d< |S )Nrm   rw   rn   rx   Úmel_lenry   ro   rz   r{   r   r`   r~   r   )Úval_lossr,   Ú
mel_targetÚmel_predÚspec_lenÚpitch_targetrœ   r   rž   r†   r-   r¢   )	rv   r   r0   r   r,   r   r.   r   r-   )rE   r–   r—   rm   rw   rn   r˜   r¥   rd   ro   rc   ra   r™   rb   rš   r›   rœ   r,   Úval_outrž   r-   rI   rI   rJ   Úvalidation_stepë   sB   
ÿù
z"FastPitchModel_SSL.validation_stepc                    sØ  ‡ fdd„}|dƒ}|dƒ}|   d|¡ |   d|¡ | jr'|dƒ}|   d|¡ ˆ d	 }|d
 }|d }|d }	|d }
|d }t| jtƒrêt d	|jd	 d ¡}| jj	dt
|| j ¡  ¡  ¡ ƒ| jdd || j ¡  ¡  ¡ }| jj	dt
|ƒ| jdd | jr¡|| j ¡  ¡ }|
| j ¡  ¡ }| jj	dt||ƒ| jdd |	| j ¡  ¡ }|  || j ¡  ¡  ¡ d d …d |…f ¡}| j d|d	 | jd¡ |  |d d …d |…f ¡}| j d|d	 | jd¡ d| _d S d S )Nc                    s   t  ‡ fdd„ˆD ƒ¡ ¡ S )Nc                    s   g | ]}|ˆ  ‘qS rI   rI   )Ú.0rK   ©ÚkeyrI   rJ   Ú
<listcomp>  s    zPFastPitchModel_SSL.on_validation_epoch_end.<locals>.<lambda>.<locals>.<listcomp>)r4   ÚstackÚmeanrª   ©Úoutputsrª   rJ   Ú<lambda>  s    z<FastPitchModel_SSL.on_validation_epoch_end.<locals>.<lambda>r¢   r,   Úv_lossÚ
v_mel_lossr-   Úv_pitch_lossr   r£   r¤   r¥   r¦   rœ   r!   Úval_mel_targetr   rŽ   Úval_mel_predictedÚ	val_pitchz
Real audioi"V  zGenerated AudioT)r‘   r   r^   r\   r   ÚrandomÚrandintrk   r_   r’   r   r“   rT   r”   rU   r•   r   ÚitemrZ   Ú	add_audior+   )rE   r°   Úcollectr¢   r,   r-   Úsingle_outputÚspec_targetrŸ   r¥   r¦   rœ   Ú	_rand_idxÚ_pitch_predÚ_pitch_targetÚ	_spec_lenÚwav_vocodedrI   r¯   rJ   Úon_validation_epoch_end  sZ   üüü,
ßz*FastPitchModel_SSL.on_validation_epoch_endFr   c	                 C   sv  |  ¡ \}	}
}|du rt |	¡|  ¡  | j¡}t |	¡|  ¡  | j¡}| j|||d}| jr9| j||d\}}
t	|ƒ}|rBd}n|durI|}n| j
 dd¡}t |¡| }|dd…dd…df }|durŽ|dkrŽ|durzt| d¡|ƒ d¡}n|durŠt| d¡|ƒ d¡}ntdƒ‚d}| ||||d	d
^}}
g }t|	ƒD ]}|| j ¡  ¡  ¡ }|  |¡}| |¡ q¡|S )a  
        Args:
            content_embedding : Content embedding from SSL backbone (B, C, T)
            speaker_embedding : Speaker embedding from SSL backbone (B, C)
            pitch_contour : Normalized Pitch contour derived from the mel spectrogram
            encoded_len: Length of each content embedding, optional if batch size is 1.
            compute_pitch: if true, predict pitch contour from content and speaker embedding.
            compute_duration: if true, predict duration from content and speaker embedding.
            durs_gt: Ground truth duration of each content embedding, ignored if compute_duration is True.
            dataset_id: Dataset id if training is conditioned on multiple datasets
        Returns:
            List of waveforms
        N)ro   r{   Ússl_downsampling_factoré   Fr!   z durs or durs_gt must be providedr   r`   )Úsizer4   ÚonesÚlongrQ   rN   rv   r   r0   r   r/   r*   Ú	ones_liker   rP   ÚsqueezeÚ
ValueErrorÚranger“   rT   r”   rU   rZ   Úappend)rE   rm   rn   rw   ry   Úcompute_pitchÚcompute_durationÚdurs_gtro   Ú_bsr™   Ú_n_timera   rb   rc   rÅ   rd   rš   ÚwavsÚidxr¤   ÚwavrI   rI   rJ   Úgenerate_wavM  s:   
zFastPitchModel_SSL.generate_wavc                 C   s(   t |jƒ}tjjj|fd|ji|j¤ŽS )NÚ
collate_fn)r   Údatasetr4   Úutilsr“   Ú
DataLoaderÚpad_collate_fnÚdataloader_params)rE   r   rÙ   rI   rI   rJ   Ú__setup_dataloader_from_configŽ  s   
z1FastPitchModel_SSL.__setup_dataloader_from_configc                 C   ó   |   |¡| _d S r[   )Ú1_FastPitchModel_SSL__setup_dataloader_from_configÚ	_train_dl©rE   r   rI   rI   rJ   Úsetup_training_data“  ó   z&FastPitchModel_SSL.setup_training_datac                 C   rß   r[   )rà   Ú_validation_dlrâ   rI   rI   rJ   Úsetup_validation_data–  rä   z(FastPitchModel_SSL.setup_validation_datac                 C   s   dS )zOmitted.NrI   râ   rI   rI   rJ   Úsetup_test_data™  ó   z"FastPitchModel_SSL.setup_test_dataÚreturnúList[PretrainedModelInfo]c                 C   s   g S r[   rI   )ÚclsrI   rI   rJ   Úlist_available_models  rè   z(FastPitchModel_SSL.list_available_models)NNr[   )NNFFNr   )ré   rê   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r)   rZ   Úpropertyr_   rf   rv   r    r¨   rÄ   r×   rà   rã   ræ   rç   Úclassmethodrì   Ú__classcell__rI   rI   rG   rJ   r   !   s:    E
ù
<,:
÷Ar   )"r¸   Útypingr   r4   Úhydra.utilsr   Úlightning.pytorchr   Úlightning.pytorch.loggersr   Ú	omegaconfr   Ú)nemo.collections.tts.losses.fastpitchlossr   r   r	   Ú&nemo.collections.tts.modules.fastpitchr
   r   Ú(nemo.collections.tts.modules.transformerr   Ú(nemo.collections.tts.parts.utils.helpersr   r   Únemo.core.classesr   Únemo.core.classes.commonr   Ú
nemo.utilsr   r   Únemo.utils.decoratorsr   r   rI   rI   rI   rJ   Ú<module>   s"   