o
    }oiZ                     @   s   d dl Z d dlmZmZ d dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ e$G dd deZ%dS )    N)IterableOptional)instantiate)Trainer)TensorBoardLogger)CombinedLoader)
DictConfig	OmegaConf)AngularSoftmaxLoss)
TTSDataset)GreedyCTCDecoder)BaseTokenizerEnglishCharsTokenizer)ModelPT)PretrainedModelInfo)WarmupPolicy)logging)experimentalc                       s   e Zd ZdZddedef fddZedee	 fdd	Z
ed
d Zdd Zdd Zdd Zdd Zd ddZd ddZdd Zdd Zdd Z  ZS )!SSLDisentanglera  
    SSLDisentangler is a Conformer based model for extracting disentangled content and speaker embeddings
    from an audio waveform. This model uses a pre-trained Conformer SSL model. To extract the linguistic content
    and speaker representations using a pre-trained Conformer, two randomly initialized downstream
    heads are added and the entire setup is finetuned in multi-task manner for speech recognition and speaker verification.
    These representations can be used by FastPitchModel_SSL for voice conversion by swapping the speaker embedding
    of a given source utterance, with the speaker embedding of a target speaker.
    Ncfgtrainerc                    s  t  j||d t| jj| _t| jj| _tdd| _	d | _
tj | _| jjjD ]}|dkr[| jjj}| jjj}| jjj}tj||| j|< tj||| _tddd| _q,|dkr| jjj}| jjj}t| j	j}tj||| j|< tj||| _tjj| j	jd	d
| _| jdd| _ | jdd| _!| jdd| _"| jdd| _#| j#o| j!dksJ dtj$ | _%t&| j	j| j	j| _'q,t(| dd| _)| jj}t*j+j,|j-|j.|j/ddd}	tj0|	tj1d2d}
| 3d|
 d S )N)r   r   lastadd_blank_atspeaker_verification   g?)scalemargincontentT)blankzero_infinitypitch_augmentFaugment_ctcaug_loss_typemsestop_gradientz=stop_gradient and augment_ctc cannot be true at the same timeC is not a valid task. Task must be speaker_verification or content.r   i@  )srn_fftn_melsfminfmax)dtypefb)4super__init__r   from_config_dict_cfgpreprocessorpreprocessor_disentanglerencoderr   _text_tokenizer
_tb_loggertorchnn
ModuleDictdownstream_netsdownstream_heads
task_namesd_modelspeaker_embed_sizenum_speakersLinear	sv_linearr
   sv_losscontent_embed_sizelentokenscontent_linearCTCLossr   ctc_lossgetr!   r"   r#   r%   MSELossmse_lossr   ctc_decoder
ValueErrorautomatic_optimizationlibrosafiltersmelsample_rater(   featurestensorfloat	unsqueezeregister_buffer)selfr   r   taskin_dimout_dimr?   	num_charsstft_cfglibrosa_mel_filterr-   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/ssl_tts.pyr/   0   sZ   





zSSLDisentangler.__init__returnc                 C   s8   g }t dddd}|| t dddd}|| |S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        ssl_en_conformer_largezsFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:ssl_en_conformer_largezyhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/ssl_en_conformer_large/versions/1.10.1/files/ssl_en_conformer_large.nemo)pretrained_model_namedescriptionlocationssl_en_conformer_xlargeztFor details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:ssl_en_conformer_xlargez{https://api.ngc.nvidia.com/v2/models/nvidia/nemo/ssl_en_conformer_xlarge/versions/1.10.0/files/ssl_en_conformer_xlarge.nemo)r   append)clsresultsmodelra   ra   rb   list_available_modelsf   s   

z%SSLDisentangler.list_available_modelsc                 C   sd   | j d u r/| jd u r| jjd u rd S | jj}t| jtr,| jD ]}t|tr+|j} nq|| _ | j S N)r6   logger
experiment
isinstancer   r   )rX   	tb_loggerro   ra   ra   rb   rr      s   


zSSLDisentangler.tb_loggerc           	      C   sj  t | drt| jtr| j}nt | drt| jtstd tdd }| _| jjj	D ]}|dkret
|d | jj||d dg|d	 |d
dd}tjjj||d |j|d |dd|ddd}q,|dkrt
|d | jj||d |d |dd|dd|d	 |d
dd	}tjjj||d |j|d |dd|ddd}q,t| d||d}|S ) Nr5   zStest_tokenizer is set but not a BaseTokenizer. Will be set to EnglishCharsTokenizerr   r   r    manifest_speaker_verification_fpsegment_max_duration
speaker_idsup_data_pathpad_multiple   )manifest_filepathrR   text_tokenizerrt   sup_data_typesrv   rw   batch_size_svshufflenum_workers_svr   
pin_memoryF)
batch_size
collate_fnr}   num_workersr   r   manifest_content_fpmin_duration_contentmax_duration_contentr!   cache_pitch_augmentT)	ry   rR   rz   min_durationmax_durationr!   r   rv   rw   batch_size_contentnum_workers_contentr&   )svr   )hasattrrq   r5   r   r   warningr   r1   r;   r<   r   rR   rI   r7   utilsdata
DataLoadergeneral_collate_fnrM   )	rX   data_configr5   rY   
sv_dataset	sv_loadercontent_datasetcontent_loaderloadersra   ra   rb   __setup_dataloader_from_config   s^   

	

	






z.SSLDisentangler.__setup_dataloader_from_configc                 C   s   |  | jj| _d S rn   )._SSLDisentangler__setup_dataloader_from_configr1   train_ds	_train_dlrX   r   ra   ra   rb   setup_training_data   s   z#SSLDisentangler.setup_training_datac                 C   s   t | | jj| _d S rn   )r   r   r1   validation_ds_validation_dlr   ra   ra   rb   setup_validation_data   s   z%SSLDisentangler.setup_validation_datac              	   C   s  | j j }| j j }t|d |dd }t|d t|d |dd }t|d t|| j	 d}t|t
| j	 | j	 | j	 | j	 d}|d ur|d urt|d |j|jd}|dd}t|d |j|jd}	|	dd}
||g||
gfS ||gS )NFschedT)params)	optimizer	max_stepsmin_lrwarmup_stepsstep)	schedulerinterval)r1   optim_backbonecopyoptim_downstreamr	   
set_structpopr   r4   
parameters	itertoolschainr:   rA   rF   rB   r   r   r   )rX   optim_backbone_configoptim_downstream_configsched_backbone_configsched_downstream_configr   r   scheduler_backbone	sch1_dictscheduler_downstream	sch2_dictra   ra   rb   configure_optimizers   sT   
z$SSLDisentangler.configure_optimizersTc                 C   s  | j ||d\}}| j||d\}}| jjjD ]c}|dkr@| jd |d d d d df }	tj|	dddd}
|	|
 }| |}q|d	krt|	ddd
}| jd	 |}|ratj|dddd}|| }| 
|}|jdd}|	d
dd}qt| d|||||fS )N)input_signallength)audio_signalr   r   r      T)pdimkeepdimr   rx   r   r&   )r3   r4   r1   r;   r<   r:   r7   normrA   permuterF   log_softmaxrM   )rX   r   input_signal_lengthnormalize_contentprocessed_signalprocessed_signal_lengthencodedencoded_lenrY   speaker_embeddingl2_normspeaker_embedding_normalizedspeaker_logitsencoded_btccontent_embeddingl2_norm_contentcontent_logitscontent_log_probsra   ra   rb   forward  s6   
 
zSSLDisentangler.forwardc                 C   s   | j |||dS )N)r   r   r   )r   )rX   r   r   r   ra   ra   rb   forward_for_export*  s
   z"SSLDisentangler.forward_for_exportc           !   	   C   s  d}|   \}}|  }| D ]{}|dkr|| d }|| d }	|| d }
| j||	d\}}}}}tj|dd}| j||
d	}||7 }| jjs`|	  |	  | 
| |  |  ||
j|  }|t|
 d
 }| d|  | d| q|dkrd}|| d }|| d }	|| d }|| d }| j||	d\}}}}}| ||||}t|r| d|  ||7 }ntd | jr_|| d }| jrt  | j||	d\}}}}}W d    n1 sw   Y  n| j||	d\}}}}}| jdkr| ||}n| jdkr+tjjj||dd }d| }|| jj | 7 }| d|  | j!r_| ||||}t|rZ||7 }| d|  ntd ||7 }| jjs}|	  |	  | 
| |  |  t"|tj#r| d|  q| jjr|	  |	  | 
| |  |  |d ur|\}} |  |   | j$j%d dkr| d|j&d d  | d|j&d d  | d | d S d S )!Ng        r   audio
audio_lensru   r   r   rx   r   logitslabelsd   	t_sv_losst_sv_accuracyr   r   text	text_lens
t_ctc_losszctc_loss is not finiteaudio_shiftedr$   cosiner         ?
t_sim_losst_ctc_loss_augzCctc_loss_aug is not finite. Add min duration to avoid getting here.t_content_loss
   lr_backbonelrlr_downstreamt_loss)'
optimizerslr_schedulerskeysr   r7   argmaxrB   r1   combined_loss	zero_gradmanual_backwardr   eqr   view_assumitemrD   logrH   isfiniter   r   r!   r%   no_gradr#   rK   r8   
functionalcosine_similaritymeanaugment_sim_alphar"   rq   Tensorr   global_stepparam_groups)!rX   batch	batch_idxlossr   r   
schedulerskeysignal
signal_lenru   	sv_logitssv_emb_pred_speakerrB   correctacccontent_losstarget
target_lenr   r   r   rH   augmented_signalcontent_embedding_augcontent_log_probs_augsim_lossr   ctc_loss_augsch1sch2ra   ra   rb   training_step3  s   











zSSLDisentangler.training_stepc           $         s  d}|  D ]?}|dkrW|| d }|| d }|| d }| j||d\}}	}
}
}
tj|dd}| j||d	}||7 }||j| 	 }|t
| d
 }t|}|dkrFd}|| d }|| d }|| d }|| d }| j||d\}
}
}}}| ||||}t|r||7 }ntd | jr|| d }| j||d\}
}
}}}
| jdkr| ||}n| jdkrtjjj||dd }d| }|| jj| 7 }||7 }g }t|jd D ]a}|d d |d d f d ||   }|| d ||   }| |\}
} | j  j fdd|  D }!t!"| |!}"t#t
| t
|!dkr>d|" t#t
| t
|! }#nd}#|$|# qq| | | | | t%|  dS )Nr   r   r   r   ru   r   rx   r   r   r   r   r   r   z?ctc_loss is not finite. Add min duration to avoid getting here.r   r$   r   r   r   c                 3   s    | ]} j | V  qd S rn   )	_id2token).0t	tokenizerra   rb   	<genexpr>  s    z2SSLDisentangler.validation_step.<locals>.<genexpr>)val_lossrB   rH   r  accuracy_svcer)&r   r   r7   r   rB   r   r   r   r   r   rD   	as_tensorrH   r   r   r   r!   r#   rK   r8   r   r   r   r1   r   rangeshapecpurL   r5   sepjointolisteditdistanceevalmaxri   rT   )$rX   r  r  
loss_totalr  r	  r
  ru   r  r  r  r  rB   r  r  acc_valr  r  r  r   r   r   rH   r  r  r  r  r   cers_idxitem_log_probitem_targetpredicted_str
target_strednormalized_edra   r  rb   validation_step  s|   






&zSSLDisentangler.validation_stepc           	         s    fdd}|d}|d}|d}|d}|d}|d}|  d| |  d| |  d	| |  d
| |  d| |  d| d S )Nc                    s   t  fddD  S )Nc                    s"   g | ]}t |  r|  qS ra   )r7   r   )r  xr  ra   rb   
<listcomp>  s   " zMSSLDisentangler.on_validation_epoch_end.<locals>.<lambda>.<locals>.<listcomp>)r7   stackr   r;  outputsr;  rb   <lambda>  s    z9SSLDisentangler.on_validation_epoch_end.<locals>.<lambda>r"  rB   rH   r  r#  r$  val_ctc_lossval_content_loss)r   )	rX   r?  collectr"  val_sv_lossrA  rB  r#  r$  ra   r>  rb   on_validation_epoch_end  s   z'SSLDisentangler.on_validation_epoch_endrn   )NNT)__name__
__module____qualname____doc__r   r   r/   classmethodr   r   rm   propertyrr   r   r   r   r   r   r   r  r9  rE  __classcell__ra   ra   r_   rb   r   %   s     	6
:
6
&	mLr   )&r   typingr   r   r,  rO   r7   hydra.utilsr   lightning.pytorchr   lightning.pytorch.loggersr   +lightning.pytorch.utilities.combined_loaderr   	omegaconfr   r	   'nemo.collections.asr.losses.angularlossr
   !nemo.collections.tts.data.datasetr   $nemo.collections.tts.modules.ssl_ttsr   )nemo.collections.tts.torch.tts_tokenizersr   r   nemo.core.classesr   nemo.core.classes.commonr   nemo.core.optim.lr_schedulerr   
nemo.utilsr   nemo.utils.decoratorsr   r   ra   ra   ra   rb   <module>   s*   