o
    wiݍ                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZ d dlZd dl	m
  mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZmZmZmZmZmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7m8Z8 d dl9m:Z: zd dl;Z;dZ<W n e=y   dZ<Y nw e:G dd de(Z>dS )    N)ceil)Path)ListTuple)	rearrange)instantiate)Trainer)
DictConfig	OmegaConf	open_dict)FeatureMatchingLossMultiResolutionMelLossMultiResolutionSTFTLossRelativeFeatureMatchingLoss	SISDRLossTimeDomainLoss)ResNetSpeakerEncoder)GaussianDropout)LoggingCallback)get_batch_sizeget_num_workers)ModelPT)PretrainedModelInfo	typecheck)AudioSignalEncodedRepresentationLengthsType
TokenIndex)
NeuralType)compute_max_stepsprepare_lr_scheduler)loggingmodel_utils)experimentalTFc                	       s  e Zd ZdVdedef fddZdW fdd		ZdX fdd	ZdYddZe	e
de e
ede de
de e
ede dddejdejdeejejf fddZe	e
de e
ede de
de e
ede dddejdejdeejejf fddZe	e
de e
ede dde
d e idd!ejd"ejdejfd#d$Ze	e
d e e
ede d%d&e
de iddejd'ejdejfd(d)Ze	e
de e
ede de
d e e
ede d%ddejdejdeejejf fd*d+Ze	e
d e e
ede d%e
de e
ede dddejd'ejdeejejf fd,d-Ze	e
de e
ede de
de e
ede d.ddejdejdeejejf fd/d0Zd1d2 Zd3d4 Zedefd5d6Zdefd7d8Z d9d: Z!d;d< Z"d=d> Z#d?d@ Z$dAdB Z%dCdD Z&dEdF Z'dGdH Z(dIdJ Z)edKdL Z*dMdN Z+dZdPdQZ,dRdS Z-e.de/e0 fdTdUZ1  Z2S )[AudioCodecModelNcfgtrainerc           
         s  t |}t |}d| _|d ur|j|j | _t j||d |j| _|j	| _	|
dd| _|
dd| _| j| jkrItd| j d| j dt|j| _|
dd	}|r^t|d
| _nd | _d|v rt|j| _t| jj }t|dkr|d dkrd| _td nd| _td ntd d | _t|j| _t|j| _|j}|
d}|
dd}|
dd| _|
dd| _t | j|||d| _!|
dd}|
dd	| _"t#||d| _$|
dd| _%|
dd	| _&t' | _(t) | _*|
dd| _+|
d d| _,t|j-| _.t|j/| _0|
d!d"}	|	d"krt1 | _2n|	d#kr)t3 | _2ntd$|	 d%| jr=|
d&d| _4nd	| _4| j4d'krN| jsNtd(|
d)d| _5|
d*d| _6| j5rut7 | _8| j8j9d+dd, | j8:  t;d- d| _<d| _=|
d.d | _>d | _?d| _@d S )/N   )r%   r&   disc_updates_per_perioddisc_update_periodz!Number of discriminator updates (z=) per period must be less or equal to the configured period ()encoder_noise_stdev        )stdevvector_quantizer   commit_lossTz&Vector quantizer supports commit loss.Fz.Vector quantizer does not support commit loss.z"Vector quantizer will not be used.mel_loss_dimsmel_loss_log_guardg      ?mel_loss_l1_scalemel_loss_l2_scale)sample_ratemel_dimsresolutions	log_guardstft_loss_log_guardstft_loss_scale)r8   r9   time_domain_loss_scalesi_sdr_loss_scalegen_loss_scalefeature_loss_scalefeature_loss_typerelativeabsolutezUnknown feature loss type .commit_loss_scaler   z=Commit loss is enabled but the quantizer does not support it.use_scl_lossscl_loss_scalezThttps://huggingface.co/Edresson/Speaker_Encoder_H_ASP/resolve/main/pytorch_model.binstrictz$Speaker encoder loaded and frozen !!
log_config)Ar"   #convert_model_config_to_dict_configmaybe_update_config_version
world_size	num_nodesnum_devicessuper__init__r6   samples_per_framegetr(   r)   
ValueErrorr   audio_encoderr   encoder_noiser.   listoutput_typeskeyslen vector_quantizer_has_commit_lossr!   infowarningaudio_decoderdiscriminatorloss_resolutionsr4   r5   r   mel_loss_fnr;   r   stft_loss_fnr<   r=   r   time_domain_loss_fnr   si_sdr_loss_fnr>   r?   generator_lossgen_loss_fndiscriminator_lossdisc_loss_fnr   feature_loss_fnr   rD   rE   rF   r   speaker_encoderload_checkpointfreezeprintuse_asr_consitency_lossacl_loss_scalerI   lr_schedule_intervalautomatic_optimization)
selfr%   r&   r+   vq_output_typesr_   r2   r3   r:   r@   	__class__ d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/models/audio_codec.pyrP   9   s   










zAudioCodecModel.__init__ Fc                    sd   t | dr
| jr
i S t |||}t| D ]}| jr$d|v r$||= d|v r/d|v r/||= q|S )N_no_state_dictspeaker_encoder.r^   .slm_model.ssl_model.)hasattrrx   rO   
state_dictrV   rX   rE   )rq   destinationprefix	keep_varsr|   keyrs   ru   rv   r|      s   zAudioCodecModel.state_dictTc                    sP   t | D ]}| jrd|v r||= d|v rd|v r||= qt j|dd d S )Nry   r^   rz   FrG   )rV   rX   rE   rO   load_state_dict)rq   r|   rH   r   rs   ru   rv   r      s   zAudioCodecModel.load_state_dictc                 C   s   |s<t  , trtj|| j| jjd }n	t	
d td| j|ddd}W d    |S 1 s5w   Y  |S trLtj|| j| jjd }n	t	
d td| j|ddd}|S )Nr6   zCould not import torchaudio!zAtorchaudio is not installed but is necessary to audio resample !!T)l2_normr0   )torchno_gradHAVE_TORCHAUDIO
torchaudio
functionalresampler6   ri   audio_configr!   errorModuleNotFoundError	unsqueeze)rq   audiorequires_gradaudio_resampledgru   ru   rv   get_speaker_embedding   s*   



z%AudioCodecModel.get_speaker_embedding)BT_audior   r   	audio_len)r   D	T_encodedencodedencoded_len)input_typesrW   r   r   returnc                 C   s*   |  ||\}}| j||d\}}||fS )a{  Apply encoder on the input audio signal. Input will be padded with zeros so
        the last frame has full `self.samples_per_frame` samples.

        Args:
            audio: input time-domain signal
            audio_len: valid length for each example in the batch

        Returns:
            Encoder output `encoded` and its length in number of frames `encoded_len`
        r   )	pad_audiorT   )rq   r   r   r   r   ru   ru   rv   encode_audio   s   zAudioCodecModel.encode_audioinputs	input_lenr   r   c                 C   s   | j ||d\}}||fS )a  Apply decoder on the input. Note that the input is a non-quantized encoder output or a dequantized representation.

        Args:
            inputs: encoded signal
            input_len: valid length for each example in the batch

        Returns:
            Decoded output `audio` in the time domain and its length in number of samples `audio_len`.
            Note that `audio_len` will be a multiple of `self.samples_per_frame`.
        r   )r]   )rq   r   r   r   r   ru   ru   rv   decode_audio  s   zAudioCodecModel.decode_audiotokens)r   Cr   r   r   c                 C   s,   | j std| j j||d}t|d}|S )aQ  Quantize the continuous encoded representation into a discrete
        representation for each frame.

        Args:
            encoded: encoded signal representation
            encoded_len: valid length of the encoded representation in frames

        Returns:
            A tensor of tokens for each codebook for each frame.
        z!Cannot quantize without quantizerr   zC B T -> B C T)r.   rS   encoder   )rq   r   r   r   ru   ru   rv   quantize   s
   
zAudioCodecModel.quantizer   
tokens_lendequantizedr   c                 C   s,   | j stdt|d}| j j||d}|S )aO  Convert the discrete tokens into a continuous encoded representation.

        Args:
            tokens: discrete tokens for each codebook for each time frame
            tokens_len: valid length of each example in the batch

        Returns:
            Continuous encoded representation of the discrete input representation.
        z#Cannot dequantize without quantizerzB C T -> C B T)indicesr   )r.   rS   r   decode)rq   r   r   r   ru   ru   rv   
dequantize;  s
   
zAudioCodecModel.dequantizec                 C   s(   | j ||d\}}| j||d}||fS )a  Convert input time-domain audio signal into a discrete representation (tokens).

        Args:
            audio: input time-domain signal, shape `(batch, number of samples)`
            audio_len: valid length for each example in the batch, shape `(batch size,)`

        Returns:
            Tokens for each codebook for each frame, shape `(batch, number of codebooks, number of frames)`,
            and the corresponding valid lengths, shape `(batch,)`
        r   r   )r   r   )rq   r   r   r   r   r   ru   ru   rv   r   V  s   zAudioCodecModel.encodec                 C   s(   | j ||d}| j||d\}}||fS )a  Convert discrete tokens into a continuous time-domain signal.

        Args:
            tokens: discrete tokens for each codebook for each time frame, shape `(batch, number of codebooks, number of frames)`
            tokens_len: valid lengths, shape `(batch,)`

        Returns:
            Decoded output `audio` in the time domain and its length in number of samples `audio_len`.
            Note that `audio_len` will be a multiple of `self.samples_per_frame`.
        r   r   )r   r   )rq   r   r   r   r   r   ru   ru   rv   r   q  s   zAudioCodecModel.decode)output_audiooutput_audio_lenc                 C   sZ   | j ||d\}}| jr | j||d}| j||d\}}||fS | j||d\}}||fS )aN  Apply encoder, quantizer, decoder on the input time-domain signal.

        Args:
            audio: input time-domain signal
            audio_len: valid length for each example in the batch

        Returns:
            Reconstructed time-domain signal `output_audio` and its length in number of samples `output_audio_len`.
        r   r   r   r   )r   r.   r   r   r   )rq   r   r   r   r   r   r   r   ru   ru   rv   forward  s   zAudioCodecModel.forwardc                 C   sL   | j t|| j    }|  }||jd  }t|d|f}||fS )a  Zero pad the end of the audio so that we do not have a partial end frame.
        The output will be zero-padded to have an integer number of frames of
        length `self.samples_per_frame`.

        Args:
            audio: input time-domain signal
            audio_len: valid length for each example in the batch

        Returns:
            Padded time-domain signal `padded_audio` and its length `padded_len`.
        r'   r   )	rQ   r   r   intmaxitemshapeFpad)rq   r   r   
padded_lenmax_lennum_paddingpadded_audioru   ru   rv   r     s
   zAudioCodecModel.pad_audioc           	      C   s   | d}| d}| ||\}}| j||d\}}| jd ur%| |}| jrB| jr6| j||d\}}}n| j||d\}}d}nd}| j||d\}}||||fS )Nr   
audio_lensr   r   r,   )rR   r   rT   rU   r.   rZ   r]   )	rq   batchr   r   r   r   _r1   	audio_genru   ru   rv   _process_batch  s   



zAudioCodecModel._process_batchc                 C   s   | j | j S )z*Probability of updating the discriminator.)r(   r)   rq   ru   ru   rv   disc_update_prob  s   z AudioCodecModel.disc_update_probc                 C   s   || j  }|| jk S )zDecide whether to update the descriminator based
        on the batch index and configured discriminator update period.
        )r)   r(   )rq   	batch_idxdisc_update_stepru   ru   rv   should_update_disc  s   

z"AudioCodecModel.should_update_discc           "      C   s2  |   \}}| |\}}}}| j|jd d d}	| |rD| j|| d\}
}}}| j|
|d}||	d< |  | 	| |
  g }| j|||d\}}| jr_||	d< || j|  | jrn||	d	< || j|  | jr| j|||d}||	d
< || j|  | jr| j|||d}||	d< || j|  | jr| j|||d}||	d< || j|  | j||d\}}}}| jr| j|d}||	d< || j|  | jr| j||d}||	d< || j|  | jr||	d< || j|  | jr6tj|d|dfdd}| j|dd}tj|ddd\}}dtj j!"||#  | j$ }||	d< ||	d  | j%rptj|d|dfdd}| &|\}}tj|ddd\}}tj j!'||| j( } | |	d< ||	d  t)|}!|  | 	|! |
  | *  | j+|	ddd | j,d|dddd d S )Nr   lr)global_stepr   )
audio_realr   )disc_scores_realdisc_scores_gend_lossr   r   r   g_loss_mel_l1g_loss_mel_l2g_loss_stftg_loss_time_domaing_loss_si_sdr)r   
g_loss_gen)
fmaps_real	fmaps_geng_loss_featureg_loss_commitr'   dimTr      r0   
g_loss_scl
g_loss_acl)on_step	sync_distt_lossF)prog_barloggerr   )-
optimizersr   r   param_groupsr   r^   detachrg   	zero_gradmanual_backwardstepr`   r4   appendr5   r;   ra   r<   rb   r=   rc   r>   re   r?   rh   rD   rE   r   catsqueezer   chunknnr   cosine_similaritymeanrF   rm   phoneme_asr_modelmse_lossrn   sum	update_lrlog_dictlog)"rq   r   r   	optim_gen
optim_discr   r   r   r1   metricsr   r   r   	loss_discgenerator_lossesloss_mel_l1loss_mel_l2	loss_stftloss_time_domainloss_si_sdrr   r   loss_genloss_featureaudios_batch	pred_embs
gt_spk_embsyn_spk_embloss_scllogits	logits_gtlogits_predloss_aclloss_gen_allru   ru   rv   training_step  s   


zAudioCodecModel.training_stepc                 C   s   |  d d S )Nepoch)r   r   ru   ru   rv   on_train_epoch_endO  s   z"AudioCodecModel.on_train_epoch_endc                 C   s  |  |\}}}}| j|||d\}}| j|||d}	| j|||d}
| j|||d}||	 |
 }||||	|
|d}| jrztj|d|dfdd}| j	|dd}tj
|ddd\}}d	tjj||  | j }||d
< |d  |d
 7  < | jrtj|d|dfdd}| |\}}tj
|ddd\}}tjj||| j }||d< |d  |d 7  < | j|ddd d S )Nr   )val_lossval_loss_mel_l1val_loss_mel_l2val_loss_stftval_loss_time_domainval_loss_si_sdrr'   r   r   Tr   r   r0   val_loss_sclr  val_loss_acl)on_epochr   )r   r`   ra   rb   rc   rE   r   r   r   r   r   r   r   r   r   rF   rm   r   r   rn   r   )rq   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r  r  r  r  r  r  r  ru   ru   rv   validation_stepR  s8   	zAudioCodecModel.validation_stepc                 C   s   t | |jdd}W d    n1 sw   Y  |r?t | | j|j_| j|j_d|j_W d    n1 s:w   Y  t|j}|j|jj	| j
jd}||fS )N
is_shardedFz>nemo.collections.tts.data.vocoder_dataset.TarredVocoderDataset)rL   )r   datasetpopglobal_rankrL   _target_r   get_samplerdataloader_params
batch_sizer&   )rq   r%   r  r  samplerru   ru   rv   get_dataset  s   





zAudioCodecModel.get_datasetc                 C   s2   |  |\}}tjjj|f|j|d|j}|S )N)
collate_fnr  )r  r   utilsdata
DataLoaderr   r  )rq   r%   r  r  data_loaderru   ru   rv   _setup_train_dataloader  s   z'AudioCodecModel._setup_train_dataloaderc                 C   s,   t |j}tjjj|fd|ji|j}|S )Nr   )r   r  r   r!  r"  r#  r   r  )rq   r%   r  r$  ru   ru   rv   _setup_test_dataloader  s   
z&AudioCodecModel._setup_test_dataloaderc                 C   s   |  || _|d d }| jd urPt| jdrRt| jjtjjjrT| j	d urDt| j	j
trDt| j	j
tt| jj| j |  | j	_
d S | j	d u rVtd d S d S d S d S d S )Nr  r  r  zModel Trainer was not set before constructing the dataset, incorrect number of training batches will be used. Please set the trainer and rebuild the dataset.)r%  	_train_dlr{   
isinstancer  r   r!  r"  IterableDataset_trainerlimit_train_batchesfloatr   r   rY   rL   r!   r\   )rq   r%   r  ru   ru   rv   setup_training_data  s&   


z#AudioCodecModel.setup_training_datac                 C   s   |  || _d S N)r&  _validation_dlrq   r%   ru   ru   rv   setup_validation_data  s   z%AudioCodecModel.setup_validation_datac                 C   s   d S r.  ru   r0  ru   ru   rv   setup_test_data  s   zAudioCodecModel.setup_test_datac              	   C   s|   d| j v r| j dS d| j vrtdd| j v r!| j j| j j S t| j j| jj| jjt	| jt
| jjt| j| jjdS )N	max_steps
max_epochsz)Must specify 'max_steps' or 'max_epochs'.steps_per_epoch)r4  accumulate_grad_batchesr+  num_workersnum_samplesr  	drop_last)_cfgrR   rS   r4  r5  r   r&   r6  r+  r   rY   r'  r  r   r9  r   ru   ru   rv   r3    s   



zAudioCodecModel.max_stepsc                 C   s,  | j j }t|d |dd }t|d | jr | j ng }| j	r*| j
 ng }| jr4| j ng }t| j | j |||}t||d}| j }t||d}	|d u rctd ||	gS td t|d | j|d< t|d t||| jd}
t|	|| jd}|
d	 | _||	g|
|gfS )
NFschedT)paramszScheduler is not usedzSetting up schedulersr3  )	optimizerscheduler_configtrain_dataloaderinterval)r:  optimcopyr
   
set_structr  rm   r   
parametersrE   ri   r.   	itertoolschainrT   r]   r   r^   r!   debugr3  r    r'  ro   )rq   optim_configsched_configasr_ph_params	se_params	vq_params
gen_paramsoptim_gdisc_paramsoptim_dscheduler_gscheduler_dru   ru   rv   configure_optimizers  s8   




z$AudioCodecModel.configure_optimizersr   c                 C   s>   |   }|d ur| j|kr|\}}|  |  d S d S d S r.  )lr_schedulersro   r   )rq   r@  
schedulerssch1sch2ru   ru   rv   r     s   zAudioCodecModel.update_lrc              
   C   sl   | j sg S | | j }t| j j}| j jrt| j jnd }t||| j j| j j|| j	j
| j j| j jd}|gS )N)
generatorsr$  
log_epochsepoch_frequency
output_dirloggerslog_tensorboard	log_wandb)rI   r&  r   rX  log_dirr   r   rY  rZ  r&   r\  r]  r^  )rq   r$  rX  r_  log_callbackru   ru   rv   configure_callbacks  s    z#AudioCodecModel.configure_callbacksc                 C   s   g }t dddd}|| t dddd}|| t dd	d
d}|| t dddd}|| t dddd}|| |S )Naudio_codec_16khz_smallzwhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/audio_codec_16khz_small/versions/v1/files/audio_codec_16khz_small.nemozFor details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/audio_codec_16khz_small)pretrained_model_namelocationdescriptionmel_codec_22khz_mediumzuhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_22khz_medium/versions/v1/files/mel_codec_22khz_medium.nemozFor details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mel_codec_22khz_mediummel_codec_44khz_mediumzuhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_44khz_medium/versions/v1/files/mel_codec_44khz_medium.nemozFor details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mel_codec_44khz_mediummel_codec_22khz_fullband_mediumzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_22khz_fullband_medium/versions/v1/files/mel_codec_22khz_fullband_medium.nemozFor details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mel_codec_22khz_fullband_mediummel_codec_44khz_fullband_mediumzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/mel_codec_44khz_fullband_medium/versions/v1/files/mel_codec_44khz_fullband_medium.nemozFor details about this model please refer to the model card: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mel_codec_44khz_fullband_medium)r   r   )clsmodelsmodelru   ru   rv   list_available_models  s@   




z%AudioCodecModel.list_available_modelsr.  )Nrw   F)T)F)r   )3__name__
__module____qualname__r	   r   rP   r|   r   r   r   r   r   tupler   r   r   Tensorr   r   r   r   r   r   r   r   r   r   r   propertyr,  r   boolr   r	  r  r  r  r%  r&  r-  r1  r2  r3  rS  r   ra  classmethodr   r   rm  __classcell__ru   ru   rs   rv   r$   7   s     




(


(


	

(


(


(
h1

'r$   )?rE  mathr   pathlibr   typingr   r   r   torch.nn.functionalr   r   r   einopsr   hydra.utilsr   lightning.pytorchr   	omegaconfr	   r
   r   ,nemo.collections.tts.losses.audio_codec_lossr   r   r   r   r   r   0nemo.collections.tts.modules.audio_codec_modulesr   #nemo.collections.tts.modules.commonr   *nemo.collections.tts.parts.utils.callbacksr   (nemo.collections.tts.parts.utils.helpersr   r   	nemo.corer   nemo.core.classes.commonr   r   nemo.core.neural_types.elementsr   r   r   r   "nemo.core.neural_types.neural_typer   nemo.core.optim.lr_schedulerr   r    
nemo.utilsr!   r"   nemo.utils.decoratorsr#   r   r   r   r$   ru   ru   ru   rv   <module>   s<    