o
    }oil                     @   s  d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& dZ'zd dl(Z(W n e)y   dZ'Y nw de
e dee fddZ*dee+ dee+ de+fddZ,eG dd dZ-eG dd dZ.eG dd dZ/d ed!e+fd"d#Z0G d$d% d%eZ1e&G d&d' d'eZ2G d(d) d)e1Z3G d*d+ d+e1Z4G d,d- d-e1Z5dS ).    N)ABCabstractmethod)	dataclass)Path)DictListOptionalTupleType)	rearrange)CallbackLightningModuleTrainer)TensorBoardLogger)Logger)WandbLogger)Tensor)create_plot)logging)experimentalTFloggerslogger_typec                 C   sF   | D ]}t ||rt|dr|j  S |  S qtd| d|  d)N
experimentzCould not find z logger in .)
isinstancehasattrr   
ValueError)r   r   logger r   ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/parts/utils/callbacks.py_get_logger+   s   


r    
model_namecheckpoint_pathtypec                 C   s   | d u |d u ksJ d|  d| dt |}|dkr$ddlm} |}n|dkr1ddlm} |}ntd	| d
| d urF|| }|	 S |drT||}|	 S ||}|	 S )Nz?Must provide exactly one of vocoder model_name or checkpoint: (z, )hifiganr   )HifiGanModelunivnet)UnivNetModelzUnknown vocoder type ''z.nemo)
strnemo.collections.tts.modelsr&   r(   r   from_pretrainedendswithrestore_fromload_from_checkpointeval)r!   r"   r#   r&   
model_typer(   vocoderr   r   r   _load_vocoder5   s*   



r3   c                   @   s0   e Zd ZU eed< ejed< eed< eed< dS )AudioArtifactiddatasample_ratefilepathN)	__name__
__module____qualname__r*   __annotations__npndarrayintr   r   r   r   r   r4   P   s
   
 
r4   c                   @   s8   e Zd ZU eed< ejed< eed< eed< eed< dS )ImageArtifactr5   r6   r8   x_axisy_axisN)r9   r:   r;   r*   r<   r=   r>   r   r   r   r   r   r@   X   s   
 
r@   c                   @   s2   e Zd ZU eed< eed< eed< dZeed< dS )LogAudioParamsvocoder_typevocoder_namevocoder_checkpoint_pathFlog_audio_gtaN)r9   r:   r;   r*   r<   rG   boolr   r   r   r   rC   a   s
   
 rC   r8   returnc                 C   s    t | d}|tjd}|S )N _)r*   with_suffixreplaceossep)r8   path_prefixfile_idr   r   r   	create_idi   s   rR   c                   @   s>   e Zd Ze	d	dedededeee	 ee
 f fddZdS )
ArtifactGeneratorFmodel
batch_dictinitial_logrI   c                 C   s   dS )a  
        Create artifacts for the input model and test batch.

        Args:
            model: Model instance being trained to use for inference.
            batch_dict: Test batch to generate artifacts for.
            initial_log: Flag to denote if this is the initial log, can
                         be used to save ground-truth data only once.

        Returns:
            List of audio and image artifacts to log.
        Nr   )selfrT   rU   rV   r   r   r   generate_artifactsp   s    z$ArtifactGenerator.generate_artifactsNF)r9   r:   r;   r   r   r   rH   r	   r   r4   r@   rX   r   r   r   r   rS   o   s    rS   c                   @   s   e Zd ZdZ						d#dee dejjj	de
ee  ded	e
e d
e
ee  dedefddZdededefddZdededefddZd$dedede
e defddZdedefdd Zdedefd!d"ZdS )%LoggingCallbacka  
    Callback which can log artifacts (eg. model predictions, graphs) to local disk, Tensorboard, and/or WandB.

    Args:
        generators: List of generators to create and log artifacts from.
        data_loader: Data to log artifacts for.
        log_epochs: Optional list of specific training epoch numbers to log artifacts for.
        epoch_frequency: Frequency with which to log
        output_dir: Optional local directory. If provided, artifacts will be saved in output_dir.
        loggers: Optional list of loggers to use if logging to tensorboard or wandb.
        log_tensorboard: Whether to log artifacts to tensorboard.
        log_wandb: Whether to log artifacts to WandB.
    N   F
generatorsdata_loader
log_epochsepoch_frequency
output_dirr   log_tensorboard	log_wandbc	           	      C   s  || _ || _|r
|ng | _|| _|rt|nd | _|r|ng | _|| _|| _|r5t	
d t| jt| _nt	d d | _|rRtsEtdt	
d t| jt| _nt	d d | _t	d| jj t	d| j t	d| j t	d	| j t	d
| j t	d| j d S )NzCreating tensorboard loggerzNot using tensorbord loggerzWandb not installed.zCreating wandb loggerzNot using wandb loggerInitialized %s withz	log_epochs:      %sz	epoch_frequency: %sz	output_dir:      %sz	log_tensorboard: %sz	log_wandb:       %s)r\   r]   r^   r_   r   r`   r   ra   rb   r   infor    r   tensorboard_loggerdebug
HAVE_WANDBr   r   wandb_logger	__class__r9   )	rW   r\   r]   r^   r_   r`   r   ra   rb   r   r   r   __init__   s4   



zLoggingCallback.__init__audiolog_dirstepc                 C   s   |r||j  }|jjddd tj||j|jd | jr)| jj|j	|j||jd | j
rCtj|j|j|j	df}| j
|j	|i d S d S )NTparentsexist_ok)filer6   
samplerate)tag
snd_tensorglobal_stepr7   )r7   caption)r8   parentmkdirsfwriter6   r7   re   	add_audior5   rh   wandbAudiolog)rW   rk   rl   rm   r8   wandb_audior   r   r   
_log_audio   s   
zLoggingCallback._log_audioimagec                 C   s   |r||j  }|jjddd nd }t||j|j|jd}| jr+| jj|j	||dd | j
rBtj||j	df}| j
|j	|i d S d S )NTrn   )output_filepathr6   rA   rB   HWC)rs   
img_tensorru   dataformats)rv   )r8   rw   rx   r   r6   rA   rB   re   	add_imager5   rh   r|   Imager~   )rW   r   rl   rm   r8   
image_plotwandb_imager   r   r   
_log_image   s    
zLoggingCallback._log_imager   
audio_list
image_listru   c                 C   sN   |dur|j ddd |D ]
}| j|||d q|D ]
}| j|||d qdS )zLog audio and image artifacts.NTrn   )rk   rl   rm   )r   rl   rm   )rx   r   r   )rW   r   r   rl   ru   rk   r   r   r   r   _log_artifacts   s   zLoggingCallback._log_artifactstrainerrT   c                 C   s   g }g }| j D ]1}| D ]\}}t|tjr||j||< q| jD ]}|j||dd\}	}
||	7 }||
7 }q#qt	|t	|  krGdkrPn nt
d dS | jrX| jd nd}| j|||d dS )zLog initial data artifacts.T)rT   rU   rV   r   z,List are empty, no initial artifacts to log.Ninitialr   r   rl   )r]   itemsr   torchr   todevicer\   rX   lenr   rf   r`   r   )rW   r   rT   r   r   rU   keyvalue	generatorrk   imagesrl   r   r   r   on_fit_start   s"   


 
zLoggingCallback.on_fit_startc                 C   s   d|j  }|| jvr|| j dkrdS g }g }| jD ]0}| D ]\}}t|tjr2||j	||< q | j
D ]}	|	j||d\}
}||
7 }||7 }q6qt|t|  krYdkrcn ntd| dS | jrn| jd|  nd}| j|||d dS )z%Log artifacts at the end of an epoch.r[   r   N)rT   rU   z0List are empty, no artifacts to log at epoch %d.epoch_r   )current_epochr^   r_   r]   r   r   r   r   r   r   r\   rX   r   r   rf   r`   r   )rW   r   rT   epochr   r   rU   r   r   r   rk   r   rl   r   r   r   on_train_epoch_end  s(   



 z"LoggingCallback.on_train_epoch_end)Nr[   NNFF)Nr   )r9   r:   r;   __doc__r   rS   r   utilsr6   
DataLoaderr   r?   r   r   rH   rj   r4   r   r@   r   listr   r   r   r   r   r   r   r   r   rZ      s<    

	
+ rZ   c                   @   s>   e Zd ZdZ	d
dedededeee	 ee
 f fddZd	S )VocoderArtifactGeneratorz6
    Generator for logging Vocoder model outputs.
    FrT   rU   rV   rI   c                 C   sx  | d}| d}dd |D }| d}| d}g }	|rZtt||D ]/\}
\}}t| d| d}||
d ||
 f   }td	| |||jd
}|	| q&|	g fS |	||\}}t
  |j|d}t|d}W d    n1 s|w   Y  tt||D ]/\}
\}}t| d| d}||
d ||
 f   }td| |||jd
}|	| q|	g fS )Ndataset_namesaudio_filepathsc                 S      g | ]}t |qS r   rR   .0pr   r   r   
<listcomp>+      z?VocoderArtifactGenerator.generate_artifacts.<locals>.<listcomp>rk   
audio_lens/_gt.wav	audio_gt_r5   r6   r8   r7   speczB 1 T -> B T.wavaudio_)get	enumeratezipr   cpunumpyr4   r7   appendaudio_to_melspec_precessorr   no_gradforwardr   )rW   rT   rU   rV   r   r   	audio_idsrk   	audio_lenaudio_artifactsidataset_nameaudio_idaudio_gt_path
audio_gt_iaudio_artifactr   spec_len
audio_predaudio_pred_pathaudio_pred_ir   r   r   rX   %  sD   




z+VocoderArtifactGenerator.generate_artifactsNrY   )r9   r:   r;   r   r   r   rH   r	   r   r4   r@   rX   r   r   r   r   r      s    r   c                   @   s   e Zd ZdZddededefddZ	dd	ed
ee dee de	de	defddZ
d	ed
ee dee de	de	f
ddZ	dd	edededeee ee f fddZdS )AudioCodecArtifactGeneratorz:
    Generator for logging Audio Codec model outputs.
    TF	log_audiolog_encodinglog_dequantizedc                 C   sP   || _ || _|| _td| jj td| j  td| j td| j d S )Nrc   z	log_audio:       %sz	log_encoding:    %sz	log_dequantized: %s)r   r   r   r   rf   ri   r9   )rW   r   r   r   r   r   r   rj   Y  s   z$AudioCodecArtifactGenerator.__init__rT   r   r   rk   r   
save_inputc                 C   s*  | j sg S t  |||d\}}W d   n1 sw   Y  g }	tt||D ]/\}
\}}t| d| d}||
d||
 f   }td| |||j	d}|	
| q*|rtt||D ]/\}
\}}t| d| d}||
d||
 f   }td| |||j	d}|	
| qc|	S )	a  Generate audio artifacts.

        Args:
            model: callable model, outputs (audio_pred, audio_pred_len)
            dataset_names: list of dataset names for the examples in audio batch
            audio_ids: list of IDs for the examples in audio batch
            audio: tensor of input audio signals, shape (B, T)
            audio_len: tensor of lengths for each example in the batch, shape (B,)
            save_input: if True, save input audio signals
        rk   r   Nr   z_audio_out.wav
audio_out_r   z_audio_in.wav	audio_in_)r   r   r   r   r   r   r   r   r4   r7   r   )rW   rT   r   r   rk   r   r   r   audio_pred_lenr   r   r   r   r   r   r   audio_in_path
audio_in_ir   r   r   _generate_audiof  s8   
z+AudioCodecArtifactGenerator._generate_audioc                 C   s  g }| j s
| js
|S t  |j||d\}}W d   n1 s"w   Y  | j rdtt||D ]2\}	\}
}t|
 d| d}||	ddd||	 f  	 }t
d| ||ddd}|| q1| jsi|S t  |j||d	}|j||d
}W d   n1 sw   Y  tt||D ]2\}	\}
}t|
 d| d}||	ddd||	 f  	 }t
d| ||ddd}|| q|S )a  Generate image artifacts.

        Args:
            model: model, needs to support `model.encode_audio`, `model.quantize` and `model.dequantize`
            dataset_names: list of dataset names for the examples in audio batch
            audio_ids: list of IDs for the examples in audio batch
            audio: tensor of input audio signals, shape (B, T)
            audio_len: tensor of lengths for each example in the batch, shape (B,)
        r   Nr   z_encoded.pngencoded_Audio FramesChannelsr5   r6   r8   rA   rB   )encodedencoded_len)tokens
tokens_lenz_dequantized.pngdequantized_)r   r   r   r   encode_audior   r   r   r   r   r@   r   quantize
dequantize)rW   rT   r   r   rk   r   image_artifactsr   r   r   r   r   encoded_path	encoded_iencoded_artifactr   dequantizeddequantized_pathdequantized_idequantized_artifactr   r   r   _generate_images  sH   
"
"z,AudioCodecArtifactGenerator._generate_imagesrU   rV   rI   c                 C   sh   | d}| d}dd |D }| d}| d}| j||||||d}	| j|||||d}
|	|
fS )	z
        Args:
            model: model used to process input to generate artifacts
            batch_dict: dictionary obtained form the dataloader
            initial_log: save input audio for the initial log
        r   r   c                 S   r   r   r   r   r   r   r   r     r   zBAudioCodecArtifactGenerator.generate_artifacts.<locals>.<listcomp>rk   r   )rT   r   r   rk   r   r   )rT   r   r   rk   r   )r   r   r   )rW   rT   rU   rV   r   r   r   rk   r   r   r   r   r   r   rX     s"   





z.AudioCodecArtifactGenerator.generate_artifactsN)TFFrY   )r9   r:   r;   r   rH   rj   r   r   r*   r   r   r   r   r	   r4   r@   rX   r   r   r   r   r   T  sL    
6
9r   c                   @   s   e Zd ZdZ			ddededee fddZd	ed
e	e
 de	e
 defddZdededefddZd	ed
e	e
 de	e
 defddZd	ed
e	e
 de	e
 defddZ	dd	edededee	e e	e f fddZdS )FastPitchArtifactGeneratorac  
    Generator for logging FastPitch model outputs.

    Args:
        log_spectrogram: Whether to log predicted spectrograms.
        log_alignment: Whether to log alignment graphs.
        audio_params: Optional parameters for saving predicted audio.
            Requires a vocoder model checkpoint for generating audio from predicted spectrograms.
    FNlog_spectrogramlog_alignmentaudio_paramsc                 C   sN   || _ || _|sd| _d| _d | _d S d| _|j| _t|j|j|jd| _d S )NFT)r!   r"   r#   )	r   r   r   rG   r2   r3   rE   rF   rD   )rW   r   r   r   r   r   r   rj     s   
z#FastPitchArtifactGenerator.__init__rT   r   r   rU   c                 C   s   | d}| d}|j||d\}}g }	g }
tt||D ]\\}\}}t| d| d}||d || f   }td| |||jjd}|		| t| d| d}||d d d || f   }t
d	| ||d
dd}|
	| q|	|
fS )Nrk   r   input_signallengthr   r   r   r   z_spec_gt.pngspec_r   r   r   )r   preprocessorr   r   r   r   r   r4   _sample_rater   r@   )rW   rT   r   r   rU   rk   r   r   r   r   r   r   r   r   r   r   r   spec_gt_path	spec_gt_ispec_artifactr   r   r   _create_ground_truth_artifacts  s4   


"z9FastPitchArtifactGenerator._create_ground_truth_artifactsmelsmels_len
hop_lengthc                 C   sh   | | jj}t  | jj|d}W d    n1 sw   Y  |  }tj	j
||d}||fS )Nr   )r   )r   r2   r   r   r   convert_spectrogram_to_audior   r   librosacoreframes_to_samples)rW   r   r   r   	voc_inputr   mels_len_arrayaudio_pred_lensr   r   r   r   6  s   
z*FastPitchArtifactGenerator._generate_audioc                 C   sv  g }g }| d}| d}| dd }	t  |j|||	d^}
}}W d    n1 s.w   Y  | jrptt||D ]2\}\}}t| d| d}|
|d d d || f  	 }t
d| ||dd	d
}|| q=| jr| j|
||jjd\}}tt||D ]0\}\}}t| d| d}||d || f  	 }td| ||| jjd}|| q||fS )Ntext	text_lens
speaker_id)r  
input_lensspeakerr   z	_spec.pngr   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r@   r   r   r   r   r   r4   r2   r7   )rW   rT   r   r   rU   r   r   r  r  r	  	mels_predmels_pred_lenrK   r   r   r   	spec_pathspec_ir   r   r  r   r   r   r   r   r   _generate_predictions?  sN   


"

z0FastPitchArtifactGenerator._generate_predictionsc                  C   s  g }g }| d}| d}| d}	| d}
| dd }| dd }| dd }| dd }|j||d	\}}t ! |j|	|
||||||d
\}}}}}}}}}}}}W d    n1 saw   Y  | jrt|d}tt||D ]4\}\}}t	| d| d}||d |
| d || f 
  }td| ||ddd}|| qu| jr| j|||jjd\}}tt||D ]0\}\}}t	| d| d}||d || f 
  }td| ||| jjd}|| q||fS )Nrk   r   r  r  align_prior_matrixpitchenergyr  r   )r  r  r  r  r	  r   mel_lens
attn_priorz$B 1 T_spec T_text -> B T_text T_specr   z
_align.pngalign_r   zText Tokensr   r
  z_gta.wav
audio_gta_r   )r   r   r   r   r   r   r   r   r   r   r   r   r@   r   rG   r   r   r4   r2   r7   ) rW   rT   r   r   rU   r   r   rk   r   r  r  r  r  r  r	  r   r   r  r  rK   attnr   r   r   	attn_pathattn_ialignment_artifactr   r  r   r   r   r   r   r   _generate_gta_predictionsp  sf   




 
&

z4FastPitchArtifactGenerator._generate_gta_predictionsrV   rI   c                 C   s   | d}| d}dd |D }|r"| j||||d\}}||fS g }	g }
| js,| jr?| j||||d\}}|	|7 }	|
|7 }
| jsE| jrX| j||||d\}}|	|7 }	|
|7 }
|	|
fS )Nr   r   c                 S   r   r   r   r   r   r   r   r     r   zAFastPitchArtifactGenerator.generate_artifacts.<locals>.<listcomp>)rT   r   r   rU   )r   r   r   r   r  rG   r   r  )rW   rT   rU   rV   r   r   r   audio_gtspec_gtr   r   r   	spec_predaudio_gta_pred
alignmentsr   r   r   rX     s.   




z-FastPitchArtifactGenerator.generate_artifacts)FFNrY   )r9   r:   r;   r   rH   r   rC   rj   r   r   r*   r   r   r   r?   r   r  r  r	   r4   r@   rX   r   r   r   r   r     sd    

!	
1
=r   )6rN   abcr   r   dataclassesr   pathlibr   typingr   r   r   r	   r
   r   r   r=   	soundfilery   r   einopsr   lightning.pytorchr   r   r   lightning.pytorch.loggersr    lightning.pytorch.loggers.loggerr   lightning.pytorch.loggers.wandbr   r   (nemo.collections.tts.parts.utils.helpersr   
nemo.utilsr   nemo.utils.decoratorsr   rg   r|   ModuleNotFoundErrorr    r*   r3   r4   r@   rC   rR   rS   rZ   r   r   r   r   r   r   r   <module>   sR   
 4 !