o
    wi;                  	   @   s<  d dl m Z mZ d dlmZmZ d dlZd dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z, zd dl-Z-dZ.W n e/e0fy   dZ.Y nw G dd de"e!Z1dS )    )random	randrange)ListOptionalN)	rearrange)instantiate)Trainer)TensorBoardLoggerWandbLogger)
DictConfig)SummaryWriter)mask_sequence_tensor)ConsistencyLossGeneratorLossGradientPenaltyLoss	HingeLoss)to_device_recursive)
ExportableModelPTPretrainedModelInfo	typecheck)LengthsTypeMelSpectrogramType
NeuralType)BoolType)loggingTFc                       s  e Zd ZdZd6dededdf fddZdd	 Zd
ej	dej	dej	fddZ
d
ej	dej	dej	fddZd7dedefddZd8dedej	fddZdd Zeede ede ede ddede ddddddd d!ej	dej	ded"efd#d$Z					d9d!ej	dej	d%eeej	  d&eeej	  d'eej	 ded"efd(d)Zd*d+ Zd,d- Zd.d/ Zd0d1 Zed2d3 Zd4d5 Z   Z!S ):SpectrogramEnhancerModelz
    GAN-based model to add details to blurry spectrograms from TTS models like Tacotron or FastPitch. Based on StyleGAN 2 [1]
    [1] Karras et. al. - Analyzing and Improving the Image Quality of StyleGAN (https://arxiv.org/abs/1912.04958)
    Ncfgtrainerreturnc                    sZ   d | _ t j||d t|j| _t|j| _t | _t | _	t
|j| _t|j| _d S )N)r   r   )spectrogram_modelsuper__init__r   	generatordiscriminatorr   generator_lossr   discriminator_lossr   consistency_loss_weightconsistency_lossr   gradient_penalty_loss_weightgradient_penalty_loss)selfr   r   	__class__ m/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/models/spectrogram_enhancer.pyr"   N   s   z!SpectrogramEnhancerModel.__init__c                 C   s   t |tt| j jS N)r   nextiterr#   
parametersdevice)r+   er.   r.   r/   move_to_correct_deviceZ   s   z/SpectrogramEnhancerModel.move_to_correct_devicespectrogramlengthsc                 C   s*   || j j }|| j j| j j  }t||S r0   )_cfgspectrogram_min_valuespectrogram_max_valuer   r+   r7   r8   r.   r.   r/   normalize_spectrograms]   s   
z/SpectrogramEnhancerModel.normalize_spectrogramsc                 C   s*   || j j| j j  }|| j j }t||S r0   )r9   r;   r:   r   r<   r.   r.   r/   unnormalize_spectrogramsb   s   
z1SpectrogramEnhancerModel.unnormalize_spectrograms   F
batch_sizemixingc                 C   s   |r1| j jt k r1td| jj}t|| j jg| }t|| j jg| jj|  }g ||}nt|| j jg| jj }| 	|S )Nr?   )
r9   
mixed_probr   r   r#   
num_layerstorchrandn
latent_dimr6   )r+   r@   rA   mixing_point
first_partsecond_partzsr.   r.   r/   generate_zsg   s   
z$SpectrogramEnhancerModel.generate_zsc                 C   s   t || jjdd}| |S )Ni   r?   )rD   randr9   n_bandsr6   )r+   r@   noiser.   r.   r/   generate_noiser   s   
z'SpectrogramEnhancerModel.generate_noisec                 C   s,   | j j}|j^ }}t|d|||  fS )Nr   )r#   upsample_factorshapeFpad)r+   spectrograms
multiplier_
max_lengthr.   r.   r/   pad_spectrogramsv   s   z)SpectrogramEnhancerModel.pad_spectrograms)BDT_spec)rY   T)optionalinput_spectrogramsr8   rA   	normalize)input_types)rA   r_   r^   r_   c             	   C   s   | j ||||ddddS )aK  
        Generator forward pass. Noise inputs will be generated.

        input_spectrograms: batch of spectrograms, typically synthetic
        lengths: length for every spectrogam in the batch
        mixing: style mixing, usually True during training
        normalize: normalize spectrogram range to ~[0, 1], True for normal use

        returns: batch of enhanced spectrograms

        For explanation of style mixing refer to [1]
        [1] Karras et. al. - A Style-Based Generator Architecture for Generative Adversarial Networks, 2018 (https://arxiv.org/abs/1812.04948)
        N)r^   r8   rA   r_   rJ   wsrN   )forward_with_custom_noise)r+   r^   r8   rA   r_   r.   r.   r/   forward{   s   z SpectrogramEnhancerModel.forwardrJ   ra   rN   c                    s   |j ^}}	}
|dur|durtd|du r ||}|du r* fdd|D }|du r3 |}t|d}|r@ ||} |} ||||}|rU ||}|ddddddd|
f }t|d}|S )a  
        Generator forward pass. Noise inputs will be generated if None.

        input_spectrograms: batch of spectrograms, typically synthetic
        lenghts: length for every spectrogam in the batch
        zs: latent noise inputs on the unit sphere (either this or ws or neither)
        ws: latent noise inputs in the style space (either this or zs or neither)
        noise: per-pixel indepentent gaussian noise
        mixing: style mixing, usually True during training
        normalize: normalize spectrogram range to ~[0, 1], True for normal use

        returns: batch of enhanced spectrograms

        For explanation of style mixing refer to [1]
        For definititions of z, w [2]
        [1] Karras et. al. - A Style-Based Generator Architecture for Generative Adversarial Networks, 2018 (https://arxiv.org/abs/1812.04948)
        [2] Karras et. al. - Analyzing and Improving the Image Quality of StyleGAN, 2019 (https://arxiv.org/abs/1912.04958)
        NzZPlease specify either zs or ws or neither, but not both. It is not clear which one to use.c                    s   g | ]} j |qS r.   )r#   style_mapping).0zr+   r.   r/   
<listcomp>   s    zFSpectrogramEnhancerModel.forward_with_custom_noise.<locals>.<listcomp>b c l -> b 1 c lzb 1 c l -> b c l)	rQ   
ValueErrorrK   rO   r   r=   rX   r#   r>   )r+   r^   r8   rJ   ra   rN   rA   r_   r@   rV   rW   enhanced_spectrogramsr.   rg   r/   rb      s*   


 
z2SpectrogramEnhancerModel.forward_with_custom_noisec                 C   s  |\}}}t   | ||}| ||}W d    n1 s w   Y  |dkrv| j||ddd}t|d}| |||}t|d }	| |	||}
| |
|}| jd|dd || j	j
 dkrt| |	|
}| jd|dd || S |S |d	kr| j||ddd}t|d}t|d}| |||}| |}| |||}| jd
|dd | jd|dd t   t|d}| |||| W d    || S 1 sw   Y  || S d S )Nr   TFr]   ri   d_loss)prog_bar	d_loss_gpr?   g_lossc_loss)rD   no_gradr=   rc   r   r$   requires_grad_r&   logr9   #gradient_penalty_loss_every_n_stepsr*   r%   r(   log_illustration)r+   batch	batch_idxoptimizer_idxr^   target_spectrogramsr8   rk   fake_logitstarget_spectrograms_real_logitsrl   gp_lossro   rp   r.   r.   r/   training_step   sN   








z&SpectrogramEnhancerModel.training_stepc                 C   s8   t | jj| j d}t | jj| j d}||gg fS )N)params)r   r9   generator_optr#   r3   discriminator_optr$   )r+   r   r   r.   r.   r/   configure_optimizers  s   z-SpectrogramEnhancerModel.configure_optimizersc                 C   s.   t |j}tjjj|fd|ji|j| _d S )N
collate_fn)	r   datasetrD   utilsdata
DataLoaderr   dataloader_params	_train_dl)r+   train_data_configr   r.   r.   r/   setup_training_data  s   
z,SpectrogramEnhancerModel.setup_training_datac                 C   s   dS )z
        There is no validation step for this model.
        It is not clear whether any of used losses is a sensible metric for choosing between two models.
        This might change in the future.
        Nr.   )r+   val_data_configr.   r.   r/   setup_validation_data  s   z.SpectrogramEnhancerModel.setup_validation_datac                 C   s"   g }t ddd| d}|| |S )N.tts_en_spectrogram_enhancer_for_asr_finetuningzhttps://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch_spectrogram_enhancer_for_asr_finetuning/versions/1.20.0/files/tts_en_spectrogram_enhancer_for_asr_finetuning.nemoa"  This model is trained to add details to synthetic spectrograms. It was trained on pairs of real-synthesized spectrograms generated by FastPitch. STFT parameters follow ASR with 25 ms window and 10 ms hop. It is supposed to be used in conjunction with that model for ASR training/adaptation.)pretrained_model_namelocationdescriptionclass_)r   append)clslist_of_modelsmodelr.   r.   r/   list_available_models%  s   
	z.SpectrogramEnhancerModel.list_available_modelsc                 C   s   | j dkrd S | jsd S | jjd }|| jj dkrd S d}t| |  }tj	|| |||gdd
 d d |d d d d d |f }tsLJ dtjj|dddd}	| jD ]2}
t|
trr|
j}|jd	|	|d
 |  q[t|
tr|
jd	|	gdg|d q[tdtt|
 q[d S )Nr      )dimz1Torchvision imports failed but they are required.r?   )nrowg        g      ?rT   )global_stepz%residual, input, output, ground truth)captionstepzUnsupported logger type: %s)global_rankloggersr   r   log_every_n_stepsintflattenitemrD   stackcpuTORCHVISION_AVAILABLEtorchvisionr   	make_gridclamp
isinstancer	   
experiment	add_imageflushr
   	log_imager   warningstrtype)r+   ry   r^   rk   r8   r   idxlengthtensorgridloggerwriterr.   r.   r/   ru   9  s<   
 




z)SpectrogramEnhancerModel.log_illustrationr0   )r?   F)r?   )NNNFT)"__name__
__module____qualname____doc__r   r   r"   r6   rD   Tensorr=   r>   r   boolrK   rO   rX   r   r   r   r   r   rc   r   r   rb   r~   r   r   r   classmethodr   ru   __classcell__r.   r.   r,   r/   r   H   sl    

$
<0
r   )2r   r   typingr   r   rD   torch.nn.functionalnn
functionalrR   einopsr   hydra.utilsr   lightning.pytorchr   lightning.pytorch.loggersr	   r
   	omegaconfr   torch.utils.tensorboard.writerr   #nemo.collections.common.parts.utilsr   7nemo.collections.tts.losses.spectrogram_enhancer_lossesr   r   r   r   (nemo.collections.tts.parts.utils.helpersr   	nemo.corer   r   r   r   nemo.core.neural_typesr   r   r   nemo.core.neural_types.elementsr   
nemo.utilsr   r   r   ImportErrorModuleNotFoundErrorr   r.   r.   r.   r/   <module>   s0   &