o
    …wÖi°;  ã                	   @   s<  d dl m Z mZ d dlmZmZ d dlZd dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z'm(Z( d dl)m*Z* d dl+m,Z, zd dl-Z-dZ.W n e/e0fy’   dZ.Y nw G dd„ de"e!ƒZ1dS )é    )ÚrandomÚ	randrange)ÚListÚOptionalN)Ú	rearrange)Úinstantiate)ÚTrainer)ÚTensorBoardLoggerÚWandbLogger)Ú
DictConfig)ÚSummaryWriter)Úmask_sequence_tensor)ÚConsistencyLossÚGeneratorLossÚGradientPenaltyLossÚ	HingeLoss)Úto_device_recursive)Ú
ExportableÚModelPTÚPretrainedModelInfoÚ	typecheck)ÚLengthsTypeÚMelSpectrogramTypeÚ
NeuralType)ÚBoolType)ÚloggingTFc                       sˆ  e Zd ZdZd6dededdf‡ fdd„Zdd	„ Zd
ej	dej	dej	fdd„Z
d
ej	dej	dej	fdd„Zd7dedefdd„Zd8dedej	fdd„Zdd„ Zeedeƒ ƒedeƒ ƒedeƒ ddedeƒ dddœdddd œd!ej	dej	ded"efd#d$„ƒZ					d9d!ej	dej	d%eeej	  d&eeej	  d'eej	 ded"efd(d)„Zd*d+„ Zd,d-„ Zd.d/„ Zd0d1„ Zed2d3„ ƒZd4d5„ Z ‡  Z!S ):ÚSpectrogramEnhancerModelzõ
    GAN-based model to add details to blurry spectrograms from TTS models like Tacotron or FastPitch. Based on StyleGAN 2 [1]
    [1] Karras et. al. - Analyzing and Improving the Image Quality of StyleGAN (https://arxiv.org/abs/1912.04958)
    NÚcfgÚtrainerÚreturnc                    sZ   d | _ tƒ j||d t|jƒ| _t|jƒ| _tƒ | _tƒ | _	t
|jƒ| _t|jƒ| _d S )N)r   r   )Úspectrogram_modelÚsuperÚ__init__r   Ú	generatorÚdiscriminatorr   Úgenerator_lossr   Údiscriminator_lossr   Úconsistency_loss_weightÚconsistency_lossr   Úgradient_penalty_loss_weightÚgradient_penalty_loss)Úselfr   r   ©Ú	__class__© úm/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/models/spectrogram_enhancer.pyr"   N   s   z!SpectrogramEnhancerModel.__init__c                 C   s   t |tt| j ¡ ƒƒjƒS ©N)r   ÚnextÚiterr#   Ú
parametersÚdevice)r+   Úer.   r.   r/   Úmove_to_correct_deviceZ   s   z/SpectrogramEnhancerModel.move_to_correct_deviceÚspectrogramÚlengthsc                 C   s*   || j j }|| j j| j j  }t||ƒS r0   )Ú_cfgÚspectrogram_min_valueÚspectrogram_max_valuer   ©r+   r7   r8   r.   r.   r/   Únormalize_spectrograms]   s   
z/SpectrogramEnhancerModel.normalize_spectrogramsc                 C   s*   || j j| j j  }|| j j }t||ƒS r0   )r9   r;   r:   r   r<   r.   r.   r/   Úunnormalize_spectrogramsb   s   
z1SpectrogramEnhancerModel.unnormalize_spectrogramsé   FÚ
batch_sizeÚmixingc                 C   s†   |r1| j jtƒ k r1td| jjƒ}t || j j¡g| }t || j j¡g| jj|  }g |¢|¢}nt || j j¡g| jj }|  	|¡S )Nr?   )
r9   Ú
mixed_probr   r   r#   Ú
num_layersÚtorchÚrandnÚ
latent_dimr6   )r+   r@   rA   Úmixing_pointÚ
first_partÚsecond_partÚzsr.   r.   r/   Úgenerate_zsg   s   
z$SpectrogramEnhancerModel.generate_zsc                 C   s   t  || jjdd¡}|  |¡S )Ni   r?   )rD   Úrandr9   Ún_bandsr6   )r+   r@   Únoiser.   r.   r/   Úgenerate_noiser   s   
z'SpectrogramEnhancerModel.generate_noisec                 C   s,   | j j}|j^ }}t |d|||  f¡S )Nr   )r#   Úupsample_factorÚshapeÚFÚpad)r+   ÚspectrogramsÚ
multiplierÚ_Ú
max_lengthr.   r.   r/   Úpad_spectrogramsv   s   z)SpectrogramEnhancerModel.pad_spectrograms)ÚBÚDÚT_spec)rY   T)Úoptional©Úinput_spectrogramsr8   rA   Ú	normalize)Úinput_types)rA   r_   r^   r_   c             	   C   s   | j ||||ddddS )aK  
        Generator forward pass. Noise inputs will be generated.

        input_spectrograms: batch of spectrograms, typically synthetic
        lengths: length for every spectrogam in the batch
        mixing: style mixing, usually True during training
        normalize: normalize spectrogram range to ~[0, 1], True for normal use

        returns: batch of enhanced spectrograms

        For explanation of style mixing refer to [1]
        [1] Karras et. al. - A Style-Based Generator Architecture for Generative Adversarial Networks, 2018 (https://arxiv.org/abs/1812.04948)
        N)r^   r8   rA   r_   rJ   ÚwsrN   )Úforward_with_custom_noise)r+   r^   r8   rA   r_   r.   r.   r/   Úforward{   s   ùz SpectrogramEnhancerModel.forwardrJ   ra   rN   c                    sØ   |j ^}}	}
|dur|durtdƒ‚|du rˆ  ||¡}|du r*‡ fdd„|D ƒ}|du r3ˆ  |¡}t|dƒ}|r@ˆ  ||¡}ˆ  |¡}ˆ  ||||¡}|rUˆ  ||¡}|dd…dd…dd…d|
…f }t|dƒ}|S )aÍ  
        Generator forward pass. Noise inputs will be generated if None.

        input_spectrograms: batch of spectrograms, typically synthetic
        lenghts: length for every spectrogam in the batch
        zs: latent noise inputs on the unit sphere (either this or ws or neither)
        ws: latent noise inputs in the style space (either this or zs or neither)
        noise: per-pixel indepentent gaussian noise
        mixing: style mixing, usually True during training
        normalize: normalize spectrogram range to ~[0, 1], True for normal use

        returns: batch of enhanced spectrograms

        For explanation of style mixing refer to [1]
        For definititions of z, w [2]
        [1] Karras et. al. - A Style-Based Generator Architecture for Generative Adversarial Networks, 2018 (https://arxiv.org/abs/1812.04948)
        [2] Karras et. al. - Analyzing and Improving the Image Quality of StyleGAN, 2019 (https://arxiv.org/abs/1912.04958)
        NzZPlease specify either zs or ws or neither, but not both. It is not clear which one to use.c                    s   g | ]}ˆ j  |¡‘qS r.   )r#   Ústyle_mapping)Ú.0Úz©r+   r.   r/   Ú
<listcomp>Ê   s    zFSpectrogramEnhancerModel.forward_with_custom_noise.<locals>.<listcomp>úb c l -> b 1 c lzb 1 c l -> b c l)	rQ   Ú
ValueErrorrK   rO   r   r=   rX   r#   r>   )r+   r^   r8   rJ   ra   rN   rA   r_   r@   rV   rW   Úenhanced_spectrogramsr.   rg   r/   rb   £   s*   ÿ


 
z2SpectrogramEnhancerModel.forward_with_custom_noisec                 C   s´  |\}}}t  ¡  |  ||¡}|  ||¡}W d   ƒ n1 s w   Y  |dkrv| j||ddd}t|dƒ}|  |||¡}t|dƒ ¡ }	|  |	||¡}
|  |
|¡}| jd|dd || j	j
 dkrt|  |	|
¡}| jd|dd || S |S |d	krØ| j||ddd}t|dƒ}t|dƒ}|  |||¡}|  |¡}|  |||¡}| jd
|dd | jd|dd t  ¡  t|dƒ}|  ||||¡ W d   ƒ || S 1 sÏw   Y  || S d S )Nr   TFr]   ri   Úd_loss)Úprog_barÚ	d_loss_gpr?   Úg_lossÚc_loss)rD   Úno_gradr=   rc   r   r$   Úrequires_grad_r&   Úlogr9   Ú#gradient_penalty_loss_every_n_stepsr*   r%   r(   Úlog_illustration)r+   ÚbatchÚ	batch_idxÚoptimizer_idxr^   Útarget_spectrogramsr8   rk   Úfake_logitsÚtarget_spectrograms_Úreal_logitsrl   Úgp_lossro   rp   r.   r.   r/   Útraining_stepß   sN   

þÿ
ÿ





þýîz&SpectrogramEnhancerModel.training_stepc                 C   s8   t | jj| j ¡ d}t | jj| j ¡ d}||gg fS )N)Úparams)r   r9   Úgenerator_optr#   r3   Údiscriminator_optr$   )r+   r€   r   r.   r.   r/   Úconfigure_optimizers  s   þz-SpectrogramEnhancerModel.configure_optimizersc                 C   s.   t |jƒ}tjjj|fd|ji|j¤Ž| _d S )NÚ
collate_fn)	r   ÚdatasetrD   ÚutilsÚdataÚ
DataLoaderrƒ   Údataloader_paramsÚ	_train_dl)r+   Útrain_data_configr„   r.   r.   r/   Úsetup_training_data  s   
ÿÿÿz,SpectrogramEnhancerModel.setup_training_datac                 C   s   dS )zÏ
        There is no validation step for this model.
        It is not clear whether any of used losses is a sensible metric for choosing between two models.
        This might change in the future.
        Nr.   )r+   Úval_data_configr.   r.   r/   Úsetup_validation_data  s   z.SpectrogramEnhancerModel.setup_validation_datac                 C   s"   g }t ddd| d}| |¡ |S )NÚ.tts_en_spectrogram_enhancer_for_asr_finetuningz³https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch_spectrogram_enhancer_for_asr_finetuning/versions/1.20.0/files/tts_en_spectrogram_enhancer_for_asr_finetuning.nemoa"  This model is trained to add details to synthetic spectrograms. It was trained on pairs of real-synthesized spectrograms generated by FastPitch. STFT parameters follow ASR with 25 ms window and 10 ms hop. It is supposed to be used in conjunction with that model for ASR training/adaptation.)Úpretrained_model_nameÚlocationÚdescriptionÚclass_)r   Úappend)ÚclsÚlist_of_modelsÚmodelr.   r.   r/   Úlist_available_models%  s   ù
	z.SpectrogramEnhancerModel.list_available_modelsc                 C   s   | j dkrd S | jsd S | jjd }|| jj dkrd S d}t| ¡ |  ¡ ƒ}tj	|| |||gdd 
¡ d d …|d d …d d …d |…f }tsLJ dƒ‚tjj|dd dd¡}	| jD ]2}
t|
tƒrr|
j}|jd	|	|d
 | ¡  q[t|
tƒrƒ|
jd	|	gdg|d q[t dtt|
ƒƒ¡ q[d S )Nr   é   )Údimz1Torchvision imports failed but they are required.r?   )Únrowg        g      ð?rT   )Úglobal_stepz%residual, input, output, ground truth)ÚcaptionÚstepzUnsupported logger type: %s)Úglobal_rankÚloggersr   r›   Úlog_every_n_stepsÚintÚflattenÚitemrD   ÚstackÚcpuÚTORCHVISION_AVAILABLEÚtorchvisionr…   Ú	make_gridÚclampÚ
isinstancer	   Ú
experimentÚ	add_imageÚflushr
   Ú	log_imager   ÚwarningÚstrÚtype)r+   ry   r^   rk   r8   r   ÚidxÚlengthÚtensorÚgridÚloggerÚwriterr.   r.   r/   ru   9  s<   
üù ø




øz)SpectrogramEnhancerModel.log_illustrationr0   )r?   F)r?   )NNNFT)"Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r"   r6   rD   ÚTensorr=   r>   r¡   ÚboolrK   rO   rX   r   r   r   r   r   rc   r   r   rb   r~   r‚   r‹   r   Úclassmethodr—   ru   Ú__classcell__r.   r.   r,   r/   r   H   sl    

üÿúýüûú$øþýüûúù
ø<0
r   )2r   r   Útypingr   r   rD   Útorch.nn.functionalÚnnÚ
functionalrR   Úeinopsr   Úhydra.utilsr   Úlightning.pytorchr   Úlightning.pytorch.loggersr	   r
   Ú	omegaconfr   Útorch.utils.tensorboard.writerr   Ú#nemo.collections.common.parts.utilsr   Ú7nemo.collections.tts.losses.spectrogram_enhancer_lossesr   r   r   r   Ú(nemo.collections.tts.parts.utils.helpersr   Ú	nemo.corer   r   r   r   Únemo.core.neural_typesr   r   r   Únemo.core.neural_types.elementsr   Ú
nemo.utilsr   r§   r¦   ÚImportErrorÚModuleNotFoundErrorr   r.   r.   r.   r/   Ú<module>   s0   &ÿ