o
    SifA                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZ G d	d
 d
ejZG dd deZdS )    N)MultiPeriodDiscriminatorMultiResolutionDiscriminator)FeatureExtractor)FourierHead)plot_spectrogram_to_numpy)DiscriminatorLossGeneratorLossFeatureMatchingLossMelSpecReconstructionLoss)Backbone)safe_logc                       s   e Zd Z								d&dedededed	ed
edededededededef fddZ	dd Z
dd Zdd Zdd Zdd Zdd Zed d! Zd"d# Zd$d% Z  ZS )'VocosExpr   -         ?Ffeature_extractorbackboneheadsample_rateinitial_learning_ratenum_warmup_stepsmel_loss_coeffmrd_loss_coeffpretrain_mel_stepsdecay_mel_coeffevaluate_utmosevaluate_pesqevaluate_periodictyc                    sv   t    | jg dd || _|| _|| _t | _t | _	t
 | _t | _t | _t|d| _d| _| | _| _dS )a  
        Args:
            feature_extractor (FeatureExtractor): An instance of FeatureExtractor to extract features from audio signals.
            backbone (Backbone): An instance of Backbone model.
            head (FourierHead):  An instance of Fourier head to generate spectral coefficients and reconstruct a waveform.
            sample_rate (int): Sampling rate of the audio signals.
            initial_learning_rate (float): Initial learning rate for the optimizer.
            num_warmup_steps (int): Number of steps for the warmup phase of learning rate scheduler. Default is 0.
            mel_loss_coeff (float, optional): Coefficient for Mel-spectrogram loss in the loss function. Default is 45.
            mrd_loss_coeff (float, optional): Coefficient for Multi Resolution Discriminator loss. Default is 1.0.
            pretrain_mel_steps (int, optional): Number of steps to pre-train the model without the GAN objective. Default is 0.
            decay_mel_coeff (bool, optional): If True, the Mel-spectrogram loss coefficient is decayed during training. Default is False.
            evaluate_utmos (bool, optional): If True, UTMOS scores are computed for each validation run.
            evaluate_pesq (bool, optional): If True, PESQ scores are computed for each validation run.
            evaluate_periodicty (bool, optional): If True, periodicity scores are computed for each validation run.
        r   r   r   )ignore)r   FN)super__init__save_hyperparametersr   r   r   r   multiperioddiscr   multiresddiscr   	disc_lossr   gen_lossr	   feat_matching_lossr
   melspec_losstrain_discriminatorbase_mel_coeffr   selfr   r   r   r   r   r   r   r   r   r   r   r   r   	__class__ D/home/ubuntu/.local/lib/python3.10/site-packages/vocos/experiment.pyr       s   
 zVocosExp.__init__c                 C   s   d| j  id| j ig}d| j id| j id| j ig}tjj|| j	j
dd}tjj|| j	j
dd}| jjd }tj|| j	j|d}tj|| j	j|d}||g|dd|ddgfS )Nparams)g?g?)lrbetas   )r   num_training_stepsstep)	schedulerinterval)r"   
parametersr#   r   r   r   torchoptimAdamWhparamsr   trainer	max_stepstransformersget_cosine_schedule_with_warmupr   )r+   disc_params
gen_paramsopt_discopt_genr>   scheduler_discscheduler_genr.   r.   r/   configure_optimizersF   s&   

zVocosExp.configure_optimizersc                 K   s2   | j |fi |}| j|fi |}| |}|S )Nr   )r+   audio_inputkwargsfeaturesxaudio_outputr.   r.   r/   forwarda   s   
zVocosExp.forwardc                 K   sn  |}|dkr| j rt  | |fi |}W d    n1 s w   Y  | jd||d|\}}}	}	| jd||d|\}
}}	}	| j||d\}}}	| j|
|d\}}}	|t| }|t| }|| jj|  }| j	d|dd | 	d| | 	d| |S |d	kr| |fi |}| j r| jd||d|\}	}}}| jd||d|\}	}}}| j
|d
\}}| j
|d
\}}|t| }|t| }| j||dt| }| j||dt| }| 	d| | 	d| | 	d| | 	d| nd } } }}| ||}|| jj|  | | jj|  | j|  }| j	d|dd | 	d| j | 	d| | jd dkr| jdkr| jjd|d j | j| jj | jjd|d j | j| jj t  t| j|d }t| j|d }W d    n	1 sw   Y  | jjjdt|j  | jdd | jjjdt|j  | jdd |S d S )Nr   )yy_hat)disc_real_outputsdisc_generated_outputszdiscriminator/totalT)prog_barzdiscriminator/multi_period_losszdiscriminator/multi_res_loss   )disc_outputs)fmap_rfmap_gzgenerator/multi_period_losszgenerator/multi_res_losszgenerator/feature_matching_mpzgenerator/feature_matching_mrdzgenerator/total_lossr   zgenerator/mel_lossi  ztrain/audio_inztrain/audio_predztrain/mel_targetHWCdataformatsztrain/mel_predr.   )r(   r9   no_gradr"   r#   r$   lenr<   r   logr%   r&   r'   r   global_stepglobal_ranklogger
experiment	add_audiodatacpur   r   mel_spec	add_imager   numpy)r+   batch	batch_idxoptimizer_idxrI   rH   	audio_hatreal_score_mpgen_score_mp_real_score_mrdgen_score_mrdloss_mploss_mp_realloss_mrdloss_mrd_realloss
fmap_rs_mp
fmap_gs_mpfmap_rs_mrdfmap_gs_mrdloss_gen_mplist_loss_gen_mploss_gen_mrdlist_loss_gen_mrd
loss_fm_mploss_fm_mrdmel_lossmelmel_hatr.   r.   r/   training_stepg   s   




zVocosExp.training_stepc                 C   s8   | j jrddlm} t| ds|| jd| _d S d S d S )Nr   )
UTMOSScoreutmos_modeldevice)r<   r   metrics.UTMOSr   hasattrr   r   )r+   r   r.   r.   r/   on_validation_epoch_start   s   
z"VocosExp.on_validation_epoch_startc              
   K   sh  |}| |fi |}t jj|| jjdd}t jj|| jjdd}| jjr3ddlm} |||\}	}
}nd }	 }
}| jjrI| j	
|d }ntjd| jd}| jjrddlm} d}t|  |  D ]\}}||d||ddd	7 }qj|t| }t|}ntjd| jd}| |d|d}|d
|  d
|  }|||||	|
||d |d d	S )Ni>  )	orig_freqnew_freqr   )calculate_periodicity_metricsrS   r   )pesqwb)on_error   )	val_lossr   utmos_score
pesq_scoreperiodicity_loss
pitch_lossf1_scorerH   
audio_pred)
torchaudio
functionalresampler<   r   r   metrics.periodicityr   r   r   score	unsqueezemeanr9   zerosr   r   r   ziprc   rf   r[   tensorr'   )r+   rg   rh   rI   rH   rj   audio_16_khzaudio_hat_16khzr   r   r   r   r   r   r   refdegr   
total_lossr.   r.   r/   validation_step   s>   "zVocosExp.validation_stepc                 C   s  | j dkri|d  ^ }}}| jjd|j  | j| j	j
 | jjd|j  | j| j	j
 t| j|}t| j|}| jjjdt|j  | jdd | jjjdt|j  | jdd tdd	 |D  }td
d	 |D  }tdd	 |D  }	tdd	 |D  }
tdd	 |D  }tdd	 |D  }tdd	 |D  }| jd|dd | jd|dd | jd|	dd | jd|
dd | jd|dd | jd|dd | jd|dd d S )Nr   val_inval_predval_mel_targetrW   rX   val_mel_hatc                 S      g | ]}|d  qS )r   r.   .0rK   r.   r.   r/   
<listcomp>	      z1VocosExp.validation_epoch_end.<locals>.<listcomp>c                 S   r   )r   r.   r   r.   r.   r/   r   
  r   c                 S   r   )r   r.   r   r.   r.   r/   r     r   c                 S   r   )r   r.   r   r.   r.   r/   r     r   c                 S   r   )r   r.   r   r.   r.   r/   r     r   c                 S   r   )r   r.   r   r.   r.   r/   r     r   c                 S   r   )r   r.   r   r.   r.   r/   r     r   r   T)	sync_distzval/mel_losszval/utmos_scorezval/pesq_scorezval/periodicity_losszval/pitch_losszval/f1_score)r^   valuesr_   r`   ra   rb   rc   rf   r]   r<   r   r   r'   rd   re   r   r9   stackr   nparrayr\   )r+   outputsrm   audio_inr   
mel_targetr   avg_lossr   r   r   r   r   r   r.   r.   r/   validation_epoch_end   sH   
zVocosExp.validation_epoch_endc                 C   s   | j jjjS )z_
        Override global_step so that it returns the total number of batches processed
        )r=   fit_loop
epoch_looptotal_batch_idxr+   r.   r.   r/   r]     s   zVocosExp.global_stepc                 G   s"   | j | jjkrd| _d S d| _d S )NTF)r]   r<   r   r(   )r+   argsr.   r.   r/   on_train_batch_start   s   

zVocosExp.on_train_batch_startc                    s4   d fdd	} j jr j| jd   _d S d S )N      ?c              	      sl    j jd }|  jjk rdS t|  jj ttd| jj  }tdddttjt| d |   S )Nr3   r   rS   g        r   g       @)	r=   r>   r<   r   floatmaxmathcospi)current_step
num_cyclesr>   progressr   r.   r/   mel_loss_coeff_decay'  s   *z9VocosExp.on_train_batch_end.<locals>.mel_loss_coeff_decayrS   )r   )r<   r   r)   r]   r   )r+   r   r   r.   r   r/   on_train_batch_end&  s   	zVocosExp.on_train_batch_end)r   r   r   r   FFFF)__name__
__module____qualname__r   r   r   intr   boolr    rG   rM   r   r   r   r   propertyr]   r   r   __classcell__r.   r.   r,   r/   r      s\    		
2W-'
r   c                       s   e Zd ZdZ							ddededed	ed
ededededede	de	de	de	f fddZ
 fddZ fddZ fddZ  ZS )VocosEncodecExpa  
    VocosEncodecExp is a subclass of VocosExp that overrides the parent experiment to function as a conditional GAN.
    It manages an additional `bandwidth_id` attribute, which denotes a learnable embedding corresponding to
    a specific bandwidth value of EnCodec. During training, a random bandwidth_id is generated for each step,
    while during validation, a fixed bandwidth_id is used.
    r   r   r   Fr   r   r   r   r   r   r   r   r   r   r   r   r   c                    sP   t  |||||||||	|
||| tt| jjd| _tt| jjd| _d S )N)num_embeddings)	r   r    r   r[   r   
bandwidthsr"   r   r#   r*   r,   r.   r/   r    <  s"   zVocosEncodecExp.__init__c                    s2   t jdt| jjd| jd}t j|d|i}|S )Nr   )rS   )lowhighsizer   bandwidth_id)r9   randintr[   r   r   r   r   r   r+   r   r   outputr,   r.   r/   r   _  s   zVocosEncodecExp.training_stepc                    s(   t jdg| jd}t j|d|i}|S )Nr   r   r   )r9   r   r   r   r   r   r,   r.   r/   r   d  s   zVocosEncodecExp.validation_stepc                    s   | j dkr;|d  ^ }}}| jj| jjd  | j|d d d d f }| jjd|d j	
  | j| jj t | d S )Nr   encodec)r   r   )r^   r   r   r   set_target_bandwidthr   r_   r`   ra   rb   rc   rf   r]   r<   r   r   r   )r+   r   rm   r   encodec_audior,   r.   r/   r   i  s   
z$VocosEncodecExp.validation_epoch_end)r   r   r   FFFF)r   r   r   __doc__r   r   r   r   r   r   r    r   r   r   r   r.   r.   r,   r/   r   4  sN    	
#r   )r   rf   r   pytorch_lightningplr9   r   r?   vocos.discriminatorsr   r   vocos.feature_extractorsr   vocos.headsr   vocos.helpersr   
vocos.lossr   r   r	   r
   vocos.modelsr   vocos.modulesr   LightningModuler   r   r.   r.   r.   r/   <module>   s"      $