o
    miK                     @   s   d dl mZ d dlZd dlZd dlZd dlmZ d dlZd dlmZ d dlm	Z	 ddl
mZmZ ddlmZ d d	lmZ dd
lT d dlZd dlmZ d dlZd dlmZ d dlmZmZmZmZ dd Zdd Zdd Zdd Zdd Z G dd dej!Z"dS )    )PathN)beartype)nn)CosineAnnealingLR   )get_dataloaderaudioDataset)get_optimizer)tensorboard)*)SpeechTokenizer)tqdm)AcceleratorDistributedTypeDistributedDataParallelKwargsDataLoaderConfigurationc                 C   s   | d uS N )valr   r   ?/home/ubuntu/SpeechTokenizer/speechtokenizer/trainer/trainer.pyexists   s   r   c                 c   s    	 | D ]}|V  qqr   r   )dldatar   r   r   cycle   s
   r   c                 C   s   t | ttfr	| S | fS r   )
isinstancetuplelist)tr   r   r   
cast_tuple!   s   r   c                 C   s.   |  D ]\}}| |d}|| | |< q| S )N        )itemsget)lognew_logskey	new_value	old_valuer   r   r   	accum_log%   s   r'   c                 C   s,   t dt| }t|dkrdS t|d S )zReturns the number of steps trained from a checkpoint based on the filename.

    Filename format assumed to be something like "/path/to/soundstorm.20000.pt" which is
    for 20k train steps. Returns 20000 in that case.
    z\d+r   )refindallstrlenint)checkpoint_pathresultsr   r   r   checkpoint_num_steps+   s   r0   c                       s   e Zd Zee fdededef fddZdd Zdd
dZdd Z	e
dd Ze
dd Ze
dd Ze
dd Zdd Zd defddZdd Zdd Z  ZS )!SpeechTokenizerTrainer	generatordiscriminatorsaccelerate_kwargsc                    sV  t    t }t|d |dd}|d _|d _|d _|d}t	| _
|d _|d	 _|d
 _|d _|d _|dd _d} j
 sf j
jddd tt j
 dd}	tj||	ddd W d    n1 sw   Y  t|d}
tdL|
|gd| _ jrttj|d _ | _!| _" #dt$dg |d _%|d _&|d _'|d _(|dd }|d!krdd"l)m*} |d#d$}|t+|d% _,nt- _,g  _.d$}t/t0 j%D ]-} j.1|d&| |d' j|d(| |d)| |d*|d+d, |d- }q|d&|d' j|d(|d)|d*|d.d, _2|d/}|d0}|d}| _t|d1}	|	3 }W d    n	1 sqw   Y  |d2}t|d1}	|	3 }W d    n	1 sw   Y  t4|||j5 jd3 _6t4| jd4 |j5 jdd5 _7 jrň 8d6t0 j6 d7t0 j7 d8 t0 j6 jksJ d9t0 j7 jksJ d:t0 j7 d;|d<d}|d=}t9 j6 jd||d> _:t9 j7d$ddd$d> _;|d? _<|d@ _=t>|? |d?|dA|dBdC _@t>tAjBdDdE  j"C D  |d?|dA|dBdC _D j j6E  | }tF j@|dF _GtF jD|dF _H jI j! j@ jD jG jH j: j;\ _! _@ _D _G _H _: _; fdGdH j"J D  _"| j j< j= jdI} jjKd|dJ tLdK _Md _Nd S )MNseedsplit_batchesF	log_stepsstdout_stepssave_model_stepsresults_foldernum_ckpt_keepepochsnum_warmup_steps
batch_sizesample_rateshowpiece_num   r   T)parentsexist_okz/config.jsonzw+   )ensure_asciiindent)r6   )dataloader_configkwargs_handlerslogsstepsr   mel_loss_lambdascommitment_loss_lambdarecon_loss_lambdadistill_loss_lambdadistill_typed_axist_axis)partial
lambda_simr   )rS   n_fftnum_melshop_sizewin_sizefminfmax_for_loss)rT   rU   r?   rV   rW   rX   fmax   rZ   segment_sizetrain_filesrvalid_files)	file_listr\   downsample_rater?      )r`   r\   ra   r?   validztraining with dataset of z/ samples and validating with randomly splitted z samplesz1dataset must have sufficient samples for trainingzEvalidation dataset must have sufficient number of samples (currently z) for training	drop_lastnum_workers)r>   shufflerd   re   learning_rateintial_learning_ratewdbetas)lrri   rj   c                 S   s   g | ]}|  qS r   )
parameters).0ir   r   r   
<listcomp>   s    z3SpeechTokenizerTrainer.__init__.<locals>.<listcomp>)T_maxc                       i | ]\}}| j |qS r   )acceleratorpreparerm   kvselfr   r   
<dictcomp>       z3SpeechTokenizerTrainer.__init__.<locals>.<dictcomp>)num_train_stepsr=   rg   initial_learning_rater<   )configinfr   )Osuper__init__r   torchmanual_seedr!   r7   r8   r9   r   r:   r;   r<   r=   r>   r?   r@   r   mkdiropenr+   jsondumpr   r   rr   is_mainr
   SummaryWriterospathjoinwriterr2   r3   register_bufferTensorrK   rL   rM   rN   	functoolsrR   t_axis_distill_lossdistill_lossd_axis_distill_lossmel_loss_kwargs_listranger,   append
mel_kwargs	readlinesr   ra   dsvalid_dsprintr   r   valid_dlrk   
initial_lrr	   rl   optim_g	itertoolschainvaluesoptim_d__len__r   scheduler_gscheduler_drs   r    init_trackersfloatbest_dev_mel_lossplot_gt_once)rx   r2   r3   cfgr4   
ddp_kwargsr6   r:   project_namefrG   rO   rR   rS   multrn   r\   r]   r>   train_file_listr_   valid_file_listrd   re   r{   hps	__class__rw   r   r   :   s   




 






$(
	


zSpeechTokenizerTrainer.__init__c              	      s   | j k r| _ t j j j d tt|j	
d}t| jkr5dd |d  j  D  t j j fdd j D  j  j  j  j  j d}t|| d S )Nz/SpeechTokenizer_best_dev.ptSpeechTokenizerTrainer_*c                 S   s   g | ]}t |qS r   )r   remove)rm   cr   r   r   ro      s    z/SpeechTokenizerTrainer.save.<locals>.<listcomp>c                    rq   r   )rr   get_state_dictrt   rw   r   r   ry      rz   z/SpeechTokenizerTrainer.save.<locals>.<dictcomp>)r2   r3   r   r   r   r   r   )r   r   saverr   r   r2   r:   sortedr   parentglobr,   r;   dictr3   r    r   
state_dictr   r   r   )rx   r   r   ckptspkgr   rw   r   r      s    
	zSpeechTokenizerTrainer.saveNTc                    s  t |stjd}t|d }jj}tj	|dd |
 d  fddj D }t fdd	|  |rj
 d
  j
 d  j
 d  j
 d  d  v rw d _jrwdj  tjt|d gjd_d S d S )Nr   r(   cpu)map_locationr2   c                    rq   r   )rr   unwrap_modelrt   rw   r   r   ry      rz   z/SpeechTokenizerTrainer.load.<locals>.<dictcomp>c                    s   | d   d | d  S )Nr   r3   r   )load_state_dict)kv)r   r   r   <lambda>   s    z-SpeechTokenizerTrainer.load.<locals>.<lambda>r   r   r   r   r   z The best dev mel loss before is r   )device)r   r   r:   r   r+   rr   r   r2   r   loadr   r3   r    mapr   r   r   r   keysr   r   r   tensorr0   r   rJ   )rx   r   restore_optimizerr   r2   r3   r   )r   rx   r   r      s&   
 zSpeechTokenizerTrainer.loadc                 C   s   | j | d S r   )rr   r   )rx   msgr   r   r   r   
  s   zSpeechTokenizerTrainer.printc                 C      | j jS r   )rr   r   rw   r   r   r   r        zSpeechTokenizerTrainer.devicec                 C   s   | j jtjko| j jdk S Nr   )rr   distributed_typer   NOnum_processesrw   r   r   r   is_distributed  s   z%SpeechTokenizerTrainer.is_distributedc                 C   r   r   )rr   is_main_processrw   r   r   r   r     r   zSpeechTokenizerTrainer.is_mainc                 C   r   r   )rr   is_local_main_processrw   r   r   r   is_local_main  r   z$SpeechTokenizerTrainer.is_local_mainc                 C   s,   || j k r| j| j| j | | j   S | jS r   )r=   r   rk   )rx   stepr   r   r   warmup  s   
zSpeechTokenizerTrainer.warmupr   c                 K   s   |dkr|  D ]\}}| jj|||d qd S |dkr4|  D ]\}}| jj||fd|i| q d S |  D ]\}}| jj|||d q8d S )Nfigure)global_stepaudior   )r    r   
add_figure	add_audio
add_scalar)rx   r   r   typekwargsru   rv   r   r   r   r"   #  s   zSpeechTokenizerTrainer.logc                    s   | j   tdd | j  i }t| j }|| jk r.| 	|}| j
jD ]}||d< q&n| j  | j  | j d }t| jD ]}| jrRtd| d | jD ]}t }|\ } d |   \}	}
| j  tt fdd| j }ttd	d |}| j| | j  | j  tt fd
d| j }t }tt fddt| j | j!}ttdd |}ttdd |}| "|
|}|| | |	| j#  || j$  | j%|  }| j| | j  t&|dt | i}| jr\|| j' s\t()  t* fi | j!d  }W d    n	1 s*w   Y  | d| d| d| dd|dd|	 dd| dd|d | j' dd i }| jr|| j+ s| j,| | | | |	 | || |d	|d | j-  | jr|| j. s|dkr| d d}d}d}| j /  t()  t0t1| j2D ]\}}|\ } d |   \}	}
t* fi | j!d  }||7 }| "|
| }||7 }| 3d7 }|| j4k re| j5s/| j,d|  d 6 7 id| j8|d t9 :dfi | j;}| j,d| t<|d 6 = id |d! | j,d"| d 6 7 id| j8|d t9:dfi | j;}| j,d#| t<|d 6 = id |d! q| j5snd$| _5| | d%|| dd&|| d | j,|| || d'|d W d    n	1 sw   Y  t>| j?d(|d) }| @|||  | | d*t>| j?  | j   |  jd7  _t| j }|| jk r| 	|}| jjD ]}||d< q| jjD ]}||d< qqU| j  | j  | j d }qUqD| d+ d S ),Nc                 S   s   |   S r   )traindiscr   r   r   r   1  s    z.SpeechTokenizerTrainer.train.<locals>.<lambda>rk   r   zEpoch:z	 start...r   c                    s   |    S r   )detachr   xx_hatr   r   r   L  s    c                 S   s   t | d d  S Nr[   )discriminator_lossr   r   r   r   r   M      c                    s
   |  S r   r   r   r   r   r   r   T  s   
 c                    s   | d t  fi | d  S )Nr   r   )mel_loss)mel_kr   r   r   r   V  rz   c                 S   s   t | dd   S r   )feature_lossr   r   r   r   r   W  r   c                 S   s   t | d S r   )adversarial_lossr   r   r   r   r   X  s    	time_costzEpoch z	 -- Step z: Gen Loss: z0.3fz; Mel Error:z
; Q Loss: z; Distill Loss: z; Time cost per step: s)	ztrain/discriminators lossztrain/generator lossztrain/feature lossztrain/adversarial lossztrain/quantizer lossztrain/mel lossztrain/mel errorztrain/distillation lossztrain/learning_rate)r   zValidation start ...r   zgroundtruth/x_r   )r   r?   r   zgroundtruth/x_spec_r   )r   r   zgenerate/x_hat_zgenerate/x_hat_spec_Tz: dev mel error: z	dev distill loss: )zdev/mel errorzdev/distillation lossSpeechTokenizerTrainer_08dz: saving model to ztraining complete)Ar2   r   r   r3   r   r-   rJ   itemr=   r   optimparam_groupsr   r   r   get_last_lrr   r<   r   r   r   time	unsqueezer   	zero_gradr   sumrr   backwardr   
recon_lossziprK   r   r   rL   rM   rN   r'   r8   r   inference_moder   r7   r"   wait_for_everyoner9   evalr   	enumerater   sizer@   r   r   r   r?   mel_spectrogramsqueezer   plot_spectrogramnumpyr+   r:   r   )rx   step_time_logrJ   rk   param_groupepochbatchticsemantic_featureloss_qfeaturediscriminator_outputsloss_disc_all
loss_reconloss_melloss_featureloss_adversarialloss_distillloss_generator_all	mel_errortotal_mel_errortotal_distill_lossnumrn   x_spec
x_hat_spec
model_pathr   r   r   r   .  s   










"*

V





****&



ezSpeechTokenizerTrainer.trainc                 C   s   |    |   d S r   )r   r   rw   r   r   r   continue_train  s   z%SpeechTokenizerTrainer.continue_train)NTr   )__name__
__module____qualname__r   r   r   r   r   r   r   propertyr   r   r   r   r   r"   r   r  __classcell__r   r   r   r   r1   9   s4     '




{r1   )#pathlibr   r)   r   r   r   r   r   torch.optim.lr_schedulerr   datasetr   r   	optimizerr	   torch.utilsr
   lossr   speechtokenizerr   r   r   
accelerater   r   r   r   r   r   r   r'   r0   Moduler1   r   r   r   r   <module>   s.    