o
    ϯiPX                     @   s  d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZd dlm	  m
Z
 d dlmZ zd dlZW n ey>   dZY nw zd dlm  mZ W n eyU   dZY nw zd dlmZ W n eyi   dZY nw d dlmZmZmZ d dlmZ d dlmZmZmZ d dlmZ d dl m!Z! d d	l"m#Z# d d
l$m%Z%m&Z& d dl'm(Z(m)Z) dd Z*	dddZ+dd Z,dddZ-dd Z.dd Z/e0dkre.  dS dS )    N)datetime)
GradScaler)create_model_and_transformstrace_modelcreate_model)get_data)	is_masterinit_distributed_deviceworld_info_from_env)setup_logging)
parse_args)	cosine_lr)train_one_epochevaluate)dataset_splitget_optimizerc              
   C   s   t t||D ].}tjtj| jd| dr5ttj| jd| dtj| jd|d  d qtjtj| jd| drUttj| jd| d d S )N
epoch_top_.pt   )	reversedrangeospathexistsjoincheckpoint_pathrenameremove)argsstartidxall_idx_leni r"   L/home/ubuntu/.local/lib/python3.10/site-packages/laion_clap/training/main.pymaintain_ckpts$   s    r$   Tc           
   
   C   sr  t | ttfrt| } t| ||||dS t | tr,tt|  } t| ||||dS t | tt	frdd |
 D }t|
 }t| |d}t|}||  t||d}|dd }||krh|| fS tt|D ]}	|||	  ||	 kr||	 |||	 < d|||	 < qntt|D ]#}	||	 rt||	t| t|tj|jd|	 d	  || fS q|| fS dS )
z
    Record the top-k performance of the current epoch.
    current_top_k_metrics is a dictionary of the form: {1: top_1_ckpt_measure, 2: top_2_ckpt_measure, ...}
    )r   ckptbignumbetterc                 S      i | ]}|d qS )Fr"   ).0kr"   r"   r#   
<dictcomp>L   s    z,update_top_k_performance.<locals>.<dictcomp>)reverseNTr   r   )
isinstancelisttuplenpmeanupdate_top_k_performancedictvaluesfloatintkeyssortedcopydeepcopyappendr   lenr$   torchsaver   r   r   r   )
new_metrics_inputscurrent_top_k_ckpt_metricsr   r%   r&   update_flagsorted_keyssorted_valuessorted_values_r!   r"   r"   r#   r2   2   s^   



r2   c                 C   s0   |  dp| dv p|  dp|  dp|  dS )Ntransformer)positional_embeddingtext_projectiontoken_embeddingln_finallogit_scale_t)
startswith)nr"   r"   r#   is_pretrained_paramsm   s   
rM   *   c                 C   s0   t | |  tj| |  t| |  d S N)r=   manual_seedr0   randomseed)rR   rankr"   r"   r#   random_seedw   s   rT   c            .         s   t  } | jdd| _t| j t| j tj| j tj| j t	j| j | j
dks;| j
dks;| j
dkrI| jdksI| jd u sIJ d| jd u rsdt dd	| j d
| j d| j d| j d| j g| _d| _t \| _| _| _| jrt| r| jD ]2}t| D ]+}tj d| d| st!d| d|  t"d| d| d| d| d	 qqd | _#t| | j$drtj| j%| j}tj!|dd | j$rd| j nd}tj||| _#tj | j#rt&d dS | j'rt(j)nt(j*| _+t,| j#| j+ t-| }d| j.v pd| j.v | _/d| j.v p!d| j.v | _0t| rY| j0r6tj| j%| jdnd| _1tj| j%| jd| _2| j1| j2fD ]}|rVtj!|dd qJnd| _1d| _2| j3rgt3|  | jdv soJ | jdkrzt(4d  | j5rt(6d!| j7 d"| j d#| j d$| j d%	 n%| jrt(6d&| j7 d"| j d#| j d$| j d%	 n
t(6d'| j7 d% t(6d(tj8| j9  t:| j| j
| j| j|| j;| j<tj8| j9d| j=| j>| j?| j@d)\}}| j5rtA  |B D ]
}	|	C|	D  qW d    n	1 sw   Y  | jErtF|| j|d*}t| ryt(6d+ t(6tG|  t(6d, tj| j%| jd-}
tH|
d.-}tItJ| D ]}tK| |}t(6d/| d0|  |L| d0| d1 qJW d    n	1 stw   Y  | jr| j5s| jMrtjNjOP|}i }| jQrd|d2< tjNjRjS|f|gdd3|}tT| | tU sJ d4| jErd5 vsJ d6d7d8 fd9d8tV|W }d:d; |D }| jXrt&d< |D ]}d|_Yqfd=d;|D }fd>d;|D }| jZd?kr	d@| _[d@| _\d@| _]| j^d u rd }d }n d5 j_j`| ja }| jbrdAD ]}dBD ]}tK| || d u r?tc| || tK| | q(q$fdCd;|D }fdDd;|D }fdEd;|D }fdFd;|D }td|dGdH|| j\dHg| je| jf| jgf| jh| ji| jZdI}tj|| je| jk|}td|dGdH|| j]dHg| jl| jm| jnf| jo| jp| jZdI}tj|| jl| jk|} ||dJ}|| dJ}| j5rtqjr||W dK}tqjr||W dK}tqjs|t d@dL tqju|d@dL tqju|d@dL nAtd|dGdH|| j[dHg| j| jv| jwf| jx| jy| jZdI}tj|| j| jk|}| j5r(tqjr||W dK}tqjs|t d@dL tqju|d@dL | jdMkr1tz nd }!d@}"| j{d urtj|| j{rtj}| j{|dN}#dO|#v r|#dO }"|#dP }$| jsst~t|$ d@ dQrsdRdS |$ D }$||$ | jbr|d ur| D ]\}}%|%|#|dT dU   q|d ur||#dU  |!d urdV|#v r|!|#dV  t(6dW| j{ dX|" dY n||# t(6dZ| j{ dX|" dY | jXrt&d< |D ]}d|_Yqn	t(6d[| j{ dt_dt_| j%o| j% d\kot| | _d }&| jr| j0rt0d usJ d]t0| j1}&| j/rmt| rmt/d us0J d^t('d_  d5 j_j| _| jd urI d` j_j| _t/jdada| j| jg tJ| db | j'rct/j|ddc t/|
 t('dd d5 vr|t| |"| |& d S |"d@krd` v r| jst| d@| |& | jrdedS t| jD }'t|"| jaD ]}(|(| jkrt&df |D ]}d|_Yqt| rt(6dg|(  t| |(||!|| |& |(dh })t fdidjdkD r| jst| |)| |&}*| jr| j| jfdld;|* D }+| jr{| jbrdmdS | D },ndU|t i},|)| j|t dn}-|-|, |!d ur6|!t |-dV< |)| jaksJ| jd@krZ|)| j d@krZt|-tj| j2do|) dp | jrjt|-tj| j2dq | jr{| js{t|+|'| |-ddr q| j/rt| rt/  d S d S d S )sN/-bertrobertabart zBbert/roberta/bart text encoder does not support pretrained models.z%Y_%m_%d-%H_%M_%Smodel_lr_b_j_p_Fz./json_files/z,aws s3 cp s3://s-laion-audio/webdataset_tar/z/sizes.json ./json_files/z/sizes.json)localT)exist_okzout-zout.logzLError. Experiment already exists. Use --name {} to specify a new experiment.r,   wandballtensorboardcheckpoints)ampfp16fp32rg   zIt is recommended to use fp32 mixed-precision instead of FP16 and AMP in this model. They will cause NaN loss and NaN gradients. FP16 and AMP support needs further verification and tuning, especially for train.zARunning in horovod mode with multiple processes / nodes. Device: z.Process (global: z, local z	), total .z=Running in distributed mode with multiple processes. Device: z&Running with a single process. Device zopenai cache dir: )
	precisiondevicejitforce_quick_geluopenai_model_cache_dirskip_paramspretrained_audiopretrained_textenable_fusionfusion_type)
batch_sizerk   zModel:zParams:z
params.txtwz  z: 
static_graph)
device_idsfind_unused_parametersz5At least one train or eval dataset must be specified.trainzCannot train with traced modelc                 S   s*   |j dk pd| v pd| v pd| v pd| v S )N   bnlnbiaslogit_scale)ndimrL   pr"   r"   r#   <lambda>  s   
 zmain.<locals>.<lambda>c                    s    | | S rO   r"   r   excluder"   r#   r   "  s    c                 S   s   g | ]
\}}d |v r|qS )text_branchr"   r(   rL   r   r"   r"   r#   
<listcomp>'  s
    zmain.<locals>.<listcomp>zFreeze Text!!!!c                    $   g | ]\}} ||r|j r|qS r"   requires_gradr   r   r"   r#   r   2  s    c                    r   r"   r   r   includer"   r#   r   5  s   $ adamr   )lrbeta1beta2epswd)_new_pretrainedc                    ,   g | ]\}} ||r|j rt|r|qS r"   r   rM   r   r   r"   r#   r   I      c                    r   r"   r   r   r   r"   r#   r   N  r   c                    ,   g | ]\}} ||r|j rt|s|qS r"   r   r   r   r"   r#   r   S  r   c                    r   r"   r   r   r   r"   r#   r   X  r   g        )paramsweight_decay)r   betasr   momentumoptimizer_name)
pretrainednew)named_parameters)	root_rankrf   )map_locationepoch
state_dictmodulec                 S   s"   i | ]\}}|t d d |qS )zmodule.N)r<   r(   r)   vr"   r"   r#   r*     s   " zmain.<locals>.<dictcomp>_	optimizerscalerz=> resuming checkpoint 'z	' (epoch )z=> loaded checkpoint 'z=> no checkpoint found at '{}'nonezPlease install tensorboard.zPlease install wandb.zStarting wandb.valclap)entityprojectnotesnametagsconfig)logzFinished loading wandb.c                 S   r'   )r   r"   )r(   r!   r"   r"   r#   r*     s    z8Text pretrained parameters are freezed since this epoch.zStart epoch r   c                 3   s    | ]}| v V  qd S rO   r"   )r(   r   )datar"   r#   	<genexpr>
  s    zmain.<locals>.<genexpr>)r   zimagenet-valzimagenet-v2c                    s$   g | ]\}}|v r |v r|qS r"   r"   r   )top_k_datasettop_k_metricr"   r#   r     s
    c                 S   s"   i | ]\}}|d  d |  qS )r   r   )r   r   r"   r"   r#   r*     s    )r   r   r   epoch_r   zepoch_latest.pt)r&   )r   amodelreplacerQ   rR   r=   rP   cudamanual_seed_allr0   tmodelr   r   r   r   nowstrftimer   rt   workersrj   distributedr
   
local_rankrS   
world_size
remotedatar   datasetnamesr   r   r   r   makedirssystemlog_path	log_locallogsprintdebugloggingDEBUGINFO	log_levelr   r	   	report_torb   rd   tensorboard_pathr   copy_codebasewarninghorovodinfork   
expanduserrn   r   torchscriptrm   rp   rq   rr   rs   no_grad
parametersset_
contiguoustracer   stropenr8   varsgetattrwriteuse_bn_syncnnSyncBatchNormconvert_sync_batchnormddp_static_graphparallelDistributedDataParallelr   r<   r.   r   freeze_textr   r   r   wd_pretrainedwd_new
train_data
dataloadernum_batchesepochs	split_optsetattrr   lr_pretrainedbeta1_pretrainedbeta2_pretrainedeps_pretrainedmomentum_pretrainedr   warmuplr_new	beta1_new	beta2_neweps_newmomentum_newhvdDistributedOptimizerbroadcast_parametersr   broadcast_optimizer_stater   r   r   r   r   resumeisfileloadnextiteritemsrK   load_state_dictformatcudnn	benchmarkdeterministiclower	save_logsSummaryWriternum_samplestrain_szval_dataval_szinitwandb_noteswatchr>   r   no_evalsave_top_performancer   freeze_text_afterr   anytop_k_checkpoint_select_datasettop_k_checkpoint_select_metricupdatesave_frequencysave_most_recentr2   finish).r   dataset_namesplitlog_base_pathlog_filenamerk   dirnamemodel	model_cfgparamparams_filefr   r   ddp_argsr   text_freeze_parametersr)   gain_or_bias_paramsrest_paramsr   	schedulertotal_stepsxygain_or_bias_pretrained_paramsrest_pretrained_paramsgain_or_bias_new_paramsrest_new_paramspretrained_params_optimizerpretrained_params_schedulernew_params_optimizernew_params_schedulerr   start_epoch
checkpointsdo_writerr@   r   completed_epochmetricsfiltered_metricsopt_dictcheckpoint_dictr"   )r   r   r   r   r   r#   main}   s  

































"







rH  c                 C   s   ddl m}m} tj| j| jd}tj|r"t	d| d dS t	d|  tj
t}tdD ]}tj|}q3||||d	d
dd t	d dS )Nr   )copytreeignore_patternscodez$Error. Experiment already exists at z). Use --name to specify a new experiment.r,   zCopying codebase to    r   r   rb   )ignorezDone copying code.r   )shutilrI  rJ  r   r   r   r   r   r   r   realpath__file__r   r(  )r   rI  rJ  new_code_pathcurrent_code_pathr   r"   r"   r#   r   @  s    
r   __main__)T)rN   r   )1r   r   rQ   r   r9   numpyr0   r=   torch.backends.cudnnbackendsr  torch.cuda.ampr   rb   ImportErrortorch.utils.tensorboardutilsrd   horovod.torchr  clap_moduler   r   r   training.datar   training.distributedr   r	   r
   training.loggerr   training.paramsr   training.schedulerr   training.trainr   r   clap_module.utilsr   r   r$   r2   rM   rT   rH  r   __name__r"   r"   r"   r#   <module>   sZ    
;

   F
