o
    ϯia                     @   s  d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZd dlm	  m
Z
 d dlmZ d dlZzd dlZW n eyB   dZY nw zd dlm  mZ W n eyY   dZY nw zd dlmZ W n eym   dZY nw d dlmZmZmZ d dlmZ d dlmZ d dlmZmZm Z  d dl!m"Z" d d	l#m$Z$ d d
l%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+ d dl(m,Z,m-Z- d dl.m/Z/ dd Z0	dddZ1dd Z2d ddZ3dd Z4dd Z5dd Z6e7dkre5  dS dS )!    N)datetime)
GradScaler)create_model_and_transformstrace_modelcreate_model)get_data)
parse_args)	is_masterinit_distributed_deviceworld_info_from_env)setup_logging)	cosine_lr)train_one_epochevaluate)get_tar_path_from_dataset_namedataset_splitget_optimizer)load_pload_class_label)LinearProbec              
   C   s   t t||D ].}tjtj| jd| dr5ttj| jd| dtj| jd|d  d qtjtj| jd| drUttj| jd| d d S )N
epoch_top_.pt   )	reversedrangeospathexistsjoincheckpoint_pathrenameremove)argsstartidxall_idx_leni r&   O/home/ubuntu/.local/lib/python3.10/site-packages/laion_clap/training/lp_main.pymaintain_ckpts'   s    r(   Tc           
   
   C   sr  t | ttfrt| } t| ||||dS t | tr,tt|  } t| ||||dS t | tt	frdd |
 D }t|
 }t| |d}t|}||  t||d}|dd }||krh|| fS tt|D ]}	|||	  ||	 kr||	 |||	 < d|||	 < qntt|D ]#}	||	 rt||	t| t|tj|jd|	 d	  || fS q|| fS dS )
z
    Record the top-k performance of the current epoch.
    current_top_k_metrics is a dictionary of the form: {1: top_1_ckpt_measure, 2: top_2_ckpt_measure, ...}
    )r"   ckptbignumbetterc                 S      i | ]}|d qS )Fr&   ).0kr&   r&   r'   
<dictcomp>O   s    z,update_top_k_performance.<locals>.<dictcomp>)reverseNTr   r   )
isinstancelisttuplenpmeanupdate_top_k_performancedictvaluesfloatintkeyssortedcopydeepcopyappendr   lenr(   torchsaver   r   r   r   )
new_metrics_inputscurrent_top_k_ckpt_metricsr"   r)   r*   update_flagsorted_keyssorted_valuessorted_values_r%   r&   r&   r'   r6   5   s^   



r6   c                 C   s0   |  dp| dv p|  dp|  dp|  dS )Nclap_model.transformerzclap_model.positional_embeddingzclap_model.text_projectionclap_model.token_embeddingclap_model.ln_finalzclap_model.logit_scale_t
startswith)nr&   r&   r'   is_pretrained_paramsp   s   
rP   *   c                 C   s0   t | |  tj| |  t| |  d S N)rA   manual_seedr4   randomseed)rU   rankr&   r&   r'   random_seedz   s   rW   c                    sH  |j dkrd|_d|_d|_dd t|  }i }i }dd |D }|jr3td |D ]}d|_	q-|j
sd	d   fd
d fdd|D }fdd|D }	|jd u r^d }d }nA|d jj|j }
|jr3dD ]}dD ]}t||| d u rt||| t|| qqqm fdd|D }fdd|D } fdd|D }fdd|D }t|dd||jdg|j|j|jf|j|j|j d}t||j|j|
}t|dd||jdg|j|j|jf|j|j|j d}t||j|j|
}||d< ||d< ||d< ||d< |jr2t j!||  d}t j!||  d}t j"| # dd t j$|dd t j$|dd nlt|dd|	|jdg|j%|j&|j'f|j(|j)|j d|d< t|d |j%|j|
|d< |jrt j!|d |  d|d< t j"| # dd t j$|d dd nfdd|D }t||j*|j&|j'f|j(d|j d}||d< |||fS )Nadamr   c                 S   s
   |  dS )N
clap_modelrM   rO   pr&   r&   r'   <lambda>   s   
 z%config_lp_optimizer.<locals>.<lambda>c                 S   s:   g | ]\}}| d s|dv s| ds| dr|qS )rI   rJ   rK   rL   rM   r,   rO   r[   r&   r&   r'   
<listcomp>   s    z'config_lp_optimizer.<locals>.<listcomp>Freeze Text!!!!Fc                 S   s*   |j dk pd| v pd| v pd| v pd| v S )N   bnlnbiaslogit_scale)ndimrZ   r&   r&   r'   r\      s   
 c                    s    | | S rR   r&   rZ   excluder&   r'   r\      s    c                    $   g | ]\}} ||r|j r|qS r&   requires_gradr]   rf   r&   r'   r^      s    c                    rh   r&   ri   r]   includer&   r'   r^         $ train)lrbeta1beta2epswd)_new_pretrainedc                    ,   g | ]\}} ||r|j rt|r|qS r&   rj   rP   r]   rf   r&   r'   r^          c                    rv   r&   rw   r]   rk   r&   r'   r^      rx   c                    ,   g | ]\}} ||r|j rt|s|qS r&   rw   r]   rf   r&   r'   r^      rx   c                    ry   r&   rw   r]   rk   r&   r'   r^      rx   g        )paramsweight_decay)ro   betasrr   momentumoptimizer_nametextaudio)named_parameters)	root_rankclapc                    s$   g | ]\}} ||s|j r|qS r&   ri   r]   )in_clapr&   r'   r^     rm   g?lp)+	optimizerrs   wd_pretrainedwd_newr2   r   freeze_textlogginginforj   	lp_freeze
train_data
dataloadernum_batchesepochs	split_optgetattrsetattrr   lr_pretrainedbeta1_pretrainedbeta2_pretrainedeps_pretrainedmomentum_pretrainedr   warmuplr_new	beta1_new	beta2_neweps_newmomentum_newhorovodhvdDistributedOptimizerbroadcast_parameters
state_dictbroadcast_optimizer_statero   rp   rq   rr   r}   lp_lr)modeldatar"   r   r   	schedulertext_freeze_parametersr-   gain_or_bias_paramsrest_paramstotal_stepsxygain_or_bias_pretrained_paramsrest_pretrained_paramsgain_or_bias_new_paramsrest_new_paramspretrained_params_optimizerpretrained_params_schedulernew_params_optimizernew_params_scheduler	lp_paramslp_optimr&   )rg   r   rl   r'   config_lp_optimizer   s   
	














r   c            #         s8  t  } t| j | jdd| _t| j t| j tj	| j tj	
| j tj| j t| j| _| jd u rbdt dd| j d| j d| j d| j d| j g| _d	| _t \| _| _| _| jrt| r| jD ]2}t | D ]+}t!j"#d
| d| st!$d
| d|  t!%d| d| d| d| d	 q~qxd | _&t| | j'drt!j"| j(| j}t!j$|dd | j'rd| j nd}t!j"||| _&d}t!j"#| j&r|d7 }|d t)| }t!j$|dd | j'rd| j nd}t!j"||| _&t!j"#| j&s| j*rt+j,nt+j-| _.t/| j&| j. t0| }d| j1v p0d| j1v | _2d| j1v p<d| j1v | _3t| rt| j3rQt!j"| j(| jdnd| _4t!j"| j(| jd| _5| j4| j5fD ]}|rqt!j$|dd qend| _4d| _5| j6rt6|  | jdv sJ | jdkrt+7d | j8rt+9d| j: d| j d| j d | j d!	 n%| jrt+9d"| j: d| j d| j d | j d!	 n
t+9d#| j: d! t+9d$t!j";| j<  t=| j| j>| j?| j|| j@| jAt!j";| j<d	| jB| jC| jD| jEd%\}	}
tFtG| jH | _It+9d&| jJ  t+9d'| jK  t+9d(| jL  t+9d)| jI  t+9d*| jM  t+9d+| jN  t+9d,| jO  tP|	| jJ| jKd-| jI| jLd.}|Q|}| j8rtR  |S D ]
}|T|U  qmW d    n	1 sw   Y  | jVrtW|| j|d/}t| rt+9d0 t+9t)|	  t+9d1 t!j"| j(| jd2}tX|d3-}tYtZ| D ]}t[| |}t+9d4| d5|  |\| d5| d6 qW d    n	1 sw   Y  | jr| j8s| j]rtj^j_`|}i }| jard|d7< tj^jbjc|f|gdd8|}td| |
 tF s*J d9| jVr7d: vs7J d;te| | \}}}| jd<krItf nd }d}| jgd urt!j"h| jgrtji| jg|d=}d>|v r|d> }|d? }| jstjtk|l d md@rdAdB |l D }|n| | jor|d ur|l D ]\}}|n||dC dD   q|d ur|n|dD  |d urdE|v r|n|dE  t+9dF| jg dG| dH n|n| t+9dI| jg dG| dH | jprtqdJ |D ]}d	|_rqn	t+9dKs| jg dtt_ud	tt_v| j(o| j(w dLkot| | _xd }| jxr6| j3r6t3d us0J dMt3y| j4}| j2rt| rt2d usHJ dNt+*dO  d: jzj{| _|| j}d ura dP jzj{| _~t2jdQ| j| jg tZ| dR | j*rzt2j|ddS t2| t+*dT d: vrt| || | d S |dkrdP v r| jst| d| | | jrdUdB t| jD }t|| jD ]}|| jkrtqdV |D ]}d	|_rqt| rt+9dW|  t| ||||| | |d }t fdXdYdZD r| jst| || |}| jr| j| jfd[d\|l D } | jxrd]dB |l D }!|| j| d^}"|"|! |d urB| |"dE< || jksV| jdkrf|| j dkrft|"t!j"| j5d_| d` | jrvt|"t!j"| j5da | jr| jst| || |"ddb q| j2rt| rt2  d S d S d S )cN/-z%Y_%m_%d-%H_%M_%Slinear_probemodel_lr_b_j_p_Fz./json_files/z,aws s3 cp s3://s-laion-audio/webdataset_tar/z/sizes.json ./json_files/z/sizes.json)localT)exist_okzout-zout.logr   r   wandballtensorboard checkpoints)ampfp16fp32r   zIt is recommended to use AMP mixed-precision instead of FP16. FP16 support needs further verification and tuning, especially for train.zARunning in horovod mode with multiple processes / nodes. Device: z.Process (global: z, local z	), total .z=Running in distributed mode with multiple processes. Device: z&Running with a single process. Device zopenai cache dir: )
	precisiondevicejitforce_quick_geluopenai_model_cache_dirskip_paramspretrained_audiopretrained_textenable_fusionfusion_typezlinear probe using mlp: zlinear probe using freeze: zlinear probe act layer: zlinear probe out ch: z,linear probe learning rate (if applicable): zlinear probe loss func: zlinear probe lp_metrics: i   )mlpfreezein_chout_chact)
batch_sizer   zLinear Probe CLAP Model:zParams:z
params.txtwz  z: 
static_graph)
device_idsfind_unused_parametersz5At least one train or eval dataset must be specified.rn   zCannot train with traced modelr   )map_locationepochr   modulec                 S   s"   i | ]\}}|t d d |qS )zmodule.N)r@   r,   r-   vr&   r&   r'   r.     s   " zmain.<locals>.<dictcomp>_r   scalerz=> resuming checkpoint 'z	' (epoch )z=> loaded checkpoint 'r_   z=> no checkpoint found at '{}'nonezPlease install tensorboard.zPlease install wandb.zStarting wandb.valr   )projectnotesnametagsconfig)logzFinished loading wandb.c                 S   r+   )r   r&   )r,   r%   r&   r&   r'   r.   -  s    z8Text pretrained parameters are freezed since this epoch.zStart epoch c                 3   s    | ]}| v V  qd S rR   r&   )r,   r   )r   r&   r'   	<genexpr>=  s    zmain.<locals>.<genexpr>)r   zimagenet-valzimagenet-v2c                    s$   g | ]\}}|v r |v r|qS r&   r&   r   )top_k_datasettop_k_metricr&   r'   r^   B  s
    zmain.<locals>.<listcomp>c                 S   s"   i | ]\}}|d  d |  qS )r   r   )r   r   r&   r&   r'   r.   I  s    )r   r   r   epoch_r   zepoch_latest.pt)r*   )r   timesleepamodelreplacerT   rU   rA   rS   cudamanual_seed_allr4   r   class_label_pathclass_index_dictr   r   r   nowstrftimero   r   workersr   distributedr   
local_rankrV   
world_size
remotedatar	   datasetnamesr   r   r   r   makedirssystemlog_path	log_locallogsstrdebugr   DEBUGINFO	log_levelr   r
   	report_tor   r   tensorboard_pathr   copy_codebasewarningr   r   r   
expanduserr   r   tmodel
pretrainedtorchscriptr   r   r   r   r   r@   r2   r;   	lp_out_chlp_mlpr   lp_actr   lp_loss
lp_metricsr   tono_grad
parametersset_
contiguoustracer   openr<   varsr   writeuse_bn_syncnnSyncBatchNormconvert_sync_batchnormddp_static_graphparallelDistributedDataParallelr   r   r   resumeisfileloadnextiteritemsrN   load_state_dictr   r   printrj   formatcudnn	benchmarkdeterministiclower	save_logsSummaryWriterr   num_samplestrain_szval_dataval_szinitwandb_noteswatchrB   r   no_evalsave_top_performancer   r   freeze_text_afterr   anytop_k_checkpoint_select_datasettop_k_checkpoint_select_metricr   updatesave_frequencysave_most_recentr6   finish)#r"   dataset_namesplitlog_base_pathlog_filenamepostfixlog_base_path_newr   dirnamerY   clap_model_cfgr   paramparams_filefr   r   ddp_argsr   r   r   r   start_epoch
checkpointsdr-   o_writerrD   r   completed_epochmetricsfiltered_metricsopt_dictcheckpoint_dictr&   )r   r   r   r'   main%  s.  






















"





 

rb  c                 C   s   ddl m}m} tj| j| jd}tj|r"t	d| d dS t	d|  tj
t}tdD ]}tj|}q3||||d	d
dd t	d dS )Nr   )copytreeignore_patternscodez$Error. Experiment already exists at z). Use --name to specify a new experiment.r0   zCopying codebase to    r   r	  r   )ignorezDone copying code.r   )shutilrc  rd  r   r   r   r	  r   r   r3  realpath__file__r   rR  )r"   rc  rd  new_code_pathcurrent_code_pathr   r&   r&   r'   r  n  s    
r  __main__)T)rQ   r   )8r   r   rT   r   r=   numpyr4   rA   torch.backends.cudnnbackendsr5  torch.cuda.ampr   r   r   ImportErrortorch.utils.tensorboardutilsr   horovod.torchr   clap_moduler   r   r   training.datar   training.paramsr   training.distributedr	   r
   r   training.loggerr   training.schedulerr   training.lp_trainr   r   clap_module.utilsr   r   r   r   r   clap_module.linear_prober   r(   r6   rP   rW   r   rb  r  __name__r&   r&   r&   r'   <module>   sb    
;

 '  K
