o
    }oi$                     @   sL  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ dZdZdZdZd dl m!Z! efddZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&d*dd Z'd*d!d"Z(d#d$ Z)d%d& Z*ej+,d'd(d) Z-dS )+    N)Callback)OptimizerConfig)	lightning)llm)train)PreTrainingDataModule)get_nmt_tokenizer)
AutoResume
NeMoLogger)CosineAnnealingScheduler)MegatronOptimizerModule)/reconfigure_num_microbatches_calculator_managerz3/home/TestData/nlp/megatron_gpt/data/gpt/vocab.jsonz3/home/TestData/nlp/megatron_gpt/data/gpt/merges.txtzN/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_documentz/tmp/nemo_exp/)	ProxyDictc                 C   s   dd l }||  d S )Nr   )shutilrmtree)exp_dirr    r   Z/home/ubuntu/.local/lib/python3.10/site-packages/tests/lightning/test_state_restoration.pyteardown*   s   r   c                   @   s    e Zd ZdddZdd	d
ZdS )ValidateOptStateRestorationtrainer
pl.Trainer	pl_modulepl.LightningModulereturnNc                 C   :   |j d j}t|ttfsJ dt|dksJ dd S Nr   !Expected state to be a dictionaryExpected state to be empty
optimizersstate
isinstancedictr   lenselfr   r   	opt_stater   r   r   on_fit_start1   s   z(ValidateOptStateRestoration.on_fit_startc                 C   s   |j d j}t|ttfsJ dt|dksJ d| D ]\}}| D ]}t	|dk
 s;t	|dk
 r=J q'qd S )Nr   r   zExpected a non-empty stateg      ?)r    r!   r"   r#   r   r$   itemsvaluestorchallitem)r&   r   r   
checkpointr'   keyvalparamr   r   r   on_load_checkpoint8   s   *z.ValidateOptStateRestoration.on_load_checkpointr   r   r   r   r   Nr   N)__name__
__module____qualname__r(   r2   r   r   r   r   r   0   s    
r   c                   @       e Zd ZdddZdd	d
ZdS )ValidateOptStateScratchInitr   r   r   r   r   Nc                 C   r   )Nr   z"Expected state to be a dictionary r   r   r%   r   r   r   r(   D      z(ValidateOptStateScratchInit.on_fit_startc                 C   r   r   r   r%   r   r   r   on_train_startI   r:   z*ValidateOptStateScratchInit.on_train_startr3   r5   r6   r7   r(   r;   r   r   r   r   r9   C   s    
r9   c                   @   r8   )ValidateModelScratchInitr   r   r   r   r   Nc                 C   s   |  D ]}|   qd S )N)
parametersdetachzero_r&   r   r   pr   r   r   r(   P   s   z%ValidateModelScratchInit.on_fit_startc                 C   sr   |  D ]}t|dksJ dqt  |  D ]}|tdd qW d    d S 1 s2w   Y  d S )Nr   z$Expected params (scratch) to be zero   )r>   r+   r,   no_gradfill_randomuniformrA   r   r   r   r;   T   s   
"z'ValidateModelScratchInit.on_train_startr3   r<   r   r   r   r   r=   O   s    
r=   c                   @   s*   e Zd ZdddZdd	d
ZdddZdS )ValidateModelRestorationr   r   r   r   r   Nc                 C   s$   |  D ]}|   qd| _d S )NF)r>   r?   r@   called_on_load_checkpointrA   r   r   r   r(   ]   s   
z%ValidateModelRestoration.on_fit_startc                 C   s
   d| _ d S )NT)rI   )r&   r   r   r.   r   r   r   r2   b   s   
z+ValidateModelRestoration.on_load_checkpointc                 C   sH   |  D ]}t|dkrJ dqt| dsJ | jdks"J dd S )Nr   z'Expected params (resume) to be non-zerorI   Tz*Expected to have called on_load_checkpoint)r>   r+   r,   hasattrrI   rA   r   r   r   r;   e   s   z'ValidateModelRestoration.on_train_startr3   r4   )r5   r6   r7   r(   r2   r;   r   r   r   r   rH   \   s    

rH   rC         c                 C   s(   t ddttd}ttd| |d|d}|S )NmegatronGPT2BPETokenizer)
vocab_filemerges_filerL   i  )paths
seq_lengthmicro_batch_sizeglobal_batch_sizeseed	tokenizer)r   
VOCAB_PATHMERGES_PATHr   	DATA_PATH)mbsgbsrR   rV   datar   r   r   
setup_datal   s   r]   c                 C   st   t jdddd|ddddddd d	}t j|| d
}ttdddddtddttdddttddd}||fS )NrK            gZd;O?g?gh㈵>F)
num_layershidden_sizeffn_hidden_sizenum_attention_headsrR   init_method_stdhidden_dropoutattention_dropoutlayernorm_epsilonmake_vocab_size_divisible_bymasked_softmax_fusion$virtual_pipeline_model_parallel_size)rV   adamg{Gz?T)	optimizerlrbf16use_distributed_optimizer2   giUMu?g      $@stepreduced_train_loss)	max_stepsmin_lrwarmup_stepsintervalmonitorconstant_steps)configlr_scheduler)	r   	GPTConfigGPTModelr   r   r   intmathceil)rV   rZ   r[   rR   
gpt_configmodeloptr   r   r   make_model_optim   s@   r   c                 C   s   t | |}t|j| |\}}tdd || |d: tj|ddt t t gddtj	ddd	}t
|||ttd
ddd dtdddd|d |  W d    d S 1 sTw   Y  d S )Nr   data_parallel_size
   gpurC   rK   
bf16-mixed	precisiondevicesrt   acceleratorstrategy	callbackslog_every_n_stepslimit_val_batchespluginsv1Tlog_dirversionuse_datetime_versionupdate_logger_directorywandb)resume_if_existsresume_ignore_no_checkpointr\   )r   r\   r   logresumerV   optim)r]   r   rV   r   nlTrainerMegatronStrategyr9   r=   MegatronMixedPrecisionr   r
   EXP_DIRr	   	_teardown)rZ   r[   num_devr\   r   r   r   r   r   r   run_train_from_scratch   sN   



"r   c                 C   s   t | |}t|j| |\}}tdd || |dD tj|ddt t t gddtj	ddd}dd	l
m} t|||d
|ttdddd dtddt ddd |  W d    d S 1 s^w   Y  d S )Nr   r   rC   r   rK   r   r   r   )RestoreConfigr\   r   Tr   FzJdefault/v1/checkpoints/default--None=0.0000-epoch=0-consumed_samples=20.0/)r   r   resume_from_path)r   r\   r   rV   r   r   r   )r]   r   rV   r   r   r   r   r   rH   r   'nemo.lightning.pytorch.strategies.utilsr   r   r
   r   r	   r   )rZ   r[   r   r\   r   r   r   r   r   r   r   run_resume_train   sR   




"r   GPUc                  C   s:   d\} }d}zt | || t| || W t  d S t  w )N)rC   rK   rC   )r   r   r   )rZ   r[   num_devicesr   r   r   test_optim_state_restoration   s   r   )rC   rK   rL   ).r   rF   pytestr+   lightning.pytorch.callbacksr   megatron.core.optimizerr   nemor   r   nemo.collectionsr   nemo.collections.llm.apir   nemo.collections.llm.gpt.datar   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightningr	   r
   )nemo.lightning.pytorch.optim.lr_schedulerr   %nemo.lightning.pytorch.optim.megatronr   &tests.lightning.mcore_microbatch_utilsr   rW   rX   rY   r   !megatron.core.optimizer.optimizerr   r   r   r9   r=   rH   r]   r   r   r   markrun_only_onr   r   r   r   r   <module>   s>   

%+
-