o
    }oiZ'                     @   sX  d dl Z d dlmZmZ d dlZdd Zd dlZd dlmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ dZdZ dZ!d/ddZ"d0deee#  fddZ$d1ddZ%d1ddZ&dd Z'd d! Z(d"d# Z)d$d% Z*d&d' Z+d(d' Z+d)d* Z,d+d, Z-G d-d. d.Z.dS )2    N)ListOptionalc                   C   s   dt jd< d S )N0NVTE_APPLY_QK_LAYER_SCALING)osenviron r   r   ^/home/ubuntu/.local/lib/python3.10/site-packages/tests/lightning/test_nemo_resume_from_ckpt.pyset_env   s   r
   Path)OptimizerConfig)AttnBackend)llm)PreTrainingDataModule)get_nmt_tokenizer)ModelCheckpoint)CosineAnnealingScheduler)MegatronOptimizerModule)TimingCallbackzN/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_documentz3/home/TestData/nlp/megatron_gpt/data/gpt/vocab.jsonz3/home/TestData/nlp/megatron_gpt/data/gpt/merges.txtTc                    s~   ddl m} dd l dd lm  m} ddlm} t| |s"|| } || }| } fdd|j	
 D }|j||d |S )Nr   r   )FileSystemReaderc                    s4   i | ]\}}t |jd kr| j|j|jjdqS )TensorStorageMetadata)dtype)type__name__emptysize
propertiesr   ).0ktptorchr   r	   
<dictcomp>:   s
    zload_dcp.<locals>.<dictcomp>)storage_reader)pathlibr   r"   torch.distributed.checkpointdistributed
checkpointr   
isinstanceread_metadatastate_dict_metadataitemsload)ckpt_dirtorch_tensorr   dcpr   	fs_readermetadata
state_dictr   r!   r	   load_dcp.   s    

r4   pathc                 C   s  |d ur|ng }t | tr8t |tsJ t|  t| ks"J |  D ]}t| | || ||g  q&d S t | trkt |tsDJ t| t|ksNJ tt| |D ]\}\}}t|||d| dg  qUd S t | t	j
rd|}| j|jksJ d| d| j d|j | j|jksJ d| d| j d|j t	| |ksJ d| d|  d	| td
| tjd d S tdtt|  )N[].z	mismatch	z: different dtypes  z: different shape z: different values

zmatch	)filezUnexpected value type )r)   dictsetkeyscompare_ckptslistlen	enumeratezipr"   Tensorjoinr   shapeallprintsysstderr
ValueErrorstrr   )abr5   keyiaabbskeyr   r   r	   r?   G   s(   


**(r?         c              	   C   s.   d}t ddttd}t|d||d|dd}|S )N   megatronGPT2BPETokenizer)
vocab_filemerges_filei  z9999,1,1)paths
seq_lengthmicro_batch_sizeglobal_batch_sizeseed	tokenizersplit)r   
VOCAB_PATHMERGES_PATHr   )log_dirn_steps	data_pathgbsmbsr\   r`   datar   r   r	   
setup_data]   s"   	rj   c           
      C   sj   d}t jdddd|dddddd	d
tjd}t j||d}tdddddddd
d dd
d}t|d}	|||	fS )NrV   rT         rU   gZd;O?        gh㈵>RMSNormF)
num_layershidden_sizeffn_hidden_sizenum_attention_headsr\   init_method_stdhidden_dropoutattention_dropoutlayernorm_epsilonmake_vocab_size_divisible_bynormalizationmasked_softmax_fusionattention_backend)r`   adamg{Gz?g?g?gffffff?g:0yE>g      ?T)	optimizerlrweight_decay
adam_beta1
adam_beta2adam_eps	clip_gradlog_num_zeros_in_gradtimersbf16use_distributed_optimizer)config)r   	GPTConfigr   unfusedGPTModelr   r   )
rd   re   r`   rg   rh   r\   
gpt_configmodel
opt_configoptimr   r   r	   setup_model_optimr   s@   

r   c                 C   s   t jdddddddd}tdddddddddd		}|t g}t jdd
d||dddddt jddd}t j| dddd |d}||fS )NrU   TF
torch_dist)tensor_model_parallel_sizeckpt_include_optimizerckpt_parallel_loadckpt_parallel_save_optimckpt_async_savesave_ckpt_formatprogress_intervalreduced_train_loss
   z{step}-{epoch})		save_lastmonitor
save_top_kevery_n_train_stepsalways_save_contextsave_context_on_train_endsave_on_train_epoch_endsave_optim_on_train_endfilename(   gpu   rm   r   z
bf16-mixed)	precision)devices	max_stepsacceleratorstrategy	callbackslog_every_n_stepsval_check_intervallimit_val_batchesnum_sanity_val_stepsenable_checkpointingpluginsv1)rd   versionuse_datetime_versionupdate_logger_directorywandbckpt)nlMegatronStrategyr   r   TrainerMegatronMixedPrecision
NeMoLogger)rd   r   checkpoint_callbackr   trainernemo_loggerr   r   r	   setup_trainer_and_logger   sV   


	r   c                 C   s   |  |sJ | ||dS )NrU   )
startswithreplace)xoldnewr   r   r	   replace_first   s   r   c                 C   s    t | trJ ttdd | S )Nc                 S   s
   |  dS )Nmodule.)r   r   r   r   r	   <lambda>   s   
 z$extract_model_keys.<locals>.<lambda>)r)   r<   r@   filter	ckpt_keysr   r   r	   extract_model_keys   s   r   c                 C      t tdd | S )Nc                 S      t | ddS )Nr   zoptimizer.state.exp_avg.module.r   r   r   r   r	   r          z!prepend_exp_avg.<locals>.<lambda>r@   map
model_keysr   r   r	   prepend_exp_avg      r   c                 C   r   )Nc                 S   r   )Nr   z"optimizer.state.exp_avg_sq.module.r   r   r   r   r	   r      r   $prepend_exp_avg_sq.<locals>.<lambda>r   r   r   r   r	   prepend_exp_avg_sq   r   r   c                 C   r   )Nc                 S   r   )Nr   z"optimizer.state.fp32_param.module.r   r   r   r   r	   r      r   r   r   r   r   r   r	   r      r   c                    s   t t fdd|S )Nc                    s   |  v S Nr   r   r   r   r	   r      s    zhas_all_keys.<locals>.<lambda>)rG   r   )r   r>   r   r   r	   has_all_keys   s   r   c                  C   s6   dd l } dD ]}z| d| d W q   Y qd S )Nr   )r   r   /tmp/mcore_logs_zsteps/)shutilrmtree)r   stepsr   r   r	   teardown   s   r   c                   @   s    e Zd Zejddd ZdS )TestCkptStateRestorationGPUc                 C   s:  dd }t   tjd dksJ |dtjddd d}t| s$J |d	tjdd
|d ddg}ttdd |s>J t	tt
|}t|d  }t|dksUJ t|tt|d  kseJ |d  |d  fD ]}t|t|s|J t|t|sJ t|t|sJ qqt|d |d  t  d S )Nc              
   S   s   d|  d}t j|dd tg}t|| |ddd}dd	lm} |dd dddd
) t|| |j\}}}t|\}	}
t	j
|||	|
|d|d |	  W d    d S 1 sTw   Y  d S )Nr   r   T)exist_okrT   rU   )rg   rh   r   )/reconfigure_num_microbatches_calculator_manager)data_parallel_sizeri   )r   ri   r   logresumer`   r   )r   makedirs	DATA_PATHrj   &tests.lightning.mcore_microbatch_utilsr   r   r`   r   r   train	_teardown)re   r   rd   rf   ri   r   r   r   r   r   r   r   r   r	   r      s2   
	"z?TestCkptStateRestoration.test_resume_optim_state.<locals>.trainr   r   r   T)resume_if_existsresume_ignore_no_checkpointz>/tmp/mcore_logs_40steps/default/v1/checkpoints/step=29-epoch=0r   F)r   r   resume_from_pathzF/tmp/mcore_logs_40steps/default/v1/checkpoints/step=39-epoch=0/weightszF/tmp/mcore_logs_10steps/default/v1/checkpoints/step=39-epoch=0/weightsc                 S   s   t |  S r   )r   existsr   r   r   r	   r   .  r   zBTestCkptStateRestoration.test_resume_optim_state.<locals>.<lambda>r   rU   )r
   r   r   r   
AutoResumer   r   rG   r   r@   r4   r   r>   rA   r=   r   r   r   r?   r   )selftmp_pathr   resume_pathr[   ckptsr   r   r   r   r	   test_resume_optim_state   sD   	 
z0TestCkptStateRestoration.test_resume_optim_stateN)r   
__module____qualname__pytestmarkrun_only_onr   r   r   r   r	   r      s    
r   )Tr   )rT   rU   )/r   typingr   r   r   r
   rI   r%   r   r"   megatron.core.optimizerr   megatron.core.transformer.enumsr   nemo.lightning	lightningr   nemo.collectionsr   nemo.collections.llm.gpt.datar   3nemo.collections.nlp.modules.common.tokenizer_utilsr    nemo.lightning.pytorch.callbacksr   nemo.lightning.pytorch.optimr   %nemo.lightning.pytorch.optim.megatronr   nemo.utils.exp_managerr   r   rb   rc   r4   rL   r?   rj   r   r   r   r   r   r   r   r   r   r   r   r   r	   <module>   sD   


&4