o
    wi                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZmZ d dlmZ d dlmZ dd Zedkre ZdZeddej ej!dZ"eej#ddde"dZ$e
j%ddddedddddej&dZ'e
j(e'e$j"dZ)e* Z+ed d!d"Z,d#ej-fd$d%Z.ee.ej/e.ej0d&d'gd(Z1ej2d) Z3e,e1e ee3d!d*gZ4g Z5ed+d,Z6e57e6 ed-d.d/d0d!d1Z8ee8d2Z9ej:ej;ej2d3e+e5e4d4d)e3ej<d5d6d7
Z=eej>d8Z?ed!d!d9Z@ee)e$e=e?e@d:e9d; dS dS )<    N)TensorBoardLogger)OptimizerConfig)	lightning)llm)train)PreTrainingDataModule)get_nmt_tokenizer)
AutoResume
NeMoLogger)ModelCheckpointModelTrainingStateCallbackParameterDebugger)MegatronOptimizerModule)4AssertOptimizerParamGroupsHaveAtLeastTwoWeightDecaysc                  C   s   t jdd} | jdtdd | jdtdd | jdtd	d | jd
tdd | jdtdd | jdtdd | jdtdd | jddddd |  S )Nz&Train a small GPT model using NeMo 2.0)descriptionz	--devicesz%Number of devices to use for training)typehelpz--max-stepszNumber of steps to train forz--experiment-dirz-directory to write results and checkpoints toz--data-pathzPath to data filez--vocab-pathzPath to vocab filez--merges-pathzPath to merges filez--index-mapping-dirz$directory to write index mappings toz--no-masked-softmax-fusionstore_falsezDisable fusion of softmax.masked_softmax_fusion)actionr   dest)argparseArgumentParseradd_argumentintstr
parse_args)parser r   k/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/collections/llm/megatron_gpt_pretraining.pyget_args#   s   r    __main__i   megatronGPT2BPETokenizer)
vocab_filemerges_file    i  )paths
seq_lengthglobal_batch_sizeseed	tokenizer   i   i   gZd;O?g?gh㈵>   )
num_layershidden_sizeffn_hidden_sizenum_attention_headsr(   init_method_stdhidden_dropoutattention_dropoutlayernorm_epsilonmake_vocab_size_divisible_byr   )r+   i  T)every_n_train_stepssave_optim_on_train_end	precisionc                    s   dt jdd f fdd}|S )Ntensorreturnc                    s   | j  ksJ d S )N)dtype)r:   r9   r   r   verify_precision^   s   z1create_verify_precision.<locals>.verify_precision)torchTensor)r9   r>   r   r=   r   create_verify_precision]   s   rA   on_train_starton_train_end)param_fngrad_fnlog_on_hooks   )val_check_intervalstrictdummy)save_diradamga2U0*C?giUMu?F)	optimizerlrmin_lruse_distributed_optimizerbf16)configgpu   z
bf16-mixedr=   )
devices	max_stepsacceleratorstrategylogger	callbackslog_every_n_stepslimit_val_batchesrH   plugins)log_dir)resume_if_existsresume_ignore_no_checkpointdata)modelra   trainerlogresumer+   optim)Ar   r?   lightning.pytorch.loggersr   megatron.core.optimizerr   nemor   nlnemo.collectionsr   nemo.collections.llm.apir   nemo.collections.llm.gpt.datar   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightningr	   r
    nemo.lightning.pytorch.callbacksr   r   r   %nemo.lightning.pytorch.optim.megatronr   tests.collections.llm.commonr   r    __name__argsr(   
vocab_pathmerges_pathr+   	data_pathra   	GPTConfigr   
gpt_configGPTModelrb   MegatronStrategyrX   checkpoint_callbackr<   rA   bfloat16float32debuggerrV   rH   rZ   loggerstensorboard_loggerappend
opt_configoptTrainerrU   MegatronMixedPrecisionrc   experiment_dirnemo_loggerre   r   r   r   r   <module>   s   






