o
    wi                     @   sR  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ dd Zedkr'e Zi Zdd e dD ed< eddej!edZ"ej#dkredddde"ddddZ$neej#ddddde"dej%d 	Z$e
j&j'j&j(d!d!d"d#d!dd$d%d%d&ddd'ej)ej)d(Z*e
j&j'j&j+e*e$j"d)Z'ej,d*d*dd+Z-ed,d'd-Z.e.e gZ/ej0d'd'd.Z1ed/d0d1d'd2d3Z2edd2ej3d&d4Z4ee2e4d5Z5ej6ej7ej3d6e-e/d*d7d7ej8d8d9d:	Z9ej:dureej;ej:d;d<Z<ndZ<eej;d1ej=e<d=Z>ee'e1e$e9e>e5d> dS dS )?    N)WandbLogger)OptimizerConfig)	lightning)llm)pretrain)MockDataModulePreTrainingDataModule)get_nmt_tokenizer)
NeMoLogger)ModelCheckpoint)WarmupAnnealingScheduler)MegatronOptimizerModule)4AssertOptimizerParamGroupsHaveAtLeastTwoWeightDecaysc                  C   s   t jdd} | jdtdd | jdtdd | jdtd	d | jd
tdd | jdtd dd | jdtdd | jdtd dd | jdtdd |  S )Nz%Train a small T5 model using NeMo 2.0)descriptionz	--devicesz%Number of devices to use for training)typehelpz--max-stepszNumber of steps to train forz--experiment-dirz-directory to write results and checkpoints toz--experiment-namezname of experimentz--wandb-projectzwandb project name)r   defaultr   z--data-pathzPath to data filez--vocab-pathzPath to vocab filez--index-mapping-dirz$directory to write index mappings to)argparseArgumentParseradd_argumentintstr
parse_args)parser r   j/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/collections/llm/megatron_t5_pretraining.pyget_args$   s   r   __main__c                 C   s   g | ]}d | dqS )z
<extra_id_>r   ).0ir   r   r   
<listcomp>7   s    r!   d   additional_special_tokensmegatronBertWordPieceCase)
vocab_filespecial_tokensmocki      @   i'  )
seq_lengthseq_length_decmicro_batch_sizeglobal_batch_size	tokenizernum_train_samplesnum_val_samplesnum_test_samplesi  z	99982,9,9)	pathsr+   r,   r-   r.   seedr/   splitindex_mapping_dir   i   i   gQ?g?gh㈵>T)
num_layersencoder_num_layershidden_sizeffn_hidden_sizenum_attention_headskv_channelsinit_method_stdhidden_dropoutattention_dropoutlayernorm_epsilonmake_vocab_size_divisible_bymax_position_embeddingsbf16params_dtypepipeline_dtype)r/      )tensor_model_parallel_sizepipeline_model_parallel_sizerF   i  )every_n_train_stepssave_optim_on_train_end)resume_if_existsresume_ignore_no_checkpointadamg-C6?Fg{Gz?)	optimizerlruse_distributed_optimizerrD   weight_decay)warmup_stepswarmup_ratio	max_stepsmin_lr)configlr_schedulergpu   z
bf16-mixed)	precision)	devicesrU   acceleratorstrategy	callbackslog_every_n_stepslimit_val_batchesval_check_intervalpluginsall)nameproject	log_model)re   use_datetime_versionlog_dirwandb)modelresumedatatrainerlogoptim)?r   torchlightning.pytorch.loggersr   megatron.core.optimizerr   nemor   nlnemo.collectionsr   nemo.collections.llm.apir   nemo.collections.llm.t5.datar   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr	   nemo.lightningr
    nemo.lightning.pytorch.callbacksr   )nemo.lightning.pytorch.optim.lr_schedulerr   %nemo.lightning.pytorch.optim.megatronr   tests.collections.llm.commonr   r   __name__argsr'   range
vocab_pathr/   	data_pathrm   r6   t5rk   T5Configbfloat16	t5_configT5ModelMegatronStrategyr^   checkpoint_callbackr_   
AutoResumerl   
opt_configrU   rX   optTrainerr\   MegatronMixedPrecisionrn   wandb_projectexperiment_namewandb_loggerexperiment_dirnemo_loggerr   r   r   r   <module>   s  





