import argparse
import os
from dataclasses import dataclass

import torch
from megatron.core.optimizer import OptimizerConfig

from nemo import lightning as nl
from nemo.collections import llm
from nemo.collections.llm.gpt.data.core import get_dataset_root
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from tests.collections.llm.common import Llama3ConfigCI


def get_args():
    parser = argparse.ArgumentParser(description="Finetune a small GPT model using NeMo 2.0")
    parser.add_argument('--restore_path', type=str, help="Path to model to be finetuned")
    parser.add_argument('--experiment_dir', type=str, help="directory to write results and checkpoints to")
    parser.add_argument('--peft', type=str, default='none', help="none | lora")
    # NOTE: the integer defaults below are assumed; the exact constants are not
    # recoverable from the bytecode dump this file was reconstructed from.
    parser.add_argument('--devices', type=int, default=1, help="number of devices")
    parser.add_argument('--max_steps', type=int, default=1)
    parser.add_argument('--mbs', type=int, default=1, help="micro batch size")
    parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size")
    parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size")
    parser.add_argument('--packed', action='store_true', help="use packed sequence dataset")
    parser.add_argument('--dataset', type=str, default='dolly', choices=['dolly', 'chat'], help="Dataset to use")

    return parser.parse_args()
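

# Example invocation; the paths and step count below are illustrative only, not
# the values used by CI:
#
#   python tests/collections/llm/gpt_finetuning.py \
#       --restore_path /ckpts/llama3_ci --experiment_dir /tmp/peft_exp \
#       --peft lora --devices 1 --max_steps 6 --mbs 1 --tp_size 1 --pp_size 1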


if __name__ == '__main__':
    args = get_args()

    strategy = nl.MegatronStrategy(
        tensor_model_parallel_size=args.tp_size,
        pipeline_model_parallel_size=args.pp_size,
        pipeline_dtype=torch.bfloat16,
        ckpt_load_strictness="log_all",
    )

    trainer = nl.Trainer(
        devices=args.devices,
        max_steps=args.max_steps,
        accelerator='gpu',
        strategy=strategy,
        plugins=nl.MegatronMixedPrecision(precision='bf16-mixed'),
        # The small logging/validation intervals below are assumed values; the
        # exact integers are not recoverable from the compiled constants.
        log_every_n_steps=1,
        limit_val_batches=2,
        val_check_interval=2,
        num_sanity_val_steps=0,
    )

    ckpt = nl.ModelCheckpoint(
        save_last=True,
        monitor="reduced_train_loss",
        save_top_k=1,
        save_on_train_epoch_end=True,
        save_optim_on_train_end=True,
    )

    logger = nl.NeMoLogger(
        log_dir=args.experiment_dir,
        use_datetime_version=False,  # keep a stable run dir so auto-resume can find checkpoints
        ckpt=ckpt,
    )

    adam = nl.MegatronOptimizerModule(
        config=OptimizerConfig(
            optimizer='adam',
            lr=1e-4,
            adam_beta2=0.98,
            use_distributed_optimizer=True,
            clip_grad=1.0,
            bf16=True,
        ),
    )

    # Resolve the PEFT scheme by name; 'none' is not in the registry, so full
    # finetuning is used in that case.
    if args.peft in llm.peft.PEFT_STR2CLS:
        peft = llm.peft.PEFT_STR2CLS[args.peft]()
    else:
        peft = None

    packed_sequence_specs = (
        PackedSequenceSpecs(packed_sequence_size=2048, tokenizer_model_name="dummy_tokenizer")
        if args.packed
        else None
    )

    if args.dataset == 'chat':
        assert not args.packed  # packed sequences are only exercised with the dolly dataset here
        data = llm.ChatDataModule(
            dataset_root=get_dataset_root('chat'),
            seq_length=2048,
            micro_batch_size=args.mbs,
            global_batch_size=8,  # assumed value
            num_workers=0,
            packed_sequence_specs=packed_sequence_specs,
        )
    else:
        data = llm.DollyDataModule(
            seq_length=2048,
            micro_batch_size=args.mbs,
            global_batch_size=8,  # assumed value
            num_workers=0,
            packed_sequence_specs=packed_sequence_specs,
        )

    # Datasets must have been staged under NEMO_HOME by the test environment.
    assert str(data.dataset_root).startswith(os.environ.get("NEMO_HOME"))

    tokenizer = get_nmt_tokenizer(tokenizer_model=os.path.join(args.restore_path, "dummy_tokenizer.model"))
    llama3_8b = llm.LlamaModel(Llama3ConfigCI(), tokenizer=tokenizer)

    resume = nl.AutoResume(
        restore_config=nl.RestoreConfig(path=args.restore_path),
        resume_if_exists=True,
    )

    llm.finetune(
        model=llama3_8b,
        data=data,
        trainer=trainer,
        peft=peft,
        log=logger,
        optim=adam,
        resume=resume,
    )
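    # The checks below imply a two-run CI flow (assumed): a first run trains
    # from scratch with a smaller --max_steps, then a second run with a larger
    # --max_steps reuses the same --experiment_dir and must auto-resume from
    # the checkpoint of the first. Resumed checkpoints embed the monitored
    # metric in their name, hence the "reduced_train_loss=" substring test.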
    if args.max_steps == 6:  # assumed initial-run step count; exact constant not recoverable
        print("Initial Training Succeeded")

    if args.max_steps == 8:  # assumed resume-run step count; exact constant not recoverable
        msg = (
            "Resume did not happen in this resume test.\n"
            "Hint: Scroll up and see whether 'Initial Training Succeeded' is printed out.\n"
            "If not, then the issue is not with ckpt resume."
        )
        assert "reduced_train_loss=" in str(trainer.ckpt_path), msg