o
    }oi V                     @   sP  U d dl Z d dlmZ d dlZd dlZd dlmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) dej*j+_,ej-ej.ej/ej0dZ1e2e3eej4 f e5d< dd Z6dd Z7e8dkr	 e7  dS dS )    N)Type)LearningRateMonitorRichModelSummary)TensorBoardLoggerWandbLogger)DistributedDataParallelConfig)OptimizerConfig)	lightning)llm)MockDataModulePreTrainingDataModule)get_nmt_tokenizer)
NeMoLogger)	callbacks)ModelCheckpoint)MegatronCommOverlapCallback)CosineAnnealingScheduler)MegatronOptimizerModule)RestoreConfig)TimingCallbackT)4B8B47B56Bmodel_optionsc                  C   sv  t jdt jd} |  }|jddtddd |jdd	d
d |jdd	dd | jdtdd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdd	d d!d" | jd#td$d%d | jd&tdd'd | jd(tdd)d | jd*tdd+d | jd,d-d.d/d0d1 | jd2d	d3d | jd4d	d5d | jd6tdd7d | jd8tdd9d | jd:tdd;d | jd<td=d | jd>td?d | jd@d	d dAd" | jdBd	d dCd" | jdDd	d dEd" | jdFtg dGdHdIdJ | jdKd	d dL | jdMttt	 dNdOdJ | jdPtd.dQdR | jdStd dTdR | jdUtdVd dTdW | jdXtdYdZd | jd[tdd d\dW | jd]tdd^d | jd_t
d`dad | jdbd	dcd | jddtdedfd | jdgtdhdid | jdjtdkdld | jdmd	d dnd" | jdod	d dL | jdptdqdrgdqdsdJ | jdtd	d dud" | jdvd	d dwd" | jdxtdkdyd | jdztd{d | jd|td}d | jd~d	d dd" | jdt
ddd | jdt
ddd | jdtddd | jdd	d dd" | jdtd dkdd | jdtd ddR | jdtdd dkgdd | jdt
ddd | jdt
dd | jdd	d dd" | jdd	d dd" | jdtddd |  S )z'Parse arguments for NMH model training.z'Train a nemotronh model using NeMo 2.0.)descriptionformatter_classz-dz--dataset-configNzCPath to the blended / weighted training dataset configuration YAML.)typedefaulthelpz--mock-data
store_truezZTrain with Mock data (for testing/debugging), either set this or provide a dataset config.)actionr   z--sftzSFT with SQUAD.z--dataset-dirzAbsolute path to the dataset directory. Defaults to using the absolute or relative paths (dataset_prefix) specified in the dataset config YAML.)r   r   z--num-nodes   z3Number of nodes to use for training, defaults to 1.z	--devicesz5Number of devices to use for training, defaults to 1.z--seq-lengthi    zTraining sequence lengthz--tensor-parallel-sizez+Order of tensor parallelism. Defaults to 1.z--pipeline-model-parallel-sizez-Order of pipeline parallelism. Defaults to 1.z--context-parallel-sizez,Order of context parallelism. Defaults to 1.z
--no-wandbFzDisable Wandb logging)r!   r   r   z--wandb-project	nemotronhzWandb project namez--wandb-run-idzWandb run identifierz--wandb-groupz3A unique string shared by all runs in a given groupz--wandb-job-typezA unique string representing a type of run, which is useful when you're grouping runs together into larger experiments using group.z--disable-checkpointingstore_falseTcreate_checkpoint_callbackz,Disable creating a ModelCheckpoint callback.)r!   r   destr   z--sequence-parallelz#Set to enable sequence parallelism.z--fp8zSet to enable FP8z--micro-batch-sizez,Micro-batch size for data-parallel training.z--global-batch-sizez\Global batch size for training. If set to None, infer it from the TP, CP, and PP parameters.z--grad-acc-batchesz/Number of batches to accumulate gradients over.z--max-stepsz*Number of training optimizer update steps.z--val-check-intervalzFNumber of steps between validation measurements and model checkpoints.z--grad-reduce-in-fp32zGradient reduce in FP32.z--fp8-wgradz?Faster option that is maybe less accurate (TBD) when using fp8.z--no-aligned-megatron-ddpz'Do not do aligned gradient updates etc.z--tp-comm-overlap-backend)ncclmpigloor'   z4TP communication backend to use. Defaults to 'nccl'.)r   choicesr   r   z--align-param-gather)r!   r   z--model-sizer   zModel architecture to usez--experiment-dirz4Directory to write model checkpoints and results to.)r   requiredr   z--vocab-filezPath to tokenizer vocab file.z--hf-tokenizer-nameznvidia/Nemotron-H-8B-Base-8K)r   r   r+   r   z--limit-val-batches   zNumber of validation stepsz--log-every-n-stepsz Number of steps between logging.z
--ckpt-dirzUDirectory to restore an initial checkpoint from. Use this for supervised fine-tuning.z--wdg{Gz?zWeight decay for optimizer.z--restore-optimizer-from-ckptzCRestore optimizer state from initial checkpoint. Defaults to False.z--seedi  zSet random seed for training.z	--workers   z*Number of workers to use for data loading.z--gc-intervalr   zeSet to a value > 0 if you want to synchronize garbage collection, will do gc every gc-interval steps.z--enable-preemptionzTEnable preemption hooks. If enabled this will save a checkpoint whenver slurm exits.z--ckpt-async-savez--ckpt-format
torch_distzarrzSpecify checkpoint format to use. Defaults to 'torch_dist', as 'zarr' is deprecated. Only use if resuming training from a zarr checkpoint.z--cross-entropy-loss-fusionzjUse the faster, but maybe less accurate fused form of cross entropy, which also has bf16 grads internally.z--no-fp32-residual-connectionzWIf set, turn off fp32 residual connections which may be faster but may impact accuracy.z--debug-ddp-parity-freqz:Set to value > 0 to debug DDP weight parity between ranks.z--hybrid-override-patternz]Override the hybrid override pattern in the config (specifies mamba layer ordering and type).z--num-layerszHIf set, override the number of layers specified in the requested config.z--log-parameters-and-shapesz8Log training parameters shapes and dtypes for debugging.z--lrga2U0*3?zLearning rate.z--min-lrgiUMu>z&Min learning rate in cosine annealing.z--warmup-stepsi	  z*Number of warmup steps in cosine annealingz--nsys-profilingaS  Enable targeted `nsys` profiling on the training loop for a defined step range. To actually get profiling output you must run the whole program with `nsys`. For example:  `nsys profile -s none -o output_report_name -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop  [regular python command here]`z--nsys-start-stepz%Start nsys profiling after this step.)r   r+   r   r   z--nsys-end-stepz#End nsys profiling after this step.z--nsys-ranks+z&Enable nsys profiling for these ranks.)r   nargsr+   r   r   z--clip-gradg      ?zGGrad clip value. Note that when using DDP this may need to be inflated.z--seq-len-interpolation-factorzAdjusts the linear scaling of ROPE (Rotary Position Embedding) for context extension. Set this factor relative to your base context length e.g., for an original context length of 8192 and an extended context length of 524288, use 524288/8192 = 64.z--overlap-param-gatherzOverlap the parameter gather with the optimizer step. This is currently disabled due to a NeMo bug when using DDP. Making this an option defaulting to False is a temporary solution until the bug is fixed.z--overlap-grad-reducez4Overlap the gradient reduce with the optimizer step.z--bucket-sizei   @zDDP bucket size.)argparseArgumentParserArgumentDefaultsHelpFormatteradd_mutually_exclusive_groupadd_argumentstrintsortedr   keysfloat
parse_args)parser
data_group r?   b/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/gpt/model/test_nemotronh.pyr<   0   s
  

r<   c                  C   s  t  } | jrtdd| jdd}ntd| jdd}| j}| jr+t| j| j|| j	|d}n%| j
r@tj| j| j| j|| j	ddid	}nt| j| j| j|| j| j	|d
}| j| jrWdnd| j| j d}| jrh| j|d< | jrp| j|d< | jtvr}td| j t| j d?i |}tj||jd}tddt t g}| jrt| j| j ddddd}|!| | j"r|!t#$  | j%dkr|!t#j&| j%d | j'r|!t#(  |!t)| j*| j+d | j,dkr|!t#j-| j,| j,d | j.r
| j/du r| j0}	n| j/}	|!t#j1| j2|	| j3dd g }
i }| j4sQ| j5rQt6d| j d| j7 d| j8 d| j9 d| d | j d!| j: d"| j d#| j; | j<| j=| j>| j5| j d$}|
!| ||d%< t?d&d'}||d(< |
!| t@d?d)| j i|}tAd| jB| jC| jDd*}tEjF|| j7| j8| j9tGjH| jdd| jI| jJd+d,}tEjK| jL| j;| j0d-||
|| jM| jNddtEjOd.tGjH| jD| j:rd/ndd0d1| j:rd2nd3d4d4d5	| j| jd6}|jP|dd7 tEjQddd| j | jRrtS| jRd| jTd8ndd9}|P|| tUd:| jVd;d<| jW| jXddd=}tY|j0| jZ| j[d>}t\||}|]| |^|| dS )@z"Main function to run NMH training.tiktokenTiktokenTokenizerT)library
model_name
vocab_fileuse_fasthuggingface)rC   rD   rF   )
seq_lengthmicro_batch_sizeglobal_batch_sizenum_workers	tokenizerpad_to_max_length)rH   rI   rJ   rL   rK   dataset_kwargs)pathsrH   rI   rJ   seedrK   rL   F)rH   distribute_saved_activationscross_entropy_loss_fusionfp32_residual_connectionhybrid_override_pattern
num_layerszInvalid model size: )rL      )	max_depth   )every_n_train_stepsdirpath
save_top_kalways_save_contextsave_optim_on_train_endsave_context_on_train_endr   )interval)bucket_sizetp_comm_bootstrap_backend)gc_interval_traingc_interval_valN)
start_stepend_stepranks	gen_shapeznemotronh-size-z-TPz-PPz-CPz-GBSz-MBSFP8z-SEQLENz-NODES)namegroupjob_typeidprojectsave_dirwandbdummy)rn   tensorboardlog_dir)check_for_nan_in_gradoverlap_grad_reduceoverlap_param_gathergrad_reduce_in_fp32log_all)ddptensor_model_parallel_sizepipeline_model_parallel_sizecontext_parallel_sizepipeline_dtypesequence_parallelckpt_load_optimizerckpt_save_optimizerckpt_async_savesave_ckpt_formatckpt_load_strictnessgpuz
bf16-mixedhybrid
tensorwiser"   maxmost_recent   )		precisionparams_dtyperv   fp8
fp8_recipefp8_amax_history_lenfp8_amax_compute_algonum_layers_at_start_in_bf16num_layers_at_end_in_bf16)devices	num_nodes	max_stepsacceleratorstrategyloggerr   log_every_n_stepslimit_val_batchesnum_sanity_val_stepsuse_distributed_samplerpluginsval_check_intervalenable_checkpointing)resume_if_exists)pathload_model_stateload_optim_state)r   resume_ignore_no_checkpointresume_past_endresume_from_directoryrestore_configadamg?gffffff?)	optimizerlr
adam_beta1
adam_beta2weight_decay	clip_graduse_distributed_optimizerbf16)r   warmup_stepsmin_lrr?   )_r<   rE   r   hf_tokenizer_namerJ   	mock_datar   rH   rI   workerssftr
   SquadDataModuler   dataset_dirrP   r}   rR   no_fp32_residual_connectionrT   rU   
model_sizer   
ValueError
MambaModelrL   r   r   r   r%   r   r   experiment_dirappendenable_preemptionnl_callbacksPreemptionCallbackdebug_ddp_parity_freqDdpParityCheckerlog_parameters_and_shapesParameterDebuggerr   r`   tp_comm_overlap_backendgc_intervalGarbageCollectionCallbacknsys_profilingnsys_end_stepr   NsysCallbacknsys_start_step
nsys_ranksno_wandbwandb_projectr   tensor_parallel_sizerz   r{   r   r   wandb_groupwandb_job_typewandb_run_idr   r   r   rt   ru   rv   nlMegatronStrategytorchbfloat16r   ckpt_formatTrainerr   r   r   MegatronMixedPrecisionsetup
AutoResumeckpt_dirr   restore_optimizer_from_ckptr   r   wdr   r   r   r   r   connectfit)argsrL   rJ   dataconfig_modifiers_initmamba_configmodelr   checkpoint_callbackr   loggersnemo_logger_kwargswandb_logger	tb_loggernemo_loggerrx   r   trainerresume
opt_configschedoptr?   r?   r@   mainF  s  












r   __main__)9r2   typingr   r   torch._dynamolightning.pytorch.callbacksr   r   lightning.pytorch.loggersr   r   megatron.core.distributedr   megatron.core.optimizerr   nemor	   r   nemo.collectionsr
   nemo.collections.llm.gpt.datar   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightningr   nemo.lightning.pytorchr   r    nemo.lightning.pytorch.callbacksr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.lightning.pytorch.optimr   %nemo.lightning.pytorch.optim.megatronr   'nemo.lightning.pytorch.strategies.utilsr   nemo.utils.exp_managerr   _dynamoconfigsuppress_errorsNemotronHConfig4BNemotronHConfig8BNemotronHConfig47BNemotronHConfig56Br   dictr7   	SSMConfig__annotations__r<   r   __name__r?   r?   r?   r@   <module>   sF   

   o
