o
    }oi l                     @   sj  d dl Z d dlmZ d dlmZ d dlZd dlmZmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 dej7j8_9dd Z:dd Z;e<dkr	 e;  dS dS )    N)asdict)Type)LearningRateMonitorRichModelSummary)TensorBoardLoggerWandbLogger)DistributedDataParallelConfig)OptimizerConfig)	lightning)llm)MockDataModulePreTrainingDataModule)parse_dataset_config)Evo2DatasetEvo2DatasetPadEodLossMask)HYENA_MODEL_OPTIONS)/userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192.userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192)get_nmt_tokenizer)
NeMoLogger)	callbacks)ModelCheckpoint)FLOPsMeasurementCallback)MegatronCommOverlapCallback)CosineAnnealingScheduler)MegatronOptimizerModule)RestoreConfig)TimingCallbackTc                  C   s  t jdt jd} | jdd}|jddtdd |jd	d
dd | jdtdd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdd
dd d! | jd"td#d$d | jd%td&d'd | jd(td&d)d | jd*td&d+d | jd,d
d-d | jd.d
d/d | jd0tdd1d | jd2td&d3d | jd4tdd5d | jd6td7d | jd8td9d | jd:d
dd;d! | jd<d
dd=d! | jd>d
dd?d! | jd@d
ddA | jdBtg dCdDdEdF | jdGd
ddA | jdHttt	 dIdJdF | jdKd
ddLd! | jdMtddNdO | jdPtdQdRd | jdStdddTdU | jdVtd&dWd | jdXt
dYdZd | jd[d
d\d | jd]d
dd^d! | jd_td`dad | jdbtdcddd | jdetdfdgd | jdhd
ddid! | jdjd
ddA | jdktdldmgdldndF | jdod
ddpd! | jdqd
ddrd! | jdsd
ddtd! | jdutdfdvd | jdwtdxd | jdytdzd | jd{d
dd|d! | jd}d
dd~d! | jdt
ddd | jdt
ddd | jdtddd | jdd
ddd! | jdtddfdd | jdtdddO | jdd
ddd! | jdtdddfgdd | jdtdd | jdddddd | jdt
ddd | jdt
dd | jdd
ddd! | jdd
ddd! | jdt
ddd | jdt
ddd | jdd}|jdd
ddA |jdd
ddA |  S )z(Parse arguments for Evo2 model training.z#Train a Hyena model using NeMo 2.0.)descriptionformatter_classT)requiredz-dz--dataset-configzCPath to the blended / weighted training dataset configuration YAML.)typehelpz--mock-data
store_truezZTrain with Mock data (for testing/debugging), either set this or provide a dataset config.)actionr"   z--dataset-dirzAbsolute path to the dataset directory. Defaults to using the absolute or relative paths (dataset_prefix) specified in the dataset config YAML.z--num-nodes   z3Number of nodes to use for training, defaults to 1.)r!   defaultr"   z	--devicesz5Number of devices to use for training, defaults to 1.z--seq-lengthi    zTraining sequence lengthz--tensor-parallel-sizez+Order of tensor parallelism. Defaults to 1.z--pipeline-model-parallel-sizez-Order of pipeline parallelism. Defaults to 1.z--context-parallel-sizez,Order of context parallelism. Defaults to 1.z
--no-wandbFzDisable Wandb logging)r$   r&   r"   z--wandb-project	nemo_evo2zWandb project namez--wandb-run-idNzWandb run identifierz--wandb-groupz3A unique string shared by all runs in a given groupz--wandb-job-typezA unique string representing a type of run, which is useful when you're grouping runs together into larger experiments using group.z--sequence-parallelz#Set to enable sequence parallelism.z--fp8zSet to enable FP8z--micro-batch-sizez,Micro-batch size for data-parallel training.z--global-batch-sizez\Global batch size for training. If set to None, infer it from the TP, CP, and PP parameters.z--grad-acc-batchesz/Number of batches to accumulate gradients over.z--max-stepsz*Number of training optimizer update steps.z--val-check-intervalzFNumber of steps between validation measurements and model checkpoints.z--grad-reduce-in-fp32zGradient reduce in FP32.z--fp8-wgradz?Faster option that is maybe less accurate (TBD) when using fp8.z--no-aligned-megatron-ddpz'Do not do aligned gradient updates etc.z--use-megatron-comm-overlap-8k)r$   r&   z--tp-comm-overlap-backend)ncclmpigloor(   z4TP communication backend to use. Defaults to 'nccl'.)r!   choicesr&   r"   z--align-param-gatherz--model-size7bzModel architecture to use, choose between 7b, 40b, or test (a sub-model of 4 layers, less than 1B parameters). '_arc_1m' models have GLU / FFN dimensions that support 1M context length when trained with TP<=8.z--add-bias-outputz?Add bias to the output layer to enable learning a simple prior.z--experiment-dirz4Directory to write model checkpoints and results to.)r!   r    r"   z--limit-val-batches   zNumber of validation stepsz--log-every-n-stepsz Number of steps between logging.)r!   r&   r    r"   z
--ckpt-dirzUDirectory to restore an initial checkpoint from. Use this for supervised fine-tuning.z--wdg{Gz?zWeight decay for optimizer.z--restore-optimizer-from-ckptzCRestore optimizer state from initial checkpoint. Defaults to False.z--no-average-in-collectivezSAvaerage optimizer state in collective rather than dividing by dp size and summing.z--seedi  zSet random seed for training.z	--workers   z*Number of workers to use for data loading.z--gc-intervalr   zeSet to a value > 0 if you want to synchronize garbage collection, will do gc every gc-interval steps.z--enable-preemptionzTEnable preemption hooks. If enabled this will save a checkpoint whenver slurm exits.z--ckpt-async-savez--ckpt-format
torch_distzarrzSpecify checkpoint format to use. Defaults to 'torch_dist', as 'zarr' is deprecated. Only use if resuming training from a zarr checkpoint.z--eod-pad-in-loss-maskzRDo not predict EOD/Pad tokens (typical default, but not default in original evo2).z--cross-entropy-loss-fusionzjUse the faster, but maybe less accurate fused form of cross entropy, which also has bf16 grads internally.z--no-fp32-residual-connectionzWIf set, turn off fp32 residual connections which may be faster but may impact accuracy.z--debug-ddp-parity-freqz:Set to value > 0 to debug DDP weight parity between ranks.z--hybrid-override-patternz]Override the hybrid override pattern in the config (specifies hyena layer ordering and type).z--num-layerszHIf set, override the number of layers specified in the requested config.z--tflops-callbackzGEnable tflops calculation callback for Hyena / Evo2. Defaults to False.z--log-parameters-and-shapesz8Log training parameters shapes and dtypes for debugging.z--lrga2U0*3?zLearning rate.z--min-lrgiUMu>z&Min learning rate in cosine annealing.z--warmup-stepsi	  z*Number of warmup steps in cosine annealingz--nsys-profilingaS  Enable targeted `nsys` profiling on the training loop for a defined step range. To actually get profiling output you must run the whole program with `nsys`. For example:  `nsys profile -s none -o output_report_name -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop  [regular python command here]`z--nsys-start-stepz%Start nsys profiling after this step.)r!   r    r&   r"   z--nsys-end-stepz#End nsys profiling after this step.z--no-renormalize-lossz$Do not renormalize the loss weights.z--nsys-ranks+z&Enable nsys profiling for these ranks.)r!   nargsr    r&   r"   z,--activation-checkpoint-recompute-num-layersz5If set, override the default value set in the config.z--disable-checkpointingstore_falsecreate_checkpoint_callbackz,Disable creating a ModelCheckpoint callback.)r$   r&   destr"   z--clip-gradg      ?zGGrad clip value. Note that when using DDP this may need to be inflated.z--seq-len-interpolation-factorzAdjusts the linear scaling of ROPE (Rotary Position Embedding) for context extension. Set this factor relative to your base context length e.g., for an original context length of 8192 and an extended context length of 524288, use 524288/8192 = 64.z--overlap-param-gatherzOverlap the parameter gather with the optimizer step. This is currently disabled due to a NeMo bug when using DDP. Making this an option defaulting to False is a temporary solution until the bug is fixed.z--overlap-grad-reducez4Overlap the gradient reduce with the optimizer step.z--hidden-dropoutg        z(Dropout probability for the hyena layersz--attention-dropoutz-Dropout probability for the attention layers.z--no-activation-checkpointingz$--selective-activation-checkpointing)argparseArgumentParserArgumentDefaultsHelpFormatteradd_mutually_exclusive_groupadd_argumentstrintsortedr   keysfloat
parse_args)parser
data_grouprecompute_group rD   ^/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/gpt/model/test_hyena.pyr@   6   s>  
	
r@   c                  C   s  t  } td}| j}| jrt| j| j|| j|d}n t| j	| j
}| jr&tnt}t||| j| j|| j| j|| jd	}| jrDdddd}n| jrNdddd}n| jdurYd| ji}ni }| j| j| j| j| jrhdnd	| jrnd
nd| j| j | jd	|}| jr| j|d< | jr| j|d< | jtvrtd| j t| j dUi |}t j!||j"d}	t#ddt$ t% g}
| j&rt'| j(| j)ddddd}|
*| | j+r|
*t,-  | j.dkr|
*t,j/| j.d | j0r|
*t,1  | j2rt3t4||d}|
*| | jr| j5rt6}nt7}|
*t8|j9|| j:dd
| j;d | j<dkr*|
*t,j=| j<| j<d | j>rI| j?du r8| j@}n| j?}|
*t,jA| jB|| jCdd g }i }| jDs"| jEr"tFdGg d| j d| jH d| jI d | jJ d!| d"| j d#| j d$| j d%| j d&|jK d'|j d(|jL d)|j d*| jM  d+| j d,| j d-| jN d.| j d/| j d0| jO d1| jP d2| jQ d3| jR d4| jS d5| jTo| j5 d6| jU d7| jV d8| jW d9| j5 | jX| jY| jZ| jE| j)d:}|*| ||d;< t[d<d=}||d>< |*| t\dUd?| j)i|}t]d| jU| jV| jS| j;| jM d@}t^j_|| jH| jI| jJt`ja| jdd| jb| jcdAdB}t^jd| je| jW| j@dC|||
| jf| jgdd
t^jhdDt`ja| jS| j5rdEnd| j5rdFndG| j5rdHndI| j5o| jTp| jdJ| j(| j&dK}|ji|ddL t^jjddd
| j)| jkrtl| jkd| jmdMnddN}|i||	 tndO| jOdPdQ| jR| jNdddR}to|j@| jQ| jPdS}tp|||jqdT}|r|	 |s|	| dS )Vz#Main function to run Evo2 training.z
byte-level)
seq_lengthmicro_batch_sizeglobal_batch_sizenum_workers	tokenizer)	pathsdataset_clsrF   rG   rH   seedrI   rJ   eod_mask_lossN)recompute_granularityrecompute_methodrecompute_num_layers	selectiverQ   weightednormalized_weightedFT)	tp_comm_overlaprF   hidden_dropoutattention_dropoutto_upperdistribute_saved_activationscross_entropy_loss_fusionfp32_residual_connectionadd_bias_outputhybrid_override_pattern
num_layerszInvalid model size: )rJ      )	max_depth   )every_n_train_stepsdirpath
save_top_kalways_save_contextsave_optim_on_train_endsave_context_on_train_endr   )intervalhyena   )rU   tp_comm_overlap_cfgtp_comm_bootstrap_backendwgrad_deferral_limit(overlap_param_gather_with_optimizer_stepalign_param_gather)gc_interval_traingc_interval_val)
start_stepend_stepranks	gen_shape z
evo2-size-z-TPz-PPz-CPz-GBSz-MBSz-SkipLossRenormz-NOACz-SELACz-ACRNLz-PATz-F32Rz-FCEz-AICz-PEODz-BOz-GCLPz-HDOz-ADOz-LRz-MINLRz-WUSTEPSz-WDz-GRFP32z-FP8WGz-OGRz-OPGz-NODESz-FP8)namegroupjob_typeidprojectsave_dirwandbdummy)r|   tensorboardlog_dir)check_for_nan_in_gradoverlap_grad_reduceoverlap_param_gathergrad_reduce_in_fp32ro   average_in_collectivelog_all)ddptensor_model_parallel_sizepipeline_model_parallel_sizecontext_parallel_sizepipeline_dtypesequence_parallelckpt_load_optimizerckpt_save_optimizerckpt_async_savesave_ckpt_formatckpt_load_strictnessgpuz
bf16-mixedhybrid   r%   maxmost_recent)	precisionparams_dtyper   fp8fp8_amax_history_lenfp8_amax_compute_algo	fp8_wgrad)devices	num_nodes	max_stepsacceleratorstrategyloggerr   log_every_n_stepslimit_val_batchesnum_sanity_val_stepsuse_distributed_samplerpluginsval_check_intervalenable_checkpointing)resume_if_exists)pathload_model_stateload_optim_state)r   resume_ignore_no_checkpointresume_past_endresume_from_directoryrestore_configadamg?gffffff?)	optimizerlr
adam_beta1
adam_beta2weight_decay	clip_graduse_distributed_optimizerbf16)r   warmup_stepsmin_lr)no_weight_decay_condrD   )tr@   r   rH   	mock_datar   rF   rG   workersr   dataset_configdataset_direod_pad_in_loss_maskr   r   r   rM   no_activation_checkpointing"selective_activation_checkpointing*activation_checkpoint_recompute_num_layersuse_megatron_comm_overlap_8krV   rW   no_renormalize_lossr   rZ   no_fp32_residual_connectionr\   r]   r^   
model_sizer   
ValueErrorr   
HyenaModelrJ   r   r   r   r4   r   r   experiment_dirappendenable_preemptionnl_callbacksPreemptionCallbackdebug_ddp_parity_freqDdpParityCheckerlog_parameters_and_shapesParameterDebuggertflops_callbackr   r   r   r   r   r   rU   tp_comm_overlap_backendro   gc_intervalGarbageCollectionCallbacknsys_profilingnsys_end_stepr   NsysCallbacknsys_start_step
nsys_ranksno_wandbwandb_projectr   jointensor_parallel_sizer   r   rQ   r[   no_average_in_collectiver   r   r   r   wdr   r   r   r   r   wandb_groupwandb_job_typewandb_run_idr   r   r   nlMegatronStrategytorchbfloat16r   ckpt_formatTrainerr   r   r   MegatronMixedPrecisionsetup
AutoResumeckpt_dirr   restore_optimizer_from_ckptr	   r   r   hyena_no_weight_decay_cond_fnconnectfit)argsrJ   rH   datablended_dataset_configrL   activation_checkpointing_argsconfig_modifiers_initevo2_configmodelr   checkpoint_callbackflop_meas_callbackrk   r   loggersnemo_logger_kwargswandb_logger	tb_loggernemo_loggerr   r   trainerresume
opt_configschedoptrD   rD   rE   mainj  s  







"	


	

r  __main__)=r6   dataclassesr   typingr   r   lightning.pytorch.callbacksr   r   lightning.pytorch.loggersr   r   megatron.core.distributedr   megatron.core.optimizerr	   nemor
   r   nemo.collectionsr   nemo.collections.llm.gpt.datar   r   3nemo.collections.llm.gpt.data.megatron.hyena.configr   9nemo.collections.llm.gpt.data.megatron.hyena.evo2_datasetr   r   $nemo.collections.llm.gpt.model.hyenar   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightningr   nemo.lightning.pytorchr   r    nemo.lightning.pytorch.callbacksr   /nemo.lightning.pytorch.callbacks.flops_callbackr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.lightning.pytorch.optimr   %nemo.lightning.pytorch.optim.megatronr   'nemo.lightning.pytorch.strategies.utilsr   nemo.utils.exp_managerr   _dynamoconfigsuppress_errorsr@   r  __name__rD   rD   rD   rE   <module>   sH   
  6  ,
