o
    }oiy                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZmZmZ dd Zd	d
 ZedkrIe  dS dS )z.
Test the LLaMA3 recipe with a smaller model.
    N)llm)ParameterDebugger)PytorchProfilerCallback)4AssertOptimizerParamGroupsHaveAtLeastTwoWeightDecaysMCoreModelAttributeValidatorMiscAttributeValidatorStopBeforeEndcreate_verify_precisionsmall_llama_cfg
train_dataverify_ckpt_dirc                  C   s<  t jddd} | jdtddd | jdtddd | jd	td d
d | jdtddd | jdtd dd | jdtd dd | jdtdd | jdtddd | jdtd dd | jdtd dd | jdtd dd | jdtd dd | jd td!d"gd d#d$ | jd%tg d&d'd(d$ | jd)d*d+d, | jd-d*d.d, |  S )/N )progdescriptionz	--devicesTz%Number of devices to use for training)typerequiredhelpz--max-stepszNumber of steps to train forz--early-stopzEStop training early at this global step (for testing resume training))r   defaultr   z--experiment-dirz-directory to write results and checkpoints toz--data-pathz4Path to data file. If not specified, uses mock data.z--tokenizer-pathzOPath to a sentencepiece tokenizer model file. If not specified, uses mock data.z--index-mapping-dirz$directory to write index mappings to)r   r   z--seq-lengthi    zSequence length. default is 8kz--tpzOverride tensor parallelismz--ppzOverride pipeline parallelismz--vpz%Override virtual pipeline parallelismz--cpzOverride context parallelismz--spr      zOverride sequence parallel)r   choicesr   r   z--precisionbf16fp16fp32r   zOverride recipe precisionz--fp8
store_truez
Enable FP8)actionr   z
--profilerzDAttach PytorchProfilerCallback and verify trace files after training)argparseArgumentParseradd_argumentintstr
parse_args)parser r#   \/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/llama3_pretraining.pyget_args(   sL   r%   c                  C   s  t  } d}tjj| j|| jd}ttjt	| j
|_| jr.| jr.t| j| j| j| j
d|_| j|j_d|j_d |jj_d |jj_d|j_d|j_| jrW|jjt| jd |jjt  | jdkrg| j rdd l!m"  m  m#} | j| j f}|j$|j%|j&d	| }| |j_'t(j)t(j*t(j+d
}t,t-|| j t-t(j+ddgd}|jj| | j.| j/| j0| j1| j2d urt3| j2nd d}|4 D ]\}	}
|
d urt5|jj6|	|
 t7|jj6|	||	< q|jjt8| t9| j| jp| jd}|jj| | j:r)t;j<=| j|}t;j<=|d}t;j>|dd t?d| jd| j|ddid}|jj| tj|dd t@|jj| jp:| j|jjt;j<=| j| | j:rt;j<=| j|}t;j<=|d}t;j<=|d}t;j<=|d}t;j<A|suJ d| t;j<A|sJ d| dd t;B|D }dd t;B|D }tC|| jksJ d| j d| dtC| tC|| jksJ d| j d| dtC| d S d S ) NL2_llama3_small_pretrain_test)dirnamenum_gpus_per_node)	data_pathtokenizer_pathindex_mapping_dir
seq_lengthr      )stop_on_stepr   r   ))r   F)r   T)r   Tr   on_train_starton_train_end)param_fngrad_fnlog_on_hooks)tensor_model_parallel_sizepipeline_model_parallel_size$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallel)	max_stepsr/   tracesT)exist_ok
with_stack)
start_stepend_stepwarmup_stepsactive_steps	trace_dirprofiler_kwargs)directdevicehostz!Missing device traces directory: zMissing host traces directory: c                 S      g | ]	}| d r|qS z.jsonendswith.0fr#   r#   r$   
<listcomp>       zmain.<locals>.<listcomp>c                 S   rG   rH   rI   rK   r#   r#   r$   rN      rO   z	Expected z JSON files in z, found )Dr%   r   	llama3_8bpretrain_recipeexperiment_dirdevicesrunConfig
LlamaModelr
   r-   modelr*   r+   r   r,   datar:   trainerlog_every_n_stepslogckptevery_n_train_stepstrain_time_intervalval_check_intervallimit_val_batches
early_stop	callbacksappendr   r   	precisionfp8%llm.recipes.precision.mixed_precisionrecipesmixed_precision
fp16_mixedbf16_with_fp8_mixedfp16_with_fp8_mixedpluginstorchbfloat16float16float32r   r	   tpppvpcpspboolitemssetattrstrategygetattrr   r   profilerospathjoinmakedirsr   r   isdirlistdirlen)argsexp_namerQ   
mp_recipeskeyprecision_recipe	dtype_mapdebugger_callbackparallelismskvmisc_checkerexp_pathrB   profiler_cb
trace_root
device_dirhost_dirdevice_jsons
host_jsonsr#   r#   r$   mainR   s   




r   __main__)__doc__r   r|   nemo_runrT   rm   nemo.collectionsr   *nemo.lightning.pytorch.callbacks.debuggingr   1nemo.lightning.pytorch.callbacks.pytorch_profilerr   tests.collections.llm.commonr   r   r   r   r	   r
   r   r   r%   r   __name__r#   r#   r#   r$   <module>   s   (*n
