o
    }oiA                     @   s  d dl mZmZ d dlZd dlm  m  mZ	 d dl
Zd dlmZ d dlmZ d dlmZ d dlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddl m!Z! de"de#de#de#de#de#de#de#de#de$fddZ%e&dkrbe ' Z(ee( ee(j)* ddde(Z+e+dd \
Z,Z-Z.Z/Z0Z1Z2Z3Z4Z5e%e(e,e-e.e/e0e1e2e3e5
Z6e, de/ de0 de1 d e2 d!e- d"e. dZ7eee8d   d!e(j9 d!e7 Z:ee(j)* e(j;e(j<e(j=e,e(j>e(j?e(j@e(jAi e(jBe(jCe(jDe(jErd#ndd$ZFed%e0d&krd'nde(j)* d(v d)gZGe(jHreGIed*d+d, e(jJr(e(jKdusJ eGIee(jKd- eLe:*ZMeMjNe6eFe:eGd. e(jOsCeMjd%d%d/ neMO  W d   dS W d   dS 1 s[w   Y  dS dS )0    )basenamesplitextN)pretrain_recipe)/userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsloggingset_exp_logging_configsset_primary_perf_configs)get_comm_overlap_callback_idxargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeenable_cuda_graphsc
                 C   s   t dd}
t|
d|| j||| j||||||	| j| j| j| j| jd}
t	|
ddd| j
| j| j| j}
| j }| jr>td tjtdd	d
d|
j_|
jj|
j_t|
jj}|dus]J d|dv rtt}ttjt |}||
jj| _!|
S )z
    nemotron4 15b pre-train recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    T)performance_mode	pre_train)r   use_user_buffer_registration	use_sharpcompute_dtype
fp8_recipenccl_communicator_config_pathllm	nemotron4zKHuggingFace tokenizer not supported for Nemotron4 15B. Using NullTokenizer.nullNullTokenizeri  )library
model_name
vocab_sizeNz>MegatronCommOverlapCallback missing. Required for performance.b200gb200)"r   r   gpus_per_node	max_stepsr   r    r!   r"   r#   r   tensorboardwandbwandb_prj_namewandb_job_namegpuloweruse_hf_tokenizerr   warningrunConfigr   data	tokenizermodelr   trainer	callbacksr   fdlcastfdl_dcconvert_dataclasses_to_configstp_comm_overlap_cfg)r   r   r   r   r   r   r   r   r   r   recipegpu_typecomm_overlap_callback_idxrC    rG   b/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/pretrain_nemotron4_15b.pyoverride_recipe_configs    sL   



rI   __main__r   r%   15b
   nodes_tp_pp_cp_vp_mbs_sharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkT   i    r+   )enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newer      )
start_stepend_step)dir)executornameplugins)
sequentialdetach)Pos.pathr   r   fiddler?   $fiddle._src.experimental.dataclasses_srcexperimentaldataclassesrA   nemo_runr8   *nemo.collections.llm.recipes.nemotron4_15br   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightning.run.pluginsr   r   r	   argument_parserr   	executorsr   helpersr   r   r   r   r   utilsr   strintboolrI   __name__
parse_argsr   r4   r5   kwargsr   r   r   r   r   r   r   r   rQ   r   rD   
exp_config__file__r!   exp_nameaccount	partitionlog_dirr.   
time_limitcontainer_imagerT   rV   rW   rX   r    rc   re   enable_nsysappendenable_memory_profilememory_profile_out_path
ExperimentexpadddryrunrG   rG   rG   rH   <module>   s   	


?
 ."
	$