o
    }oi                     @   s  d dl mZmZ d dlZd dlm  m  mZ	 d dl
Zd dlmZ d dlmZmZ d dlmZ d dlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZm Z  ddl!m"Z" de#de$de$de$de$de$de$de$de$de%de%de$de$fddZ&e'dkrue ( Z)ee) ee)j*+ ddde)Z,e,dd \Z-Z.Z/Z0Z1Z2Z3Z4Z5Z6Z7Z8Z9e&e)e-e.e/e0e1e2e3e4e6e7e8e9Z:e- d e0 d!e1 d"e2 d#e3 d$e. d%e/ dZ;eee<d   d$e)j= d$e; Z>ee)j*+ e)j?e)j@e)jAe-e)jBe)jCe)jDe)jEd&d&d'e)jFe)jGe)jHe)jIrd(ndd)ZJed*e1d+krd,nde)j*+ d-v d.gZKe)jLr&eKMed/d0d1 e)jNr;e)jOdus2J eKMee)jOd2 ePe>*ZQeQjRe:eJe>eKd3 e)jSsVeQjd*d*d4 neQS  W d   dS W d   dS 1 snw   Y  dS dS )5    )basenamesplitextN)pretrain_recipe)0userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096/userbuffers_fp8_b200_h18432_tp8_mbs1_seqlen4096)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsloggingset_exp_logging_configsset_primary_perf_configs)get_comm_overlap_callback_idxargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeenable_cuda_graphsuse_mcore_fsdprecompute_layersactivation_offload_layersc                 C   s  t dd}t|d|| j||| j||||||	|
||| j| j| j| j| jd}t	|ddd| j
| j| j| j}| j }| jrAtd tjtdd	d
d|j_|jj|j_t|jj}|dus`J d|dv r| j dkrmtnt}ttjt !|}||jj| _"|S )z
    nemotron4 340b pre-train recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    T)performance_mode	pre_train)	r   r   r   r    compute_dtype
fp8_recipeuse_user_buffer_registration	use_sharpnccl_communicator_config_pathllm	nemotron4zLHuggingFace tokenizer not supported for Nemotron4 340B. Using NullTokenizer.nullNullTokenizeri  )library
model_name
vocab_sizeNz>MegatronCommOverlapCallback missing. Required for performance.b200gb200fp8)#r   r   gpus_per_node	max_stepsr#   r$   r%   r&   r'   r   tensorboardwandbwandb_prj_namewandb_job_namegpuloweruse_hf_tokenizerr   warningrunConfigr   data	tokenizermodelr   trainer	callbacksr   r   fdlcastfdl_dcconvert_dataclasses_to_configstp_comm_overlap_cfg)r   r   r   r   r   r   r   r   r   r   r   r   r    recipegpu_typecomm_overlap_callback_idxrH    rL   c/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/pretrain_nemotron4_340b.pyoverride_recipe_configs#   sX   



rN   __main__r"   r)   340b   nodes_tp_pp_cp_vp_mbs_1)NVTE_NORM_FWD_USE_CUDNNNVTE_NORM_BWD_USE_CUDNNsharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkT   i    r/   )enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newer      )
start_stepend_step)dir)executornameplugins)
sequentialdetach)Tos.pathr   r   fiddlerD   $fiddle._src.experimental.dataclasses_srcexperimentaldataclassesrF   nemo_runr=   +nemo.collections.llm.recipes.nemotron4_340br   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightning.run.pluginsr   r	   r
   argument_parserr   	executorsr   helpersr   r   r   r   r   utilsr   strintboolrN   __name__
parse_argsr   r9   r:   kwargsr   r   r   r   r   r   r   r   rV   r   r   r   r    rI   
exp_config__file__r#   exp_nameaccount	partitionlog_dirr3   
time_limitcontainer_imager\   r^   r_   r`   r&   rk   rm   enable_nsysappendenable_memory_profilememory_profile_out_path
ExperimentexpadddryrunrL   rL   rL   rM   <module>   s   	


I

."
	$