o
    }oi	                     @   s  d dl mZmZ d dlZd dlm  m  mZ	 d dl
Zd dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ ddlmZ dd	lmZ dd
lmZmZm Z m!Z! ddl"m#Z#m$Z$ de%de&de&de&de&de&de&de&de&de'de'de&de&fddZ(e)dkre * Z+ee+ ee+j,- ddde+Z.e.dd \Z/Z0Z1Z2Z3Z4Z5Z6Z7Z8Z9Z:Z;e(e+e/e0e1e2e3e4e5e6e8e9e:e;Z<e/ d e2 d!e3 d"e4 d#e5 d$e0 d%e1 dZ=eee>d   d$e+j? d$e= Z@e9rd&d&d'ZAni ZAee+j,- e+jBe+jCe+jDe/e+jEe+jFe+jGe+jHeAe+jIe+jJe+jKe+jLrd(ndd)ZMed*e3d+krd,nde+j,- d-v d.gZNe+jOr1eNPed/d0d1 e+jQrFe+jRdus=J eNPee+jRd2 eSe@*ZTeTjUe<eMe@eNd3 e+jVsaeTjd*d*d4 neTV  W d   dS W d   dS 1 syw   Y  dS dS )5    )basenamesplitextN)pretrain_recipe)4userbuffers_bf16_b200_h16384_tp4_cp2_mbs1_seqlen81924userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen81923userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen81923userbuffers_fp8_h100_h16384_tp8_cp2_mbs1_seqlen8192)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsset_exp_logging_configsset_primary_perf_configs)get_comm_overlap_callback_idxhf_tokenizerargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeenable_cuda_graphsuse_mcore_fsdprecompute_layersactivation_offload_layersc                 C   st  t dd}t|d|| j||| j||||||	|
| j| j||| j| j| jd}t	|ddd| j
| j| j| j}| j }| jrCtd|j_ntjtdd	d
d|j_|jj|j_dtj_dtj_dtj_dtj_dtj_dtj_dtj_dtj_ttdtt dtt dd}t!|j"j#}|dusJ d|| | j }t$%tjt&'|}||j"j#| _(|
r|dkr|d d |j"j)_*|S )z
    llama3 405b pre-train recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    T)performance_mode	pre_train)	r   r    use_user_buffer_registration	use_sharpr!   r"   compute_dtype
fp8_recipenccl_communicator_config_pathllmllama3zmeta-llama/Llama-3.1-405BnullNullTokenizeri  )library
model_name
vocab_sizeF)bf16fp8)h100b200gb200Nz>MegatronCommOverlapCallback missing. Required for performance.r5      @   )+r   r   gpus_per_node	max_stepsr%   r&   r'   r(   r)   r   tensorboardwandbwandb_prj_namewandb_job_namegpuloweruse_hf_tokenizerr   data	tokenizerrunConfigr	   modelr   	qkv_fprop	aggregate
proj_dgrad	fc1_fprop	fc2_dgradr   r   r   r   trainer	callbacksfdlcastfdl_dcconvert_dataclasses_to_configstp_comm_overlap_cfgstrategy#num_distributed_optimizer_instances)r   r   r   r   r   r   r   r   r   r   r    r!   r"   recipegpu_typeub_cfgcomm_overlap_callback_idxrQ    rX   a/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/pretrain_llama31_405b.pyoverride_recipe_configs%   sx   


rZ   __main__r$   llama31405b   nodes_tp_pp_cp_vp_mbs_1)NVTE_NORM_FWD_USE_CUDNNNVTE_NORM_BWD_USE_CUDNNsharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkT   i    )r4   r5   )enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newer      )
start_stepend_step)dir)executornameplugins)
sequentialdetach)Wos.pathr   r   fiddlerM   $fiddle._src.experimental.dataclasses_srcexperimentaldataclassesrO   nemo_runrC   )nemo.collections.llm.recipes.llama31_405br   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   r   r   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr	   nemo.lightning.run.pluginsr
   r   r   argument_parserr   	executorsr   helpersr   r   r   r   utilsr   r   strintboolrZ   __name__
parse_argsr   r>   r?   kwargsr   r   r   r   r   r   r   r   rc   r   r    r!   r"   rT   
exp_config__file__r'   exp_nameenv_varsaccount	partitionlog_dirr8   
time_limitcontainer_imageri   rk   rl   rm   r&   rx   rz   enable_nsysappendenable_memory_profilememory_profile_out_path
ExperimentexpadddryrunrX   rX   rX   rY   <module>   s   	


c

."
	$