o
    }oi                  "   @   s  d dl mZmZ d dlZd dlm  m  mZ	 d dl
Zd dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ ddlmZ dd	lmZ dd
lmZmZm Z m!Z! ddl"m#Z#m$Z$ de%de&de&de&de&de&de&de&de&de'de'de&de&de(de'de'de'f"ddZ)e*dkre + Z,ee, ee,j-. d d!d"e,Z/e/dd# \Z0Z1Z2Z3Z4Z5Z6Z7Z8Z9Z:Z;Z<Z=Z>Z?Z@e)e,e0e1e2e3e4e5e6e7e9e:e;e<e=e>e?e@ZAe0 d$e3 d%e4 d&e5 d'e6 d(e1 d)e2 dZBeeeCd   d(e,jD d(eB ZEee,j-. e,jFe,jGe,jHe0e,jIe,jJe,jKe,jLi e,jMe,jNe,jOe@rd*ndd+ZPed,e4d-krd.nde,j-. d/v e?d0gZQe,jRr7eQSed1d2d3 e,jTrLe,jUdusCJ eQSee,jUd4 eVeE*ZWeWjXeAePeEeQd5 e,jYsgeWjd,d,d6 neWY  W d   dS W d   dS 1 sw   Y  dS dS )7    )basenamesplitextN)pretrain_recipe)/userbuffers_bf16_b200_h8192_tp2_mbs1_seqlen8192/userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192.userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192.userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsset_exp_logging_configsset_primary_perf_configs)get_comm_overlap_callback_idxhf_tokenizerargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeenable_cuda_graphsuse_mcore_fsdprecompute_layersactivation_offload_layersrecompute_moduleskeep_fsdp_fp8_transpose_cacheuse_user_buffer_registration	use_sharpc                 C   s  t dd}t|d|| j||| j|||||f|	|
|| j|||| j| j| j|d}t|ddd| j	| j
| j| j}| j }| jrFtd|j_ntjtdd	d
d|j_|jj|j_ttdttdttdd}t|jj}|dustJ d|| | j }ttjt !|}||jj| _"|S )z
    llama3 70b pre-train recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    T)performance_mode	pre_train)r   r    r%   use_fsdp_double_bufferr&   r!   r"   compute_dtype
fp8_recipenccl_communicator_config_pathr$   llmllama3zmeta-llama/Meta-Llama-3-70BnullNullTokenizeri  )library
model_name
vocab_size)bf16fp8)h100b200gb200Nz>MegatronCommOverlapCallback missing. Required for performance.)#r   r   gpus_per_node	max_stepsr)   r*   r+   r,   r   tensorboardwandbwandb_prj_namewandb_job_namegpuloweruse_hf_tokenizerr   data	tokenizerrunConfigr	   modelr   r   r   r   r   trainer	callbacksfdlcastfdl_dcconvert_dataclasses_to_configstp_comm_overlap_cfg)r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   recipegpu_typeub_cfgcomm_overlap_callback_idxrM    rR   _/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/pretrain_llama3_70b.pyoverride_recipe_configs%   sj   


rT   __main__r(   r.   70b   nodes_tp_pp_cp_vp_mbs_sharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkT   i    )r7   r8   )enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_neweruser_buffer_registration      )
start_stepend_step)dir)executornameplugins)
sequentialdetach)Zos.pathr   r   fiddlerI   $fiddle._src.experimental.dataclasses_srcexperimentaldataclassesrK   nemo_runrD   'nemo.collections.llm.recipes.llama3_70br   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   r   r   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr	   nemo.lightning.run.pluginsr
   r   r   argument_parserr   	executorsr   helpersr   r   r   r   utilsr   r   strintboollistrT   __name__
parse_argsr   r?   r@   kwargsr   r   r   r   r   r   r   r   r\   r   r    r!   r"   r#   r$   r%   r&   rN   
exp_config__file__r*   exp_nameaccount	partitionlog_dirr9   
time_limitcontainer_imager_   ra   rb   rc   ro   rq   enable_nsysappendenable_memory_profilememory_profile_out_path
ExperimentexpadddryrunrR   rR   rR   rS   <module>   s
  	


Z

."
	$