o
    }oi                  "   @   s  d dl mZmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ dededededededededededededededededef"ddZedkrfe  Zee eej ! dd d!eZ"e"dd" \Z#Z$Z%Z&Z'Z(Z)Z*Z+Z,Z-Z.Z/Z0Z1Z2Z3eee#e$e%e&e'e(e)e*e,e-e.e/e0e1e2e3Z4e# d#e& d$e' d%e( d&e) d'e$ d(e% dZ5eee6d   d'ej7 d'e5 Z8eej ! ej9ej:ej;e#ej<ej=ej>ej?i ej@ejAejBe3rd)ndd*ZCed+e'd,krd-ndej ! d.v e2d/gZDejEreDFed0d1d2 ejGr,ejHdus#J eDFe
ejHd3 eIe8*ZJeJjKe4eCe8eDd4 ejLsGeJjd+d+d5 neJL  W d   dS W d   dS 1 s_w   Y  dS dS )6    )basenamesplitextN)pretrain_recipe)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsset_exp_logging_configsset_primary_perf_configs)hf_tokenizerargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeenable_cuda_graphsuse_mcore_fsdprecompute_layersactivation_offload_layersrecompute_moduleskeep_fsdp_fp8_transpose_cacheuse_user_buffer_registration	use_sharpc                 C   s   t dd}t|d|| j||| j||||||	| j| j| j|
| j|||d}t|ddd| j	| j
| j| j}| jr>td|j_|S tjtdd	d
d|j_|jj|j_|S )z
    llama3 8b pre-train recipe aimed at achieving best possible performance and faster
    overall runtime.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    T)performance_mode	pre_train)	r   compute_dtype
fp8_recipenccl_communicator_config_pathr   use_fsdp_double_bufferr    r!   r   llmllama3zmeta-llama/Meta-Llama-3-8BnullNullTokenizeri  )library
model_name
vocab_size)r   r   gpus_per_node	max_stepsr$   r%   r&   r'   r   tensorboardwandbwandb_prj_namewandb_job_nameuse_hf_tokenizerr   data	tokenizerrunConfigr   model)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   recipe r<   ^/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/pretrain_llama3_8b.pyoverride_recipe_configs   sF   

r>   __main__r#   r)   8b   nodes_tp_pp_cp_vp_mbs_sharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkT   i    )b200gb200)enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_neweruser_buffer_registration      )
start_stepend_step)dir)executornameplugins)
sequentialdetach)Mos.pathr   r   nemo_runr8   &nemo.collections.llm.recipes.llama3_8br   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightning.run.pluginsr   r   r   argument_parserr
   	executorsr   helpersr   r   r   r   utilsr   strintboollistr>   __name__
parse_argsr   gpulowerkwargsr   r   r   r   r   r   r   r   rF   r   r   r   r   r   r   r    r!   r;   
exp_config__file__r$   exp_nameaccount	partitionlog_dirr/   
time_limitcontainer_imagerI   rK   rL   rM   r[   r]   enable_nsysappendenable_memory_profilememory_profile_out_path
Experimentexpadddryrunr<   r<   r<   r=   <module>   s  	


A

."

	$