o
    }oi                     @   s  d dl mZmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ dededededededededededefddZedkr`e  Zee eej  dddeZ!e!d d \Z"Z#Z$Z%Z&Z'Z(Z)Z*Z+Z,Z,Z,eee"e#e$e%e&e'e(e)e*e+Z-e" de% de& de' d e( d!e) d"e* d#e# d$e$ dZ.eee/d   d#ej0 d#e. Z1e+sd%d&iZ2ni Z2eej  ej3ej4ej5e"ej6ej7ej8ej9e2ej:ej;ej<ej=rd'ndd(Z>ed)e&d*krd+ndej  d,v d-gZ?ej@re?Aed.d/d)d0 ejBr&ejCdusJ e?Ae
ejCd1 eDe1*ZEeEjFe-e>e1e?d2 ejGsAeEjd)d)d3 neEG  W d   dS W d   dS 1 sYw   Y  dS dS )4    )basenamesplitextN)pretrain_recipe)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsset_exp_logging_configsset_primary_perf_configs)hf_tokenizerargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeetp_sizeenable_cuda_graphsc                 C   s   t dd}t|d|| j||| j||||||	|
| j| j| j| jd}t|ddd| j	| j
| j| j}| jr:td|j_ntjtdd	d
d|j_|jj|j_d|jj_d|jj_d|jj_d|jj_|S )z
    llama4 e16 pre-train recipe aimed at achieving best possible performance and faster
    overall runtime.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    T)performance_mode	pre_train)r   use_user_buffer_registration	use_sharpcompute_dtype
fp8_recipellmllama4z)meta-llama/Llama-4-Scout-17B-16E-InstructnullNullTokenizeri@ )library
model_name
vocab_sizete)r   r   gpus_per_node	max_stepsr   r   r    r!   r   tensorboardwandbwandb_prj_namewandb_job_nameuse_hf_tokenizerr   data	tokenizerrunConfigr   modelconfigcross_entropy_fusion_implcross_entropy_loss_fusionapply_rope_fusionmoe_permute_fusion)r   r   r   r   r   r   r   r   r   r   r   recipe r<   _/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/pretrain_llama4_e16.pyoverride_recipe_configs   sF   





r>   __main__r   r#   e16   nodes_tp_pp_cp_vp_ep_etp_mbs_PYTORCH_CUDA_ALLOC_CONFzexpandable_segments:Truesharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkT   i    )b200gb200)enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newer      )
start_stepend_step	gen_shape)dir)executornameplugins)
sequentialdetach)Hos.pathr   r   nemo_runr3   'nemo.collections.llm.recipes.llama4_e16r   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightning.run.pluginsr   r   r   argument_parserr
   	executorsr   helpersr   r   r   r   utilsr   strintboolr>   __name__
parse_argsr   gpulowerkwargsr   r   r   r   r   r   r   r   r   r   rH   r;   
exp_config__file__r    exp_namerM   account	partitionlog_dirr*   
time_limitcontainer_imagerL   rN   rO   rP   r   r^   r`   enable_nsysappendenable_memory_profilememory_profile_out_path
Experimentexpadddryrunr<   r<   r<   r=   <module>   s   	


=
8"

	$