from os.path import basename, splitext

import nemo_run as run

from nemo.collections.llm.recipes.llama4_e128 import pretrain_recipe
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import hf_tokenizer


def override_recipe_configs(
    args: str,
    num_nodes: int,
    mbs: int,
    gbs: int,
    tp_size: int,
    pp_size: int,
    cp_size: int,
    vp_size: int,
    ep_size: int,
    etp_size: int,
    enable_cuda_graphs: bool,
):
    """
    llama4 e128 pre-train recipe aimed at achieving best possible performance and faster
    overall runtime.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    """
    recipe = pretrain_recipe(performance_mode=True)
    recipe = set_primary_perf_configs(
        recipe,
        "pre_train",
        num_nodes,
        args.gpus_per_node,
        mbs,
        gbs,
        args.max_steps,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs=enable_cuda_graphs,
        use_user_buffer_registration=args.use_user_buffer_registration,
        use_sharp=args.use_sharp,
        compute_dtype=args.compute_dtype,
        fp8_recipe=args.fp8_recipe,
    )
    recipe = set_exp_logging_configs(
        recipe, "pre_train", "llm", "llama4", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
    )

    # data module configs
    if args.use_hf_tokenizer:
        recipe.data.tokenizer = hf_tokenizer("meta-llama/Llama-4-Maverick-17B-128E-Instruct")
    else:
        recipe.data.tokenizer = run.Config(
            get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=202048
        )
    recipe.model.tokenizer = recipe.data.tokenizer

    # kernel fusions for best training throughput
    recipe.model.config.cross_entropy_fusion_impl = "te"
    recipe.model.config.cross_entropy_loss_fusion = True
    recipe.model.config.apply_rope_fusion = True
    recipe.model.config.moe_permute_fusion = True

    return recipe


if __name__ == "__main__":
    args = parse_cli_args().parse_args()
    args_sanity_check(args)

    kwargs = get_user_configs(args.gpu.lower(), "pre_train", "llama4", "e128", args)
    (
        num_nodes,
        mbs,
        gbs,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs,
        _,
        _,
        _,
    ) = kwargs[0:13]

    recipe = override_recipe_configs(
        args, num_nodes, mbs, gbs, tp_size, pp_size, cp_size, vp_size, ep_size, etp_size, enable_cuda_graphs
    )

    exp_config = (
        f"{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}"
        f"_ep{ep_size}_etp{etp_size}_{mbs}mbs_{gbs}gbs"
    )
    exp_name = f"{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}"

    executor = slurm_executor(
        args.gpu.lower(),
        args.account,
        args.partition,
        args.log_dir,
        num_nodes,
        args.gpus_per_node,
        args.time_limit,
        args.container_image,
        custom_mounts=args.custom_mounts,
        custom_env_vars={},
        hf_token=args.hf_token,
        nemo_home=args.nemo_home,
        wandb_key=args.wandb_key,
        network="sharp" if args.use_sharp else None,
    )

    plugins = [
        PerfEnvPlugin(
            enable_vboost=True,
            nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
            gpu_sm100_or_newer=args.gpu.lower() in ["b200", "gb200"],
        )
    ]
    if args.enable_nsys:
        plugins.append(NsysPlugin(start_step=5, end_step=6, gen_shape=True))
    if args.enable_memory_profile:
        assert args.memory_profile_out_path is not None
        plugins.append(MemoryProfilePlugin(dir=args.memory_profile_out_path))

    with run.Experiment(exp_name) as exp:
        exp.add(
            recipe,
            executor=executor,
            name=exp_name,
            plugins=plugins,
        )

        if not args.dryrun:
            exp.run(sequential=True, detach=True)
        else:
            exp.dryrun()