o
    }oi"                  $   @   s   d dl mZmZ d dlmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZmZ d	d
lmZ d	dlmZ d	dlmZmZmZmZ d	dlmZ dZdZ 			d;de!de"de"de"de"de"de"de"de"de"de#de#de"de"deee!  dee# d ee# f"d!d"Z$e%d#kre & Z'ee' ee'j() d$d%d&e'Z*e*dd' \Z+Z,Z-Z.Z/Z0Z1Z2Z3Z4Z5Z6Z7Z8Z9Z:Z;e$e'e+e,e-e.e/e0e1e2e3e4e5e6e7e8e:e;Z<e+ d(e. d)e/ d*e0 d+e1 d,e2 d-e, d.e- dZ=eee>d   d-e'j? d-e= Z@ee'j() e'jAe'jBe'jCe+e'jDe'jEe'jFe'jGi e'jHe'jIe'jJe;rd/ndd0ZKede/d1kr'd2nde'j() d3v d4gZLe'jMr?eLNed5d6d7 e'jOrTe'jPdusKJ eLNee'jPd8 eQe@*ZReRjSe<eKe@eLd9 e'jTsoeRjddd: neRT  W d   dS W d   dS 1 sw   Y  dS dS )<    )basenamesplitext)ListOptionalN)pretrain_recipe)get_nmt_tokenizer)"MegatronEnableExperimentalCallback)MegatronTokenDropCallback)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsset_exp_logging_configsset_primary_perf_configs)hf_tokenizerzdeepseek-ai/DeepSeek-V3-BaseTargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeetp_sizeenable_cuda_graphsuse_mcore_fsdprecompute_layersactivation_offload_layersrecompute_modulesuse_user_buffer_registration	use_sharpc                 C   s8  t dd}| jdu rd|jj_d|jj_d|jj_d|jj_t|jdr*|jj	du r.g |j_	| j
 dv rEd|jj_d|jj_d|jj_nd|jj_d|jj_d|jj_tr`|jj	tt d|jj_d|jj_|jj	tt dd	gd
gd  d
gd d
gd d
gd dg gd	gd
gd  gd
gd gd  d
gd dg g d	gd
gd  gd
gd gd  d
gd dg g d	gd
gd  gd
gd gd  d
dgg d	gd
gd  gd
gd gd  d
dgg d	gd
gd  gd
gd gd  d
dgg d}|pd}|p	d}||f|vr td| d| d|  |||f }|dur4tdd |D }||jj_d|jj_d|jj_d|jj_d|jj_t |d|| j!||| j"||||||	f|
|| j#||||| j$| j%|d
}t&|ddd| j'| j(| j)| j*}| j+rt,t-|j._/ntjt0ddd d!|j._/|j.j/|j_/|S )"zT
    DeepSeek V3 pre-train recipe aimed at achieving best possible performance.
    T)performance_modeN	callbacks)h100flexFalltoall	embeddingdecoder      loss               ))   r5   )r3   r5   )r0   r5   )r3   r   )r-   r5   )r0   r   )r3   r3   r5   zInvalid PP and VP size: z and zC to infer PP layout for DeepSeek V3. Known PP and VP combinations: c                 S   s   g | ]}t |qS  )list).0xr6   r6   `/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/pretrain_deepseek_v3.py
<listcomp>l   s    z+override_recipe_configs.<locals>.<listcomp>	pre_train)
r   r    use_fsdp_double_bufferr$   r%   r!   r"   compute_dtype
fp8_reciper#   llm
deepseekv3nullNullTokenizeri  )library
model_name
vocab_size)1r   r#   modelconfigrecompute_granularityrecompute_methodrecompute_num_layershasattrtrainerr'   gpulowermoe_token_dispatcher_typemoe_enable_deepepmoe_shared_expert_overlapUSE_TOKEN_DROPappendrunConfigr	   moe_permute_fusionapply_rope_fusionr   
ValueErrorkeysr7   strategypipeline_model_parallel_layout'account_for_embedding_in_pipeline_split"account_for_loss_in_pipeline_split"num_layers_in_first_pipeline_stage!num_layers_in_last_pipeline_stager   gpus_per_node	max_stepsr=   r>   r?   r   tensorboardwandbwandb_prj_namewandb_job_nameuse_hf_tokenizerr   HF_MODEL_URIdata	tokenizerr   )r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   recipemap_pp_vp_to_layoutlayoutr6   r6   r:   override_recipe_configs#   s   












.22***
	







rn   __main__r<   deepseekv3   nodes_tp_pp_cp_vp_ep_mbs_sharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkr5   i    )b200gb200)enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newerr2   r1   )
start_stepend_step)dir)executornameplugins)
sequentialdetach)NNN)Uos.pathr   r   typingr   r   nemo_runrU   (nemo.collections.llm.recipes.deepseek_v3r   3nemo.collections.nlp.modules.common.tokenizer_utilsr   Fnemo.lightning.pytorch.callbacks.megatron_enable_experimental_callbackr   /nemo.lightning.pytorch.callbacks.moe_token_dropr	   nemo.lightning.run.pluginsr
   r   r   argument_parserr   	executorsr   helpersr   r   r   r   utilsr   rh   rS   strintboolrn   __name__
parse_argsr   rN   rO   kwargsr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   rx   r$   r%   rk   
exp_config__file__r>   exp_nameaccount	partitionlog_dirra   
time_limitcontainer_imager{   r}   r~   r   r   r   enable_nsysrT   enable_memory_profilememory_profile_out_path
Experimentexpadddryrunr6   r6   r6   r:   <module>   s  	


 


4"
	$