from os.path import basename, splitext
from typing import Optional

import nemo_run as run

from nemo.collections.llm.recipes.mixtral_8x22b_64k import pretrain_recipe
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import hf_tokenizer


def override_recipe_configs(
    args,
    num_nodes: int,
    mbs: int,
    gbs: int,
    tp_size: int,
    pp_size: int,
    cp_size: int,
    vp_size: Optional[int],
    ep_size: int,
    etp_size: Optional[int],
    enable_cuda_graphs: bool,
    use_mcore_fsdp: bool,
    recompute_layers: int,
    activation_offload_layers: int,
):
    """
    mixtral 8x22b pre-train recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    """
    recipe = pretrain_recipe(performance_mode=True)

    # Apply parallelism, batch-size, precision and recompute/offload overrides on the base recipe.
    recipe = set_primary_perf_configs(
        recipe,
        "pre_train",
        num_nodes,
        args.gpus_per_node,
        mbs,
        gbs,
        args.max_steps,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs=enable_cuda_graphs,
        use_mcore_fsdp=use_mcore_fsdp,
        use_user_buffer_registration=args.use_user_buffer_registration,
        use_sharp=args.use_sharp,
        recompute_layers=recompute_layers,
        activation_offload_layers=activation_offload_layers,
        compute_dtype=args.compute_dtype,
        fp8_recipe=args.fp8_recipe,
        nccl_communicator_config_path=args.nccl_communicator_config_path,
    )
    recipe = set_exp_logging_configs(
        recipe, "pre_train", "llm", "mixtral", args.tensorboard, args.wandb, args.wandb_prj_name, args.wandb_job_name
    )

    # Tokenizer: either the real Hugging Face tokenizer or a NullTokenizer sized to the Mixtral vocab.
    if args.use_hf_tokenizer:
        recipe.data.tokenizer = hf_tokenizer("mistralai/Mixtral-8x22B-v0.1")
    else:
        recipe.data.tokenizer = run.Config(
            get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=32000
        )
        recipe.model.tokenizer = recipe.data.tokenizer

    return recipe


if __name__ == "__main__":
    args = parse_cli_args().parse_args()
    args_sanity_check(args)

    kwargs = get_user_configs(args.gpu.lower(), "pre_train", "mixtral", "8x22b", args)
    num_nodes, mbs, gbs, tp_size, pp_size, cp_size, vp_size, ep_size, etp_size = kwargs[0:9]
    enable_cuda_graphs, use_mcore_fsdp, recompute_layers, activation_offload_layers = kwargs[9:13]

    recipe = override_recipe_configs(
        args,
        num_nodes,
        mbs,
        gbs,
        tp_size,
        pp_size,
        cp_size,
        vp_size,
        ep_size,
        etp_size,
        enable_cuda_graphs,
        use_mcore_fsdp,
        recompute_layers,
        activation_offload_layers,
    )

    exp_config = (
        f"{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}"
        f"_ep{ep_size}_etp{etp_size}_mbs{mbs}_gbs{gbs}"
    )
    exp_name = f"{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}"

    executor = slurm_executor(
        args.gpu.lower(),
        args.account,
        args.partition,
        args.log_dir,
        num_nodes,
        args.gpus_per_node,
        args.time_limit,
        args.container_image,
        custom_mounts=args.custom_mounts,
        custom_env_vars={},
        hf_token=args.hf_token,
        nemo_home=args.nemo_home,
        wandb_key=args.wandb_key,
        network="sharp" if args.use_sharp else None,
    )

    plugins = [
        PerfEnvPlugin(
            enable_vboost=True,
            nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
            gpu_sm100_or_newer=(args.gpu.lower() in ["b200", "gb200"]),
        )
    ]
    if args.enable_nsys:
        # Nsys capture window given as start/end training steps.
        plugins.append(NsysPlugin(start_step=5, end_step=6))
    if args.enable_memory_profile:
        assert args.memory_profile_out_path is not None
        plugins.append(MemoryProfilePlugin(dir=args.memory_profile_out_path))

    with run.Experiment(exp_name) as exp:
        exp.add(recipe, executor=executor, name=exp_name, plugins=plugins)

        if not args.dryrun:
            exp.run(sequential=True, detach=True)
        else:
            exp.dryrun()