o
    }oi                     @   s  d dl mZmZ d dlZd dlm  m  mZ	 d dl
Zd dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ ddlmZ dd	lmZ dd
lmZmZm Z m!Z! ddl"m#Z#m$Z$ de%de&de&de&de&de&de&de&de&de'de'de&de&fddZ(e)dkrve * Z+ee+ ee+j,- ddde+Z.e.dd \Z/Z0Z1Z2Z3Z4Z5Z6Z7Z8Z9Z:Z;e(e+e/e0e1e2e3e4e5e6e8e9e:e;Z<e/ d e2 d!e3 d"e4 d#e5 d$e0 d%e1 dZ=eee>d   d$e+j? d$e= Z@ee+j,- e+jAe+jBe+jCe/e+jDe+jEe+jFe+jGi e+jHe+jIe+jJe+jKrd&ndd'ZLed(e3d)krd*nde+j,- d+v d,gZMe+jNr'eMOed-d.d/ e+jPr<e+jQdus3J eMOee+jQd0 eRe@*ZSeSjTe<eLe@eMd1 e+jUsWeSjd(d(d2 neSU  W d   dS W d   dS 1 sow   Y  dS dS )3    )basenamesplitextN)pretrain_recipe)0userbuffers_bf16_b200_h12288_tp4_mbs1_seqlen20480userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048/userbuffers_fp8_b200_h12288_tp4_mbs1_seqlen2048/userbuffers_fp8_h100_h12288_tp4_mbs1_seqlen2048)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsset_exp_logging_configsset_primary_perf_configs)get_comm_overlap_callback_idxhf_tokenizerargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeenable_cuda_graphsuse_mcore_fsdprecompute_layersactivation_offload_layersc                 C   s   t dd}t|d|| j||| j||||||	|
| j| j||| j| j| jd}t	|ddd| j
| j| j| j}| j }| jrCtd|j_ntjtdd	d
d|j_|jj|j_ttdttdttdd}t|jj}|dusqJ d|| | j }t tjt!"|}||jj| _#d|jj$_%|S )z
    gpt3 175b pre-train recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    T)performance_mode	pre_train)	r   r    use_user_buffer_registration	use_sharpr!   r"   compute_dtype
fp8_recipenccl_communicator_config_pathllmgpt3znvidia/megatron-gpt2-345mnullNullTokenizeri   )library
model_name
vocab_size)bf16fp8)h100b200gb200Nz>MegatronCommOverlapCallback missing. Required for performance.)&r   r   gpus_per_node	max_stepsr%   r&   r'   r(   r)   r   tensorboardwandbwandb_prj_namewandb_job_namegpuloweruse_hf_tokenizerr   data	tokenizerrunConfigr	   modelr   r   r   r   r   trainer	callbacksfdlcastfdl_dcconvert_dataclasses_to_configstp_comm_overlap_cfgconfigtp_only_amax_red)r   r   r   r   r   r   r   r   r   r   r    r!   r"   recipegpu_typeub_cfgcomm_overlap_callback_idxrJ    rQ   ^/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/pretrain_gpt3_175b.pyoverride_recipe_configs%   sf   



rS   __main__r$   r+   175b   nodes_tp_pp_cp_vp_mbs_sharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkT   i    )r4   r5   )enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newer      )
start_stepend_step)dir)executornameplugins)
sequentialdetach)Vos.pathr   r   fiddlerF   $fiddle._src.experimental.dataclasses_srcexperimentaldataclassesrH   nemo_runrA   &nemo.collections.llm.recipes.gpt3_175br   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   r   r   r   3nemo.collections.nlp.modules.common.tokenizer_utilsr	   nemo.lightning.run.pluginsr
   r   r   argument_parserr   	executorsr   helpersr   r   r   r   utilsr   r   strintboolrS   __name__
parse_argsr   r<   r=   kwargsr   r   r   r   r   r   r   r   r[   r   r    r!   r"   rM   
exp_config__file__r'   exp_nameaccount	partitionlog_dirr6   
time_limitcontainer_imager^   r`   ra   rb   r&   rm   ro   enable_nsysappendenable_memory_profilememory_profile_out_path
ExperimentexpadddryrunrQ   rQ   rQ   rR   <module>   s   	


V

."
	$