o
    }oit                     @   sD  d dl mZmZ d dlZd dlm  m  mZ	 d dl
Zd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ dd	lmZ dd
lmZ ddlmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z& dZ'dZ(de)de*de*de*de*de*de*de*de*de+de+de*de*fddZ,e-dkre . Z/ee/ ee/j01 e/j2dd e/Z3e3dd! \Z4Z5Z6Z7Z8Z9Z:Z;Z<Z=Z>Z?Z@e,e/e4e5e6e7e8e9e:e;e=e>e?e@ZAe4 d"e7 d#e8 d$e9 d%e: d&e5 d'e6 dZBe/j2 d&eeeCd   d&e/jD d&eB ZEee/j01 e/jFe/jGe/jHe4e/jIe/jJe/jKe/jLi e/jMe/jNe/jOe/jPrd(ndd)ZQed*e8d+krd,nde/j01 d-v d.gZRe/jSr6eRTed/d0d1 e/jUrKe/jVdusBJ eRTee/jVd2 eWeEEZXe(sle/jMdus^J d3eXjYe%eQe d4e' d5  eXjYeAeQeEeRd6 e/jZseXjd*d*d7 neXZ  W d   dS W d   dS 1 sw   Y  dS dS )8    )basenamesplitextN)SquadDataModule)finetune_recipemodel)4userbuffers_fp8_h100_h16384_tp4_mbs1_seqlen2048_lora)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsset_exp_logging_configsset_primary_perf_configs)get_comm_overlap_callback_idxhf_tokenizerimport_ckpt_experimentisfile_train_pack_metadatazmeta-llama/Llama-3.1-405BFargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeenable_cuda_graphsuse_mcore_fsdprecompute_layersactivation_offload_layersc                 C   s  | j dkrdn| j }t|dd}t|||| j||| j||||||	|
||| j| j| j| j| j	d}t
||dd| j| j| j| j}| jrItt|j_ntjtdd	d
d|j_|jj|j_|jjtkrjtt|jsjd|j_t|jj}|dusxJ d|dkr|dkr| j dkr|dkrt nd}|rd|jj| _!t"#tjt$%|}||jj| _&d|jj'_(d|jj'_)d|jj'_*d|j+j'_,d|jj'_-|S )z
    llama3.1 405b pre-train recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    sftnoneT)peft_schemeperformance_mode)	r    r!   r"   r#   compute_dtype
fp8_recipenccl_communicator_config_pathuse_user_buffer_registration	use_sharpllmllama3nullNullTokenizeri  )library
model_name
vocab_sizeNz>MegatronCommOverlapCallback missing. Required for performance.lora   fp8   F).
finetuningr   r   gpus_per_node	max_stepsr(   r)   r*   r+   r,   r   tensorboardwandbwandb_prj_namewandb_job_nameuse_hf_tokenizerr   HF_MODEL_URIdata	tokenizerrunConfigr   r   __fn_or_cls__r   r   force_redownloadr   trainer	callbackslowerr   tp_comm_overlapfdlcastfdl_dcconvert_dataclasses_to_configstp_comm_overlap_cfgconfigtp_comm_overlap_disable_qkvtp_comm_bulk_dgradtp_comm_overlap_rs_dgradoptimuse_distributed_optimizer!disable_parameter_transpose_cache)r   r   r   r   r   r   r   r   r   r    r!   r"   r#   finetuning_schemerecipecomm_overlap_callback_idxrO    rZ   a/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/finetune_llama31_405b.pyoverride_recipe_configs+   sr   





r\   __main__llama31405b   nodes_tp_pp_cp_vp_mbs_sharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkTr5   i    )b200gb200)enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newer      )
start_stepend_step)dirz>HF token is required for importing checkpoint from HuggingFacezhf://)source)executornameplugins)
sequentialdetach)[os.pathr   r   fiddlerK   $fiddle._src.experimental.dataclasses_srcexperimentaldataclassesrM   nemo_runrC   #nemo.collections.llm.gpt.data.squadr   )nemo.collections.llm.recipes.llama31_405br   r   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightning.run.pluginsr	   r
   r   argument_parserr   	executorsr   helpersr   r   r   r   utilsr   r   r   r   r@   SKIP_IMPORTstrintboolr\   __name__
parse_argsr   gpurI   r8   kwargsr   r   r   r   r   r   r   r   re   r    r!   r"   r#   rX   
exp_config__file__r(   exp_nameaccount	partitionlog_dirr9   
time_limitcontainer_imagerh   rj   rk   rl   r,   ry   r{   enable_nsysappendenable_memory_profilememory_profile_out_path
ExperimentexpadddryrunrZ   rZ   rZ   r[   <module>   s   	


^

.*
$