o
    }oi                      @   sD  d dl mZmZ d dlZd dlm  m  mZ	 d dl
Zd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ dd	lmZ dd
lmZ ddlmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z& dZ'dZ(de)de*de*de*de*de*de*de*de*de+de+de*de*fddZ,e-dkre . Z/ee/ ee/j01 e/j2dd e/Z3e3dd! \Z4Z5Z6Z7Z8Z9Z:Z;Z<Z=Z>Z?Z@e,e/e4e5e6e7e8e9e:e;e=e>e?e@ZAe4 d"e7 d#e8 d$e9 d%e: d&e5 d'e6 dZBe/j2 d&eeeCd   d&e/jD d&eB ZEee/j01 e/jFe/jGe/jHe4e/jIe/jJe/jKe/jLi e/jMe/jNe/jOe/jPrd(ndd)ZQed*e8d+krd,nde/j01 d-v d.gZRe/jSr6eRTed/d0d1 e/jUrKe/jVdusBJ eRTee/jVd2 eWeEEZXe(sle/jMdus^J d3eXjYe%eQe d4e' d5  eXjYeAeQeEeRd6 e/jZseXjd*d*d7 neXZ  W d   dS W d   dS 1 sw   Y  dS dS )8    )basenamesplitextN)SquadDataModule)finetune_recipemodel)3userbuffers_fp8_h100_h8192_tp2_mbs1_seqlen4096_lora)get_nmt_tokenizer)MemoryProfilePlugin
NsysPluginPerfEnvPlugin   )parse_cli_args)slurm_executor)args_sanity_checkget_user_configsset_exp_logging_configsset_primary_perf_configs)get_comm_overlap_callback_idxhf_tokenizerimport_ckpt_experimentisfile_train_pack_metadatazmeta-llama/Meta-Llama-3-70BFargs	num_nodesmbsgbstp_sizepp_sizecp_sizevp_sizeep_sizeenable_cuda_graphsuse_mcore_fsdprecompute_layersactivation_offload_layersc                 C   s  | j dkrdn| j }| j }|dv r|dkrt|ddd}nt|dd}t|||| j||| j||||||	|
||| j| j| j	| j
| jd	}t||d
d| j| j| j| j}| jr^tt|j_ntjtdddd|j_|jj|j_|jjtkrtt|jsd|j_t|jj }|dusJ d|dkr|dkr| j dkr|dkrt!nd}|rd|jj | _"t#$tjt%&|}||jj | _'d|jj(_)d|jj(_*d|jj(_+d|j,j(_-d|jj(_.|S )z
    llama3 70b fine-tuning recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    sftnone)gb200loraTi   )peft_schemeperformance_mode
seq_length)r(   r)   )	r    r!   r"   r#   compute_dtype
fp8_recipenccl_communicator_config_pathuse_user_buffer_registration	use_sharpllmllama3nullNullTokenizeri  )library
model_name
vocab_sizeNz>MegatronCommOverlapCallback missing. Required for performance.   fp8r   F)/
finetuninggpulowerr   r   gpus_per_node	max_stepsr+   r,   r-   r.   r/   r   tensorboardwandbwandb_prj_namewandb_job_nameuse_hf_tokenizerr   HF_MODEL_URIdata	tokenizerrunConfigr   r   __fn_or_cls__r   r   force_redownloadr   trainer	callbacksr   tp_comm_overlapfdlcastfdl_dcconvert_dataclasses_to_configstp_comm_overlap_cfgconfigtp_comm_overlap_disable_qkvtp_comm_bulk_dgradtp_comm_overlap_rs_dgradoptimuse_distributed_optimizer!disable_parameter_transpose_cache)r   r   r   r   r   r   r   r   r   r    r!   r"   r#   finetuning_schemegpu_typerecipecomm_overlap_callback_idxrQ    r]   _/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/finetune_llama3_70b.pyoverride_recipe_configs+   sx   






r_   __main__r1   70b   nodes_tp_pp_cp_vp_mbs_sharp)custom_mountscustom_env_varshf_token	nemo_home	wandb_keynetworkTr7   i    )b200r&   )enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newer      )
start_stepend_step)dirz>HF token is required for importing checkpoint from HuggingFacezhf://)source)executornameplugins)
sequentialdetach)[os.pathr   r   fiddlerM   $fiddle._src.experimental.dataclasses_srcexperimentaldataclassesrO   nemo_runrF   #nemo.collections.llm.gpt.data.squadr   'nemo.collections.llm.recipes.llama3_70br   r   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightning.run.pluginsr	   r
   r   argument_parserr   	executorsr   helpersr   r   r   r   utilsr   r   r   r   rC   SKIP_IMPORTstrintboolr_   __name__
parse_argsr   r:   r;   r9   kwargsr   r   r   r   r   r   r   r   rg   r    r!   r"   r#   r[   
exp_config__file__r+   exp_nameaccount	partitionlog_dirr<   
time_limitcontainer_imagerj   rl   rm   rn   r/   rz   r|   enable_nsysappendenable_memory_profilememory_profile_out_path
Experimentexpadddryrunr]   r]   r]   r^   <module>   s   	


e

.*
$