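"""
Performance-focused fine-tuning (SFT/PEFT) script for Llama3 8B, launched on a Slurm
cluster through NeMo-Run.

The invocation below is only an illustrative sketch: the exact flag spellings and
defaults are defined by this package's ``parse_cli_args``, and the account/partition
values are placeholders.

    python -m scripts.performance.llm.finetune_llama3_8b \
        --gpu h100 --finetuning lora --compute_dtype bf16 \
        --account <slurm_account> --partition <slurm_partition> --log_dir <path>
"""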
from os.path import basename, splitext

import nemo_run as run

from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin

from ..argument_parser import parse_cli_args
from ..executors import slurm_executor
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
from ..utils import hf_tokenizer, import_ckpt_experiment, isfile_train_pack_metadata

HF_MODEL_URI = "meta-llama/Meta-Llama-3-8B"

SKIP_IMPORT = False

def override_recipe_configs(
    args,
    num_nodes: int,
    mbs: int,
    gbs: int,
    tp_size: int,
    pp_size: int,
    cp_size: int,
    vp_size: int,
    ep_size: int,
    enable_cuda_graphs: bool,
):
    """
    llama3 8b fine-tuning recipe aimed at achieving best possible performance.

    NOTE: Use fp8 precision training with caution. It might not give desirable results.
    """
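    # Build the base finetune recipe first, then layer on the performance overrides
    # (parallelism sizes, precision, communicator settings) and experiment logging.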
    sftnoneb200gb200Ti @  )peft_schemeperformance_mode
seq_length)r$   r%   )r   compute_dtype
fp8_recipenccl_communicator_config_pathuse_user_buffer_registration	use_sharpllmllama3nullNullTokenizeri  )library
model_name
vocab_size)"
finetuninggpulowerr   r   gpus_per_node	max_stepsr'   r(   r)   r*   r+   r   tensorboardwandbwandb_prj_namewandb_job_nameuse_hf_tokenizerr   HF_MODEL_URIdata	tokenizerrunConfigr   r   __fn_or_cls__r   r   force_redownloadoptimconfiguse_distributed_optimizer!disable_parameter_transpose_cache)r   r   r   r   r   r   r   r   r   r   finetuning_schemegpu_typerecipe rK   ^/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/llm/finetune_llama3_8b.pyoverride_recipe_configs&   s\   



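# Script entry point: parse the CLI arguments, build the recipe, and launch it on
# Slurm via NeMo-Run, optionally preceded by a one-time HF -> NeMo checkpoint import.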
if __name__ == "__main__":
    args = parse_cli_args().parse_args()
    args_sanity_check(args)

    kwargs = get_user_configs(args.gpu.lower(), args.finetuning, "llama3", "8b", args)
    num_nodes, mbs, gbs, tp_size, pp_size, cp_size, vp_size, ep_size, _, enable_cuda_graphs = kwargs[0:10]

    recipe = override_recipe_configs(
        args, num_nodes, mbs, gbs, tp_size, pp_size, cp_size, vp_size, ep_size, enable_cuda_graphs
    )

    exp_config = f"{num_nodes}nodes_tp{tp_size}_pp{pp_size}_cp{cp_size}_vp{vp_size}_{mbs}mbs_{gbs}gbs"
    exp_name = f"{args.finetuning}_{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}"

    executor = slurm_executor(
        args.gpu.lower(),
        args.account,
        args.partition,
        args.log_dir,
        num_nodes,
        args.gpus_per_node,
        args.time_limit,
        args.container_image,
        custom_mounts=args.custom_mounts,
        custom_env_vars={},
        hf_token=args.hf_token,
        nemo_home=args.nemo_home,
        wandb_key=args.wandb_key,
        network="sharp" if args.use_sharp else None,
    )

    plugins = [
        PerfEnvPlugin(
            enable_vboost=True,
            nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
            gpu_sm100_or_newer=(args.gpu.lower() in ["b200", "gb200"]),
        )
    ]
    if args.enable_nsys:
        # Profile a short window of training steps.
        plugins.append(NsysPlugin(start_step=5, end_step=6))
    if args.enable_memory_profile:
        assert args.memory_profile_out_path is not None
        plugins.append(MemoryProfilePlugin(dir=args.memory_profile_out_path))

    with run.Experiment(exp_name) as exp:
        if not SKIP_IMPORT:
            assert args.hf_token is not None, "HF token is required for importing checkpoint from HuggingFace"
            exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}"))

        exp.add(
            recipe,
            executor=executor,
            name=exp_name,
            plugins=plugins,
        )

        if not args.dryrun:
            exp.run(sequential=True, detach=True)
        else:
            exp.dryrun()
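# With the dryrun flag set, exp.dryrun() only prints what would be scheduled
# instead of submitting the jobs to the cluster.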