import os
from pathlib import Path
from typing import List, Optional

import nemo_run as run
import pandas as pd
from numpy import nan

from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes.precision.mixed_precision import (
    bf16_with_fp8_current_scaling_mixed,
    bf16_with_fp8_mixed,
    bf16_with_fp8_subchannel_scaling_mixed,
    bf16_with_mxfp8_mixed,
)
from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback
from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint
from nemo.utils import logging

from .utils import get_comm_overlap_callback_idx


def get_csv_configs(gpu: str, task: str, model_name: str, model_size: str, args) -> dict:
    """
    Get recommended configs tuned for performance from a csv file.
    User (command line) provided args override the recommended configs.
    """
    script_dir = str(Path(__file__).parent.absolute())
    recommended_configs_csv = os.path.join(script_dir, "recommended_model_configs", f"model_configs_{gpu}.csv")
    logging.info(f"Using {recommended_configs_csv} for loading default recommended model configs")

    config_df = pd.DataFrame()
    if os.path.isfile(recommended_configs_csv):
        df = pd.read_csv(recommended_configs_csv)
        df = df[
            (df["task"] == task)
            & (df["model"] == model_name)
            & (df["size"] == model_size)
            & (df["dtype"] == args.compute_dtype)
            & ((args.num_gpus is None) | (df["num_gpus"] == args.num_gpus))
        ]
        config_df = df.replace({nan: None})

    if len(config_df) == 0:
        logging.warning(f"Missing performance configs for {task}-{model_name}-{model_size}-{args.compute_dtype}")
        logging.warning("Make sure you provide all necessary arguments in the command line")

    return config_df.to_dict(orient="records")[0] if len(config_df) > 0 else {}

def get_user_configs(gpu: str, task: str, model_name: str, model_size: str, args) -> List:
    """
    Choose recommended configs tuned for performance from a csv file if available.
    User (command line) provided args override the recommended configs.

    NOTE: pre-train and PEFT recommended configs available for H100 and B200.

    Args:
        gpu (str): target GPU machine for experiment. Options- ['h100', 'b200']
        task (str): experiment task. Options- ['pre_train', 'sft', 'lora']
        model_name (str): target model for experiment. E.g.: 'llama3', 'mixtral'
        model_size (str): size of target model. E.g.: '8b' (for llama3)
    """
    config = get_csv_configs(gpu.lower(), task, model_name, model_size, args)

    if gpu.lower() == "gb200" and args.gpus_per_node != 4:
        args.gpus_per_node = 4
        logging.info("GB200 has 4 GPUs per node. Setting gpus_per_node to 4.")

    def pick(cli_value, csv_key):
        # User (command line) provided value wins; otherwise fall back to the CSV value.
        return config.get(csv_key) if cli_value is None else cli_value

    def flag(value):
        # CSV flags are stored as 0/1; missing values default to False.
        return False if value is None else bool(int(value))

    num_gpus = pick(args.num_gpus, "num_gpus")
    num_nodes = -(int(num_gpus) // -args.gpus_per_node)  # ceiling division

    mbs = pick(args.micro_batch_size, "mbs")
    gbs = pick(args.global_batch_size, "gbs")
    tp_size = pick(args.tensor_parallel_size, "tp_size")
    pp_size = pick(args.pipeline_parallel_size, "pp_size")
    cp_size = pick(args.context_parallel_size, "cp_size")
    ep_size = pick(args.expert_parallel_size, "ep_size")
    vp_size = pick(args.virtual_pipeline_parallel_size, "vp_size")
    etp_size = pick(args.expert_tensor_parallel_size, "etp_size")

    enable_cuda_graphs = flag(pick(args.cuda_graphs, "cuda_graphs"))
    use_mcore_fsdp = flag(pick(args.use_mcore_fsdp, "use_mcore_fsdp"))
    recompute_layers = int(pick(args.recompute_layers, "recompute_layers") or 0)
    activation_offload_layers = int(pick(args.activation_offload_layers, "activation_offload_layers") or 0)
    keep_fsdp_fp8_transpose_cache = flag(pick(args.keep_fsdp_fp8_transpose_cache, "keep_fsdp_fp8_transpose_cache"))
    use_user_buffer_registration = flag(pick(args.use_user_buffer_registration, "use_user_buffer_registration"))
    use_sharp = flag(pick(args.use_sharp, "use_sharp"))

    if args.recompute_modules is not None:
        recompute_modules = args.recompute_modules
        assert isinstance(recompute_modules, list), "recompute_modules must be a list"
    elif config.get("recompute_modules") is not None:
        recompute_modules = config.get("recompute_modules").split("/")
    else:
        recompute_modules = None

    kwargs = [num_nodes, mbs, gbs, tp_size, pp_size, cp_size, vp_size, ep_size, etp_size]
    kwargs = [int(arg) if arg is not None else arg for arg in kwargs]
    kwargs += [
        enable_cuda_graphs,
        use_mcore_fsdp,
        recompute_layers,
        activation_offload_layers,
        recompute_modules,
        keep_fsdp_fp8_transpose_cache,
        use_user_buffer_registration,
        use_sharp,
    ]

    logging.info("Received model parallel configs: ")
    names = (
        "num_nodes num_gpus_per_node mbs gbs tp_size pp_size cp_size vp_size ep_size etp_size "
        "enable_cuda_graphs use_mcore_fsdp recompute_layers activation_offload_layers recompute_modules "
        "keep_fsdp_fp8_transpose_cache use_user_buffer_registration use_sharp"
    ).split()
    for name, value in zip(names, [num_nodes, args.gpus_per_node] + kwargs[1:]):
        logging.info(f"{name}={value}")

    return kwargs

def set_mcore_fsdp_configs(recipe, comm_overlap_callback_idx: Optional[int], tp_size: Optional[int]):
    """
    Set Mcore FSDP related configs.
    """
    recipe.model.config.init_model_with_meta_device = True
    recipe.trainer.strategy.fsdp = "megatron"
    recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = "optim_grads_params"
    if recipe.trainer.plugins.grad_reduce_in_fp32:
        # FP32 gradient reduction is not compatible with the custom FSDP path.
        recipe.trainer.plugins.grad_reduce_in_fp32 = False
    recipe.trainer.strategy.ddp.average_in_collective = False
    recipe.model.config.gradient_accumulation_fusion = False

    if (
        comm_overlap_callback_idx is not None
        and recipe.trainer.callbacks[comm_overlap_callback_idx].defer_embedding_wgrad_compute
    ):
        logging.warning("Disabling deferring embedding wgrad compute because it cannot work with FSDP together.")
        recipe.trainer.callbacks[comm_overlap_callback_idx].defer_embedding_wgrad_compute = False

    return recipe

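# Hedged sketch of how the FSDP setter is meant to be invoked (the comm-overlap
# callback index comes from this package's own helper):
#
#   idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
#   recipe = set_mcore_fsdp_configs(recipe, idx, tp_size=1)
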
def set_precision_configs(recipe, compute_dtype: str, fp8_recipe: Optional[str] = None):
    """
    Set precision related configs.
    """
    if compute_dtype is None:
        return recipe

    if compute_dtype.lower() == "bf16":
        recipe.optim.config.use_precision_aware_optimizer = True

    if compute_dtype.lower() == "fp8":
        fp8_recipe = "ds" if fp8_recipe is None else fp8_recipe
        if fp8_recipe.lower() == "ds":
            recipe.trainer.plugins = bf16_with_fp8_mixed()
        elif fp8_recipe.lower() == "cs":
            recipe.trainer.plugins = bf16_with_fp8_current_scaling_mixed()
            recipe.trainer.plugins.first_last_layers_bf16 = False
        elif fp8_recipe.lower() == "mxfp8":
            recipe.trainer.plugins = bf16_with_mxfp8_mixed()
        elif fp8_recipe.lower() == "ss":
            recipe.trainer.plugins = bf16_with_fp8_subchannel_scaling_mixed()

    recipe.trainer.plugins.grad_reduce_in_fp32 = False

    if compute_dtype.lower() == "fp8" and fp8_recipe.lower() == "mxfp8":
        recipe.trainer.strategy.ddp.reuse_grad_buf_for_mxfp8_param_ag = True
        comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
        if comm_overlap_callback_idx is not None:
            recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather = False
            logging.warning(
                "When using MXFP8, to reduce memory usage, we use reuse_grad_buf_for_mxfp8_param_ag. "
                "Disabling AG overlap because it is not supported with reuse_grad_buf_for_mxfp8_param_ag."
            )

    return recipe

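# The short fp8_recipe codes map onto NeMo mixed-precision recipes; the
# expansions in parentheses are this editor's reading of the abbreviations:
#
#   "ds"    -> bf16_with_fp8_mixed                     (delayed scaling, default)
#   "cs"    -> bf16_with_fp8_current_scaling_mixed     (current scaling)
#   "mxfp8" -> bf16_with_mxfp8_mixed                   (MXFP8 block scaling)
#   "ss"    -> bf16_with_fp8_subchannel_scaling_mixed  (subchannel scaling)
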
def set_recompute_configs(
    recipe, recompute_layers: int, activation_offload_layers: int, recompute_modules: Optional[List[str]]
):
    """
    Set activation recomputing and offloading related configs.
    """
    if recompute_layers > 0:
        recipe.model.config.recompute_granularity = "full"
        recipe.model.config.recompute_method = "block"
        recipe.model.config.recompute_num_layers = recompute_layers

    if activation_offload_layers > 0:
        recipe.model.config.cpu_offloading = True
        recipe.model.config.cpu_offloading_weights = False
        recipe.model.config.cpu_offloading_num_layers = activation_offload_layers

    if recompute_modules is not None:
        recipe.model.config.recompute_modules = recompute_modules
        assert (
            recipe.model.config.recompute_granularity == "selective"
        ), "recompute_granularity must be selective when recompute_modules is provided"
        assert (
            recipe.model.config.recompute_num_layers is None
        ), "recompute_num_layers must be None when recompute_modules is provided"

    return recipe

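# Illustrative call: recompute activations of 4 layers ("full" granularity,
# "block" method) and stage 2 layers' activations, but not weights, in host
# memory. Values are examples only:
#
#   recipe = set_recompute_configs(
#       recipe, recompute_layers=4, activation_offload_layers=2, recompute_modules=None
#   )
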
def set_cuda_graph_configs(recipe, enable_cuda_graphs: bool, task: str):
    """
    Set CUDA graph related configs.
    """
    recipe.model.config.enable_cuda_graph = enable_cuda_graphs
    recipe.trainer.strategy.use_te_rng_tracker = enable_cuda_graphs
    if (
        task in ("none", "lora")
        and hasattr(recipe.data, "packed_sequence_specs")
        and recipe.data.packed_sequence_specs is not None
    ):
        # CUDA graphs need static shapes, so packed-sequence batches pad cu_seqlens.
        recipe.data.packed_sequence_specs.pad_cu_seqlens = enable_cuda_graphs
    return recipe

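# Example (illustrative): enabling graph capture for a LoRA run also turns on
# the Transformer Engine RNG tracker, which graph capture requires:
#
#   recipe = set_cuda_graph_configs(recipe, enable_cuda_graphs=True, task="lora")
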
def set_perf_optimization_configs(
    recipe,
    use_mcore_fsdp: bool,
    enable_cuda_graphs: bool,
    task: str,
    tp_size: Optional[int],
    compute_dtype: str,
    fp8_recipe: Optional[str],
    recompute_layers: int,
    activation_offload_layers: int,
    recompute_modules: Optional[List[str]],
    use_fsdp_double_buffer: Optional[bool] = None,
    use_user_buffer_registration: Optional[bool] = None,
    use_sharp: Optional[bool] = None,
    keep_fsdp_fp8_transpose_cache: Optional[bool] = None,
):
    """
    Set performance optimization related configs.
    """
    recipe.model.config.cross_entropy_fusion_impl = "te"

    if use_fsdp_double_buffer:
        assert use_mcore_fsdp, "use_fsdp_double_buffer requires use_mcore_fsdp to be True"

    if use_mcore_fsdp and enable_cuda_graphs:
        logging.warning("Currently, cuda graphs are not supported with FSDP. Disabling cuda graphs.")
        enable_cuda_graphs = False

    recipe = set_cuda_graph_configs(recipe, enable_cuda_graphs, task)

    if use_mcore_fsdp:
        comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
        recipe = set_mcore_fsdp_configs(recipe, comm_overlap_callback_idx, tp_size)

    recipe = set_precision_configs(recipe, compute_dtype, fp8_recipe)
    recipe = set_recompute_configs(recipe, recompute_layers, activation_offload_layers, recompute_modules)

    recipe.trainer.strategy.use_sharp = bool(use_sharp)

    is_ddp_obj = hasattr(recipe.trainer.strategy, "ddp") and not isinstance(recipe.trainer.strategy.ddp, str)
    if use_user_buffer_registration and not is_ddp_obj:
        logging.warning("DDP is not configured. Cannot use user buffer registration.")
    if is_ddp_obj:
        recipe.trainer.strategy.ddp.check_for_nan_in_grad = False
        recipe.trainer.strategy.ddp.check_for_large_grads = False
        recipe.trainer.strategy.ddp.nccl_ub = bool(use_user_buffer_registration)
        recipe.trainer.strategy.ddp.fsdp_double_buffer = bool(use_fsdp_double_buffer)
        recipe.trainer.strategy.ddp.keep_fp8_transpose_cache_when_using_custom_fsdp = bool(
            keep_fsdp_fp8_transpose_cache
        )

    return recipe

def set_primary_perf_configs(
    recipe,
    task: str,
    num_nodes: int,
    num_gpus_per_node: int,
    mbs: int,
    gbs: int,
    max_steps: int,
    tp_size: int,
    pp_size: int,
    cp_size: int,
    vp_size: Optional[int],
    ep_size: int,
    etp_size: Optional[int] = None,
    enable_cuda_graphs: bool = False,
    use_mcore_fsdp: bool = False,
    use_fsdp_double_buffer: bool = False,
    use_user_buffer_registration: bool = False,
    recompute_layers: int = 0,
    activation_offload_layers: int = 0,
    compute_dtype: Optional[str] = None,
    fp8_recipe: Optional[str] = None,
    recompute_modules: Optional[List[str]] = None,
    use_sharp: bool = False,
    nccl_communicator_config_path: Optional[str] = None,
    keep_fsdp_fp8_transpose_cache: Optional[bool] = None,
):
    """Set experiment configs we usually tune for performance of all models."""
    # nemo.lightning trainer configs
    recipe.trainer.num_nodes = num_nodes
    recipe.trainer.devices = num_gpus_per_node
    recipe.trainer.max_steps = max_steps
    recipe.trainer.val_check_interval = max_steps
    recipe.trainer.limit_val_batches = 0

    # data module configs
    recipe.data.micro_batch_size = mbs
    recipe.data.global_batch_size = gbs
    if recipe.data.__fn_or_cls__ == MockDataModule:
        # Size the mock dataset so the run never wraps around within max_steps.
        recipe.data.num_train_samples = max_steps * gbs

    # parallelism configs
    recipe.trainer.strategy.tensor_model_parallel_size = tp_size
    recipe.trainer.strategy.pipeline_model_parallel_size = pp_size
    recipe.trainer.strategy.context_parallel_size = cp_size
    recipe.trainer.strategy.virtual_pipeline_model_parallel_size = None if vp_size == 1 else vp_size
    recipe.trainer.strategy.expert_model_parallel_size = ep_size
    recipe.trainer.strategy.expert_tensor_parallel_size = etp_size
    recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1)
    if nccl_communicator_config_path is not None:
        recipe.trainer.strategy.nccl_communicator_config_path = nccl_communicator_config_path

    comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks)
    dp_size = (num_nodes * num_gpus_per_node) // (tp_size * pp_size * cp_size)
    if comm_overlap_callback_idx is not None:
        # Overlapping param gather with the optimizer step needs interleaved
        # pipelining and more than one data-parallel replica.
        recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool(
            pp_size > 1 and vp_size is not None and vp_size > 1 and dp_size > 1
        )

    recipe = set_perf_optimization_configs(
        recipe=recipe,
        use_mcore_fsdp=use_mcore_fsdp,
        enable_cuda_graphs=enable_cuda_graphs,
        task=task,
        tp_size=tp_size,
        compute_dtype=compute_dtype,
        fp8_recipe=fp8_recipe,
        recompute_layers=recompute_layers,
        activation_offload_layers=activation_offload_layers,
        recompute_modules=recompute_modules,
        use_fsdp_double_buffer=use_fsdp_double_buffer,
        use_user_buffer_registration=use_user_buffer_registration,
        use_sharp=use_sharp,
        keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache,
    )
    return recipe

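# End-to-end sketch of how the setters compose (recipe construction and all
# values are illustrative; `llm.llama3_8b` is just one example recipe source):
#
#   recipe = llm.llama3_8b.pretrain_recipe(num_nodes=1, num_gpus_per_node=8)
#   recipe = set_primary_perf_configs(
#       recipe, task="pre_train", num_nodes=1, num_gpus_per_node=8,
#       mbs=1, gbs=128, max_steps=50,
#       tp_size=1, pp_size=1, cp_size=2, vp_size=None, ep_size=1,
#       compute_dtype="fp8", fp8_recipe="cs",
#   )
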
def set_exp_logging_configs(
    recipe,
    task: str,
    domain: str,
    model_name: str,
    enable_tb: bool,
    enable_wd: bool,
    wandb_prj_name: str,
    wandb_job_name: str,
):
    """Set experiment logging configs."""
    if task == "pre_train" and domain == "llm":
        recipe.trainer.callbacks.append(
            run.Config(
                FLOPsMeasurementCallback,
                model_config=recipe.model.config,
                data_config=recipe.data,
                model_name=model_name,
            )
        )

    if not enable_tb:
        recipe.log.tensorboard = None
        recipe.trainer.logger = False
    else:
        recipe.log.log_dir = "/nemo_run/lightning_logs"

    if enable_wd:
        from nemo.collections.llm.recipes.log.default import wandb_logger

        recipe.log.wandb = wandb_logger(project=wandb_prj_name, name=wandb_job_name)

    # Disable checkpointing unless a ModelCheckpoint callback is explicitly configured.
    recipe.log.ckpt = None
    callbacks = recipe.trainer.callbacks
    checkpoint_callback_idx = None
    if callbacks:
        for idx, callback in enumerate(callbacks):
            if callback.__fn_or_cls__ == ModelCheckpoint:
                checkpoint_callback_idx = idx
                break
    recipe.trainer.enable_checkpointing = checkpoint_callback_idx is not None
    recipe.trainer.log_every_n_steps = 1

    return recipe

def args_sanity_check(args) -> None:
    """
    Check the sanity of argument settings
    """
    if args.wandb:
        assert args.wandb_key is not None, 'wandb logger needs "wandb_key"'
        assert args.wandb_prj_name is not None, 'wandb logger needs "wandb_prj_name"'
        assert args.wandb_job_name is not None, 'wandb logger needs "wandb_job_name"'