o
    wi`                  2   @   s  d dl Z d dlmZ d dlmZmZ d dlZd dlZ	d dl
mZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d d	lmZ d
dlmZ dedededede	jf
ddZdededededee f
ddZdedB dedB fddZ dIdededB fddZ!dededeee  fd d!Z"d"e#defd#d$Z$d%edB d&edB fd'd(Z%				dJd)e#d"e#dededB d%edB d&edB dededB dededeee  d*ee# d+ee# d,ee# d-ee# fd.d/Z&		0	0				 	 					dKded1ed2ed3ed4ed5eded%ed6ed&ed7ed8ee d"e#d)e#d*ee# d+ee# d,ee# dededededeee  d9ed-ee# f0d:d;Z'ded<eded=e#d>e#d?ed@efdAdBZ(dCe)ddfdDdEZ*dLd%edB dFee# fdGdHZ+dS )M    N)Path)ListOptional)nan)MockDataModule)#bf16_with_fp8_current_scaling_mixedbf16_with_fp8_mixed&bf16_with_fp8_subchannel_scaling_mixedbf16_with_mxfp8_mixed)FLOPsMeasurementCallback)ModelCheckpoint)logging   )get_comm_overlap_callback_idxgputask
model_name
model_sizereturnc           
   
   C   s  t ttj }tj|dd|  d}t	d| d t
 }tj|rxt
|}||d |k|d |k@ |d |k@ |d	 |jk@ |jd
u pQ|d |jk@  }|td
i}t|dkrxtd| d| d| d|j  td t|dkr|jddd }	|	S i }	|	S )z
    Get recommended configs tuned for performance from a csv file.
    User (command line) provided args override the recommended configs.
    recommended_model_configsmodel_configs_z.csvzUsing z. for loading default recommended model configsr   modelsizedtypeNnum_gpusr   z Missing performance configs for -zAMake sure you provide all necessary arguments in the command linerecords)orient)strr   __file__parentabsoluteospathjoinr   infopd	DataFrameisfileread_csvcompute_dtyper   replacer   lenwarningto_dict)
r   r   r   r   args
script_dirrecommended_configs_csv	config_dfdfconfig r5   X/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/scripts/performance/helpers.pyget_csv_configs%   s2   



$
r7   c              	   C   sR  t |  ||||}|  dkr|jdkrd|_td |jdu r'|dn|j}||j   }|jdu r;|dn|j}|jdu rH|dn|j}	|j	du rU|dn|j	}
|j
du rb|d	n|j
}|jdu ro|d
n|j}|jdu r||dn|j}|j}|du r|dn|}|j}|du r|dn|}|jdu r|dn|j}|du rdntt|}|jdu r|dn|j}|du rdntt|}|jdu r|dn|j}|du rdnt|}|jdu r|dn|j}|du rdnt|}|jdur|j}t|tsJ dn|ddur |dd}nd}|jdu r-|dn|j}|du r7dntt|}|jdu rH|dn|j}|du rRdntt|}|jdu rc|dn|j}|du rmdntt|}|||	|
|||||f	}dd |D }|||||||||g7 }td td| td|j  td| td |	 td!|
 td"| td#| td$| td%| td&| td'| td(| td)| td*| td+| td,| td-| td.| |S )/a.  
    Choose recommended configs tuned for performance from a csv file if available.
    User (command line) provided args override the recommended configs.

    NOTE: pre-train and PEFT recommended configs available for H100 and B200.

    Args:
        gpu (str): target GPU machine for experiment. Options- ['h100', 'b200']
        task (str): experiment task. Options- ['pre_train', 'sft', 'lora']
        model_name (str): target model for experiment. E.g.: 'llama3', 'mixtral'
        model_size (str): size of target model. E.g.: '8b' (for llama3)
    gb200   z6GB200 has 4 GPUs per node. Setting gpus_per_node to 4.Nr   mbsgbstp_sizepp_sizecp_sizeep_sizevp_sizeetp_sizecuda_graphsFuse_mcore_fsdprecompute_layersr   activation_offload_layersz recompute_modules must be a listrecompute_modules/keep_fsdp_fp8_transpose_cacheuse_user_buffer_registration	use_sharpc                 S   s    g | ]}|d urt |n|qS N)int).0argr5   r5   r6   
<listcomp>   s     z$get_user_configs.<locals>.<listcomp>z!Received model parallel configs: z
num_nodes=znum_gpus_per_node=zmbs=zgbs=ztp_size=zpp_size=zcp_size=zvp_size=zep_size=z	etp_size=zenable_cuda_graphs=zuse_mcore_fsdp=zrecompute_layers=zactivation_offload_layers=zrecompute_modules=zkeep_fsdp_fp8_transpose_cache=zuse_user_buffer_registration=z
use_sharp=)r7   lowergpus_per_noder   r-   r   getmicro_batch_sizeglobal_batch_sizetensor_parallel_sizepipeline_parallel_sizecontext_parallel_sizeexpert_parallel_sizevirtual_pipeline_parallel_sizeexpert_tensor_parallel_sizerB   boolrL   rC   rD   rE   rF   
isinstancelistsplitrH   rI   rJ   r%   )r   r   r   r   r/   r4   r   	num_nodesr:   r;   r<   r=   r>   r?   r@   rA   enable_cuda_graphsrC   rD   rE   rF   rH   rI   rJ   kwargsr5   r5   r6   get_user_configsB   s   





rb   comm_overlap_callback_idxr<   c                 C   s   d| j j_d| jj_d| jjj_| jjj	rd| jjj_
d| jjj_zd| jjj_W n ty=   d| jjj_td Y nw d| j j_|durZ| jj| jrZtd d| jj| _| S )z)
    Set Mcore FSDP related configs.
    Tmegatronoptim_grads_paramsFDeprecation Notice: `keep_fp8_transpose_cache_when_using_custom_fsdp` will be deprecated in M-Core 0.14. Please use `keep_fsdp_fp8_transpose_cache` instead.NzVDisabling deferring embedding wgrad compute because it cannot work with FSDP together.)r   r4   init_model_with_meta_devicetrainerstrategyfsdpddpdata_parallel_sharding_strategypluginsgrad_reduce_in_fp32average_in_collectivekeep_fp8_transpose_cacheAttributeError/keep_fp8_transpose_cache_when_using_custom_fsdpr   r-   gradient_accumulation_fusion	callbacksdefer_embedding_wgrad_compute)reciperc   r<   r5   r5   r6   set_mcore_fsdp_configs   s*   




rw   r*   
fp8_recipec                 C   s   |du r| S |  dkrd| jj_|durU|  dkrU|du r!d}|  dkr-t | j_n(|  dkr>t | j_d| jj_n|  dkrJt	 | j_n|  d	krUt
 | j_d| jj_|  dkr||  dkr|t| jj}|durwd| jj| _td
 | S )z(
    Set precision related configs.
    Nbf16Tfp8dscsFmxfp8sszWhen using MXFP8, to reduce memory usage, we use reuse_grad_buf_for_mxfp8_param_ag. Disabling AG overlap because it is not supported with reuse_grad_buf_for_mxfp8_param_ag.)rP   optimr4   use_precision_aware_optimizerr   rh   rm   r   first_last_layers_bf16r
   r	   rn   r   rt   overlap_param_gatherr   r-   )rv   r*   rx   rc   r5   r5   r6   set_precision_configs   s2   



r   rD   rE   rF   c                 C   s   |dkrd| j j_d| j j_|| j j_|dkr&d| j j_d| j j_|| j j_|durE|| j j_| j jjdks:J d| j jjdu sEJ d	| S )
zD
    Set activation recomputing and offloading related configs.
    r   fullblockTFN	selectivezJrecompute_granularity must be selective when recompute_modules is providedzDrecompute_num_layers must be None when recompute_modules is provided)	r   r4   recompute_granularityrecompute_methodrecompute_num_layerscpu_offloadingcpu_offloading_weightscpu_offloading_num_layersrF   )rv   rD   rE   rF   r5   r5   r6   set_recompute_configs   s"   	






r   r`   c                 C   sB   || j j_|| jj_|dv rt| jdr| jjdur|| jj_	| S )z)
    Set CUDA graph related configs.
    )nonelorapacked_sequence_specsN)
r   r4   enable_cuda_graphrh   ri   use_te_rng_trackerhasattrdatar   pad_cu_seqlens)rv   r`   r   r5   r5   r6   set_cuda_graph_configs  s   



r   r=   r@   c                 C   sn  t | jdrt | jjdr| jjjdks| S g }| jjjjdkr0d| jjj_|d t	
d |rb|dkrb| jjjdkrLd| jj_|d t	
d	 | jjjdkrbd| jj_|d
 t	
d t| jj}|dur| jj| }|r|dkr|jdkrd|_|d t	
d |r|dkr|jdkrd|_|d t	
d nt	
d |rt	dd|  | S )za
    Set optimizations required for full iteration CUDA graphs based on specific conditions.
    r4   cuda_graph_scopefull_iterationFzcheck_for_nan_in_grad=FalsezHFor full iteration CUDA graphs, we need to disable check_for_nan_in_gradr   zvariable_seq_lengths=FalsezGFor full iteration CUDA graphs, we need to disable variable_seq_lengthszbatch_p2p_sync=FalsezAFor full iteration CUDA graphs, we need to disable batch_p2p_syncNzbatch_p2p_comm=FalsezSFor full iteration CUDA graphs, disabling batch_p2p_comm would improve memory usagez.overlap_param_gather_with_optimizer_step=Falsez[For full iteration CUDA graphs, we need to disable overlap_param_gather_with_optimizer_stepzAMegatronCommOverlapCallback not found in recipe.trainer.callbacksz1Applied full iteration CUDA graph optimizations: z, )r   r   r4   r   rh   ri   rk   check_for_nan_in_gradappendr   r-   variable_seq_lengthsbatch_p2p_syncr   rt   batch_p2p_comm(overlap_param_gather_with_optimizer_stepr%   r$   )rv   r=   r@   cuda_graph_configsrc   callbackr5   r5   r6   %set_full_iteration_cuda_graph_configs+  sP   














r   rC   use_fsdp_double_bufferrI   rJ   rH   c                 C   sL  d| j j_|r|dksJ d|r|rtd d}t| ||} |r(t| ||} |r6t| jj	}t
| ||} t| ||} t| |	|
|} t|| jj_t| jjdoYt| jjjt }|rc|sctd |rd| jjj_d| jjj_t|| jjj_t|| jjj_zt|| jjj_W | S  ty   t|| jjj_td Y | S w | S )	z7
    Set performance optimization related configs.
    teTz9use_fsdp_double_buffer requires use_mcore_fsdp to be TruezJCurrently, cuda graphs are not supported with FSDP. Disabling cuda graphs.Frk   z;DDP is not configured. Cannot use user buffer registration.rf   )r   r4   cross_entropy_fusion_implr   r-   r   r   r   rh   rt   rw   r   r   r[   ri   rJ   r   r\   rk   r   r   check_for_large_gradsnccl_ubfsdp_double_bufferrp   rq   rr   )rv   rC   r`   r   r<   r=   r@   r*   rx   rD   rE   rF   r   rI   rJ   rH   rc   
is_ddp_objr5   r5   r6   set_perf_optimization_configsc  sH   

 

r   Fr_   num_gpus_per_noder:   r;   	max_stepsr>   r?   rA   nccl_communicator_config_pathc                 C   s|  || j _|| j _|| j _|| j _d| j _|| j_|| j_| jj	t
kr(|| | j_|| j j_|| j j_|	| j j_|
dkr=dn|
| j j_|| j j_|| j j_t|dk| j j_|dur^|| j j_t| j j}|| || |	  }|durt|dko|dko|
o|
dk| j j| _tdi d| d|d|d|d|d	|d
|
d|d|d|d|d|d|d|d|d|} | S )zESet experiment configs we usually tune for performance of all models.r   r   Nrv   rC   r`   r   r<   r=   r@   r*   rx   rD   rE   rF   r   rI   rJ   rH   r5   )rh   r_   devicesr   val_check_intervallimit_val_batchesr   rS   rT   __fn_or_cls__r   num_train_samplesri   tensor_model_parallel_sizepipeline_model_parallel_sizerW   $virtual_pipeline_model_parallel_sizeexpert_model_parallel_sizerZ   r[   sequence_parallelr   r   rt   r   r   )rv   r   r_   r   r:   r;   r   r<   r=   r>   r@   r?   rA   r`   rC   r   rI   rJ   rD   rE   r*   rx   rF   r   rH   rc   dp_sizer5   r5   r6   set_primary_perf_configs  st   





	
r   domain	enable_tb	enable_wdwandb_prj_namewandb_job_namec                 C   s   |dkr|dkr| j jtjt| jj| j|d |s$d| j	_
d| j _nd| j	_|r8ddlm} |||d	| j	_d| j	_| j j}	d}
|	rVt|	D ]\}}|jtkrU|}
 nqH|
du| j _d
| j _| S )zSet experiment logging configs.	pre_trainllm)model_configdata_configr   NFz/nemo_run/lightning_logsr   )wandb_logger)projectnamer   )rh   rt   r   runConfigr   r   r4   r   logtensorboardloggerlog_dir(nemo.collections.llm.recipes.log.defaultr   wandbckpt	enumerater   r   enable_checkpointinglog_every_n_steps)rv   r   r   r   r   r   r   r   r   rt   checkpoint_callback_idxidxr   r5   r5   r6   set_exp_logging_configs  s8   	

r   r/   c                 C   sD   | j r| jdusJ d| jdusJ d| jdus J ddS dS )z/
    Check the sanity of argument settings
    Nzwandb logger needs "wandb_key"z#wandb logger needs "wandb_prj_name"z#wandb logger needs "wandb_job_name")r   	wandb_keyr   r   )r/   r5   r5   r6   args_sanity_check-  s   r   user_buffer_registrationc           	      C   sf   ddl m} t| dd }| j}|dv }|dur|dkrdnd}|dur)t|nd	}|||||d
S )a  
    Create a PerfEnvPlugin with consistent defaults across scripts.

    - enable_vboost only when gpu is h100
    - set nccl_pp_comm_chunksize when pipeline parallelism is used
    - set gpu_sm100_or_newer when gpu is in ['b200', 'gb200']

    Args:
        args: Parsed CLI args that include `gpu`.
        pp_size: Pipeline parallel size to decide comm chunk size.
        user_buffer_registration: Optional flag to enable user buffer registration.
    r   )PerfEnvPluginr    )b200r8   Nr   i    F)enable_vboostnccl_pp_comm_chunksizegpu_sm100_or_newerr   )nemo.lightning.run.pluginsr   getattrrP   r   r[   )	r/   r=   r   r   gpu_strr   r   r   user_bufr5   r5   r6   build_perf_env_plugin7  s   r   rK   )NNNN)NFFNNNr   r   NNNNN)NN),r"   pathlibr   typingr   r   nemo_runr   pandasr&   numpyr   "nemo.collections.llm.gpt.data.mockr   6nemo.collections.llm.recipes.precision.mixed_precisionr   r   r	   r
   /nemo.lightning.pytorch.callbacks.flops_callbackr   1nemo.lightning.pytorch.callbacks.model_checkpointr   
nemo.utilsr   utilsr   r   r'   r7   rL   rb   rw   r   r   r[   r   r   r   r   r   dictr   r   r5   r5   r5   r6   <module>   s   "p (

!E	


R	


T
1 
