o
    wi-                     @   s,   d dl Z d dlmZ ddlmZ dd ZdS )    N)get_nemorun_home   )DEFAULT_NEMO_HOMEc               
   C   s(  t jdd} | jddtddd | jdd	td
dd | jddtg dddd | jddtdt  dt d | jddtdddd ddg}| jddtd|ddd | jdd td!d"gd#dd!d$ d%}| jd&d'tg d(|dd)d$ | jd*d+d,d-d. | jd/d0d1d-d. | jd2d3td4dd5d | jd6d7d8d-d. | jd9d:d;d-d. | jd<d=td>dd5d | jd?d@tdAdd5d | jdBdCtdDdd5d | jdEdFdGdHgdIdHdJ | jdKdLtdMd5dN dOdPdQt g}| jdRdStd|tdN | jdTdUdVdd-dW | jdXdYtdZdd5d | jd[d\td]dd5d | jd^d_td`dd5d | jdadbtdcdd5d | jdddetdfdd5d | jdgdhdidj dkd5dldd5dm | jdndotdd5dp | jdqdrtdd5dp | jdsdttdudd5d | jdvdwtdxddyd | jdzd{td|dd}d d~d }| jddd|dd5d | jddd|dd5d | jddd|dd5d | jddd|dd5d | jddd|dd5d | jddtddd5d | jddtddd5d | jdtddd5d dd }| jdddd5tddd5d | jdd|ddg d | jddd-dd | jdddd-ddd | jdd|dd5d | jddd|dd5d | S )z
    Command line arguments correspong to Slurm cluster and NeMo2.0 for running pre-training and
    fine-tuning experiments.
    z/NeMo2.0 Performance Pretraining and Fine-Tuning)descriptionz-az	--accountz#Slurm account to use for experimentT)typehelprequiredz-pz--partitionz%Slurm partition to use for experimentz-gz--gpu)h100b200gb200zTarget gpu type.)r   choicesr   r   z-lz	--log_dirz6Directory for logging experiment results. Defaults to F)r   r   r   defaultz-tz--time_limitzUMaximum time limit to run experiment for. Defaults to 30 minutes (format- 'HH:MM:SS')z00:30:00zaNeMo container to use for experiment. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'zBMake sure your NGC credentials are accessible in your environment.z-iz--container_image znvcr.io/nvidia/nemo:devz-cz--compute_dtypebf16fp8z9Compute precision. Options- bf16 or fp8. Defaults to bf16)r   r   r   r   r   zFP8 recipe. Options- ds (per-tensor delayed scaling), cs (per-tensor current scaling), mxfp8, ss (subchannel scaling). Defaults to dsz-frz--fp8_recipe)dscsmxfp8ssr   z-enz--enable_nsysz)Enable Nsys profiling. Diabled by default
store_true)r   actionz-emz--enable_memory_profilez1Enable memory usage profiling. Diabled by defaultz-mpz--memory_profile_out_pathz+Path to the output file of memory profilingNz-tbz--tensorboardz/Enable tensorboard logging. Disabled by defaultz-wdz--wandbz)Enable wandb logging. Disabled by defaultz-wdkz--wandb_keyz6wandb key. Needed for wandb logger projetion to serverz-wdpz--wandb_prj_namezwandb project namez-wdjz--wandb_job_namezwandb job namez-fz--finetuningsftloraz,Finetuning scheme to use. Defaults to 'lora')r   r   r   z-hfz
--hf_tokenzWHuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.)r   r   r   z]Sets env var `NEMO_HOME` (on compute node using sbatch script)- directory where NeMo searcheszjfor models and checkpoints. This saves a lot of time (especially for bigger models) if checkpoints alreadyzPexist here. Missing files will be downloaded here from HuggingFace. Defaults to z-nhz--nemo_homez-dz--dryrunzGIf true, prints sbatch script to terminal without launching experiment.)r   r   r   z-tpz--tensor_parallel_sizez?Intra-layer model parallelism. Splits tensors across GPU ranks.z-ppz--pipeline_parallel_sizezJInter-layer model parallelism. Splits transformer layers across GPU ranks.z-cpz--context_parallel_sizez?Splits network input along sequence dimension across GPU ranks.z-vpz --virtual_pipeline_parallel_sizez]Number of virtual blocks per pipeline model parallel rank is the virtual model parallel size.z-epz--expert_parallel_sizez;Distributes Moe Experts across sub data parallel dimension.z-etz--expert_tensor_parallel_sizec                 S   s   | d urt | S d S )N)int)x r   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/scripts/performance/argument_parser.py<lambda>   s    z parse_cli_args.<locals>.<lambda>?zIntra-layer tensor model parallelsm for expert layer. Splits tensors across GPU ranks.            Use -et/--expert_tensor_parallel_size <space> for None or -et/--expert_tensor_parallel_size <int>)r   nargsconstr   r   r   z-mbz--micro_batch_size)r   r   r   z-gbz--global_batch_sizez-ngz
--num_gpuszNumber of gpus.z-gnz--gpus_per_nodez&Number of gpus per node. Defaults to 8   z-msz--max_stepsz&Number of train steps. Defaults to 100d   c                 S   s.   |   dv rdS |   dv rdS td|  )N)true1tyesyT)false0fnonFz$Invalid value for boolean argument: )lower
ValueErrorargr   r   r   bool_arg  s
   z parse_cli_args.<locals>.bool_argz-cgz--cuda_graphsz'Enable CUDA graphs. Disabled by default)r   r   r   r   z-fsdpz--use_mcore_fsdpz6Enable Megatron Core (Mcore) FSDP. Disabled by defaultz-fsdp_dbz--use_fsdp_double_bufferz.Enable FSDP double buffer. Disabled by defaultz-ubrz--use_user_buffer_registrationz4Enable user buffer registration. Disabled by defaultz-sharpz--use_sharpz!Enable sharp. Disabled by defaultz-rlz--recompute_layerszNumber of Transformer layers to recompute, where all the intermediate activations of a Transformer layer are computed. Defaults to Nonez-olz--activation_offload_layerszKNumber of Transformer layers to offload to the CPU memory. Defaults to Nonez--nccl_communicator_config_pathz*Path to NCCL communicator config yaml filec                 S   s
   |  dS )N,)splitr/   r   r   r   list_of_stringsa  s   
z'parse_cli_args.<locals>.list_of_stringsz-rmz--recompute_modules*z{List of modules to perform selective activation recompute. Users can provide 0 or any number of arguments. Defaults to None)r   r    r   r   r   r   z-cmz--custom_mountsz Comma separated string of mountsz--use_hf_tokenizerz\Use HuggingFace tokenizer. Disabled by default. Null tokenizer will be used if not provided.)r   r   r   z-dcdfrz#--dump_config_diff_from_base_recipez<Dump the config diff from the base recipe. Defaults to False)r   r   r   r   z--keep_fsdp_fp8_transpose_cachez2Keep FSDP FP8 transpose cache. Disabled by defaultz-vbz--enable_vboostzOEnable VBoost which steers more power towards tensor cores. Disabled by default)argparseArgumentParseradd_argumentstrr   joinr   r   )parsercontainer_img_msgfp8_recipe_msgnemo_home_msgr1   r4   r   r   r   parse_cli_args   s  
	
				r?   )r6   nemo_run.configr   utilsr   r?   r   r   r   r   <module>   s   