o
    Ti/8                     @   s  d Z dZdZdZdZdZdZdZdZd	Z	d
Z
dZdZeZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdgZdZ dgZ!dZ"dZ#dZ$dZ%d Z&dZ'd!Z(d"Z)d#Z*dZ+d$Z,dZ-d!Z.d%Z/d&Z0dZ1d'Z2d(Z3d)Z4dZ5d*Z6d+Z6dZ7d,Z8d-Z9dZ:d.Z;dZ<d/Z=d0Z>d1Z?d2Z@dZAd3ZBd(ZCd4ZDd5ZEd2ZFdZGd6ZHdZId7ZJdZKd8ZLdZMd9ZNd:ZOd;ZPd<ZQd=ZRdZSd>ZTdZUd?ZVdZWd@ZXdAZYd2ZZdZ[dBZ\dCZ]dDZ^dEZ_dFZ`dZadGZbdHZcdZddIZedJZfdKZgdLZhdMZidZjdNZkdOZldPZmdQZndRZodZpdSZqdTZrdZsdUZtdVZudZvdWZwdXZxdZydYZzdZ{dZZ|d[Z}d2Z~dZd\ZdZd]Zd^Zd_Zd`ZdaZdbZdcZdZddZdeZdfZdZdgZd2ZdZdhZdPZdiZdjZG dkdl dlZdmZdnZejZejejejgZdoZdZdpZdZdqZdrZdZdsZdtZdZduZdvZdZdwZdxZdyZdzZdZdS ){trainevalpredictencodetrain_batch_sizeNsparse_attentiondensefixedvariablebigbirdbslongformermodeblock   different_layout_per_headFnum_local_blocks   num_global_blocks   	attentionbidirectionalhorizontal_global_attentionnum_different_global_patternsnum_random_blocks    local_window_blocksglobal_block_indicesglobal_block_end_indicesnum_sliding_window_blocks   	optimizerparamstypelegacy_fusion	schedulermax_grad_normzero_allow_untested_optimizerzero_force_ds_cpu_optimizerTsteps_per_printz_
TRAIN_MICRO_BATCH_SIZE_PER_GPU is defined in this format:
"train_micro_batch_size_per_gpu": 1
train_micro_batch_size_per_gpuzQ
Gradient Accumulation should be of the format:
"gradient_accumulation_steps": 1
gradient_accumulation_stepssparse_gradientszL
BFLOAT16 parameters should be of the format:
"bf16": {
  "enabled": true
}
bf16bfloat16enabledimmediate_grad_updatez
FP16 parameters should be of the format:
"fp16": {
  "enabled": true,
  "auto_cast": false,
  "loss_scale": 0,
  "initial_scale_power": 16,
  "loss_scale_window": 1000,
  "hysteresis": 2,
  "consecutive_hysteresis": false,
  "min_loss_scale": 1
}
fp16
loss_scale	auto_castinitial_scale_powerloss_scale_windowi  
hysteresis   consecutive_hysteresismin_loss_scalefp16_master_weights_and_gradsz8
"amp" {
  "enabled: true,
  "opt_level": "O1",
  ...
}
ampzB
Gradient clipping should be enabled as:
"gradient_clipping": 1.0
gradient_clippingg        zA
Graph harvesting should be enabled as:
"graph_harvesting": true
graph_harvestingzM
Communication data type should be set as:
"communication_data_type": "fp32"
communication_data_typezm
Optional comm data type for seq paralleism should be set as:
"seq_parallel_communication_data_type": "fp32"
$seq_parallel_communication_data_typefp32zF
Gradient prescaling should be enabled as:
"prescale_gradients": true
prescale_gradientszR
Gradient predivide factor should be enabled as:
"gradient_predivide_factor": 1.0
gradient_predivide_factorg      ?zC
Disable AllGather should be enabled as:
"disable_allgather": true
disable_allgatherz5
Dump state should be enabled as:
"dump_state": true

dump_statez>
Vocabulary size can be specified as:
"vocabulary_size": 1024
vocabulary_sizezI
Wall block breakdown should be enabled as:
"wall_clock_breakdown": true
wall_clock_breakdownmemory_breakdownz
Tensorboard can be specified as:
"eigenvalue": {
  "enabled": true,
  "verbose": true,
  "max_iter": 100,
  "tol": 1e-2,
  "stability": 1e-6
}

eigenvalueverbosemax_iterd   tolg{Gz?	stabilitygư>gas_boundary_resolution
layer_namezbert.encoder.layer	layer_numprogressive_layer_dropthetagammagMbP?c                   @   s   e Zd ZdZdZdZdS )ValidationModeWARNIGNOREFAILN)__name__
__module____qualname__rS   rT   rU    rY   rY   O/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/constants.pyrR     s    rR   
checkpointtag_validationload_universaluse_node_local_storageparallel_writepipeline_stage
data_typesgrad_accum_dtypezS
The last incomplete batch can be dropped by setting:
"dataloader_drop_last": True
dataloader_drop_lastds_pipe_replicateddata_parallel_groupglobal_rank"use_data_before_expert_parallelism)ROUTE_TRAIN
ROUTE_EVALROUTE_PREDICTROUTE_ENCODETRAIN_BATCH_SIZETRAIN_BATCH_SIZE_DEFAULTSPARSE_ATTENTIONSPARSE_DENSE_MODESPARSE_FIXED_MODESPARSE_VARIABLE_MODESPARSE_BIGBIRD_MODESPARSE_BSLONGFORMER_MODESPARSE_MODESPARSE_MODE_DEFAULTSPARSE_BLOCKSPARSE_BLOCK_DEFAULT SPARSE_DIFFERENT_LAYOUT_PER_HEAD(SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULTSPARSE_NUM_LOCAL_BLOCKSSPARSE_NUM_LOCAL_BLOCKS_DEFAULTSPARSE_NUM_GLOBAL_BLOCKS SPARSE_NUM_GLOBAL_BLOCKS_DEFAULTSPARSE_ATTENTION_TYPESPARSE_ATTENTION_TYPE_DEFAULT"SPARSE_HORIZONTAL_GLOBAL_ATTENTION*SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT$SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS,SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULTSPARSE_NUM_RANDOM_BLOCKS SPARSE_NUM_RANDOM_BLOCKS_DEFAULTSPARSE_LOCAL_WINDOW_BLOCKS"SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULTSPARSE_GLOBAL_BLOCK_INDICES#SPARSE_GLOBAL_BLOCK_INDICES_DEFAULTSPARSE_GLOBAL_BLOCK_END_INDICES'SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT SPARSE_NUM_SLIDING_WINDOW_BLOCKS(SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT	OPTIMIZEROPTIMIZER_TYPE_DEFAULTOPTIMIZER_PARAMSTYPELEGACY_FUSIONLEGACY_FUSION_DEFAULT	SCHEDULERSCHEDULER_TYPE_DEFAULTSCHEDULER_PARAMSMAX_GRAD_NORMZERO_ALLOW_UNTESTED_OPTIMIZER%ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULTZERO_FORCE_DS_CPU_OPTIMIZER#ZERO_FORCE_DS_CPU_OPTIMIZER_DEFAULTSTEPS_PER_PRINTSTEPS_PER_PRINT_DEFAULTTRAIN_MICRO_BATCH_SIZE_PER_GPU&TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULTGRADIENT_ACCUMULATION_FORMATGRADIENT_ACCUMULATION_STEPS#GRADIENT_ACCUMULATION_STEPS_DEFAULTSPARSE_GRADIENTSSPARSE_GRADIENTS_DEFAULTBFLOAT16_FORMATBFLOAT16BFLOAT16_OLDBFLOAT16_ENABLEDBFLOAT16_ENABLED_DEFAULTBFLOAT16_IMMEDIATE_GRAD_UPDATE&BFLOAT16_IMMEDIATE_GRAD_UPDATE_DEFAULTFP16_FORMATFP16FP16_ENABLEDFP16_ENABLED_DEFAULTFP16_LOSS_SCALEFP16_LOSS_SCALE_DEFAULTFP16_AUTO_CASTFP16_AUTO_CAST_DEFAULTFP16_INITIAL_SCALE_POWER FP16_INITIAL_SCALE_POWER_DEFAULTFP16_LOSS_SCALE_WINDOWFP16_LOSS_SCALE_WINDOW_DEFAULTFP16_HYSTERESISFP16_HYSTERESIS_DEFAULTFP16_CONSECUTIVE_HYSTERESIS#FP16_CONSECUTIVE_HYSTERESIS_DEFAULTFP16_MIN_LOSS_SCALEFP16_MIN_LOSS_SCALE_DEFAULTFP16_MASTER_WEIGHTS_AND_GRADS%FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT
AMP_FORMATAMPAMP_ENABLEDAMP_ENABLED_DEFAULTGRADIENT_CLIPPING_FORMATGRADIENT_CLIPPINGGRADIENT_CLIPPING_DEFAULTGRAPH_HARVESTING_FORMATGRAPH_HARVESTINGGRAPH_HARVESTING_DEFAULTCOMMUNICATION_DATA_TYPE_FORMATCOMMUNICATION_DATA_TYPECOMMUNICATION_DATA_TYPE_DEFAULT+SEQ_PARALLEL_COMMUNICATION_DATA_TYPE_FORMAT$SEQ_PARALLEL_COMMUNICATION_DATA_TYPE,SEQ_PARALLEL_COMMUNICATION_DATA_TYPE_DEFAULTPRESCALE_GRADIENTS_FORMATPRESCALE_GRADIENTSPRESCALE_GRADIENTS_DEFAULT GRADIENT_PREDIVIDE_FACTOR_FORMATGRADIENT_PREDIVIDE_FACTOR!GRADIENT_PREDIVIDE_FACTOR_DEFAULTDISABLE_ALLGATHER_FORMATDISABLE_ALLGATHERDISABLE_ALLGATHER_DEFAULTDUMP_STATE_FORMAT
DUMP_STATEDUMP_STATE_DEFAULTVOCABULARY_SIZE_FORMATVOCABULARY_SIZEVOCABULARY_SIZE_DEFAULTWALL_CLOCK_BREAKDOWN_FORMATWALL_CLOCK_BREAKDOWNWALL_CLOCK_BREAKDOWN_DEFAULTMEMORY_BREAKDOWNMEMORY_BREAKDOWN_DEFAULTEIGENVALUE_FORMAT
EIGENVALUEEIGENVALUE_ENABLEDEIGENVALUE_ENABLED_DEFAULTEIGENVALUE_VERBOSEEIGENVALUE_VERBOSE_DEFAULTEIGENVALUE_MAX_ITEREIGENVALUE_MAX_ITER_DEFAULTEIGENVALUE_TOLEIGENVALUE_TOL_DEFAULTEIGENVALUE_STABILITYEIGENVALUE_STABILITY_DEFAULT"EIGENVALUE_GAS_BOUNDARY_RESOLUTION*EIGENVALUE_GAS_BOUNDARY_RESOLUTION_DEFAULTEIGENVALUE_LAYER_NAMEEIGENVALUE_LAYER_NAME_DEFAULTEIGENVALUE_LAYER_NUMEIGENVALUE_LAYER_NUM_DEFAULTPROGRESSIVE_LAYER_DROPPLD_ENABLEDPLD_ENABLED_DEFAULT	PLD_THETAPLD_THETA_DEFAULT	PLD_GAMMAPLD_GAMMA_DEFAULTrR   
CHECKPOINTCHECKPOINT_TAG_VALIDATIONrS   !CHECKPOINT_TAG_VALIDATION_DEFAULTrT   rU   CHECKPOINT_TAG_VALIDATION_MODESLOAD_UNIVERSAL_CHECKPOINT!LOAD_UNIVERSAL_CHECKPOINT_DEFAULT!USE_NODE_LOCAL_STORAGE_CHECKPOINT)USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULTCHECKPOINT_PARALLEL_WRITE(CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE0CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULT
DATA_TYPESGRAD_ACCUM_DTYPEGRAD_ACCUM_DTYPE_DEFAULTDATALOADER_DROP_LAST_FORMATDATALOADER_DROP_LASTDATALOADER_DROP_LAST_DEFAULTPIPE_REPLICATEDDATA_PARALLEL_GROUPGLOBAL_RANKUSE_DATA_BEFORE_EXPERT_PARALLEL'USE_DATA_BEFORE_EXPERT_PARALLEL_DEFAULTrY   rY   rY   rZ   <module>   sZ  	

