o
    Ti!                     @   sH  d dl Z d dlmZ d dlmZ d dlZd dlZd dlZd dlZd dl	Z	ddl
T ddlmZmZmZmZmZ ddlmZmZmZ ddlmZmZ dd	lmZ d
dlmZ d
dlmZ d
dlmZ d
dl m!Z! d dl"m#Z$ d dl%m&Z& d
dl'm(Z) d
dl*m+Z+ d
dl,m-Z-m.Z.m/Z/ d
dl0m1Z1 d
dl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 d
dl:m;Z; d
dl<m=Z= d
dl>m?Z? d
dl@mAZAmBZB d
dlCT ddlDmEZE ddlFmGZG ddlHmIZImJZJmKZKmLZL ddlMT d
dlNmOZO dZPdZQd ZRd!ZSd"ZTd#ZUd$ZVd%ZWd&ZXd'ZYd(ZZd)Z[eQeReSeTeUeWeVeXeYeZe[gZ\d*Z]d+Z^d,Z_G d-d. d.e`ZaG d/d0 d0eZbd1d2 Zcd3d4 Zdd5d6 Zed7d8 Zfd9d: Zgd;d< Zhd=d> Zid?d@ ZjdAdB ZkdCdD ZldEdF ZmdGdH ZndIdJ ZodKdL ZpeqerfdMdNZsdOdP ZtdQdR ZudSdT ZvdUdV ZwdWdX ZxdYdZ Zyd[d\ Zzd]d^ Z{d_d` Z|dadb Z}dcdd Z~dedf Zdgdh Zdidj Zdkdl Zdmdn Zdodp Zdqdr Zdsdt Zdudv Zdwdx Zdydz Zd{d| Zd}d~ Zdd Zdd Zdd Zdd ZG dd de&Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z	 G dd dZG dd deZdS )    N)Union)Enum   )*)INITIAL_LOSS_SCALESCALE_WINDOWDELAYED_SHIFTCONSECUTIVE_HYSTERESISMIN_LOSS_SCALE)get_scalar_param"dict_raise_error_on_duplicate_keysScientificNotationEncoder)get_zero_configZeroStageEnum)&DeepSpeedActivationCheckpointingConfig   )DeepSpeedCommsConfig)get_monitor_config)WeightQuantConfig)CompileConfig)comm)DeepSpeedConfigModel)version)logger)elasticity_enabledcompute_elastic_configensure_immutable_elastic_config)ElasticityConfigError)
ELASTICITYIGNORE_NON_ELASTIC_BATCH_INFO%IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULTMODEL_PARALLEL_SIZEMODEL_PARALLEL_SIZE_DEFAULTNUM_GPUS_PER_NODENUM_GPUS_PER_NODE_DEFAULT)DeepSpeedFlopsProfilerConfig)DeepSpeedAutotuningConfig)DeepSpeedNebulaConfig)get_compression_configget_quantize_enabled)get_aio_config)get_tensor_parallel_config)get_data_efficiency_enabledget_data_efficiency_configget_curriculum_enabled_legacyget_curriculum_params_legacy)get_timers_config   adagradadamadamwlamb
onebitadamzerooneadam
onebitlambmuadammuadamwmusgdlion
torch_adamadam_w_modeTc                   @   s   e Zd ZdS )DeepSpeedConfigErrorN)__name__
__module____qualname__ rC   rC   L/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/config.pyr?   b   s    r?   c                   @   sV   e Zd ZejddddfZejddddfZejd	d
fZej	dddfZ
dd Zdd ZdS )	DtypeEnumztorch.float16fp16float16halfztorch.float32fp32float32floatz
torch.int8int8ztorch.bfloat16bf16bfloat16c                 G   s:   t | }|d |_|dd  D ]}|| j|< q||_|S )Nr   r   )object__new___value__value2member_map__all_values)clsvaluesobjother_valuerC   rC   rD   rP   p   s   

zDtypeEnum.__new__c                 C   s&   d| j j| jddd | jD f S )Nz<%s.%s: %s>, c                 S   s   g | ]}t |qS rC   )repr).0vrC   rC   rD   
<listcomp>}   s    z&DtypeEnum.__repr__.<locals>.<listcomp>)	__class__r@   _name_joinrS   selfrC   rC   rD   __repr__y   s
   zDtypeEnum.__repr__N)r@   rA   rB   torchrG   rF   rJ   rI   rL   rN   rM   rP   rb   rC   rC   rC   rD   rE   f   s    	rE   c                 C       t |  v rt| t  ttS dS NF)PROGRESSIVE_LAYER_DROPkeysr   PLD_ENABLEDPLD_ENABLED_DEFAULT
param_dictrC   rC   rD   get_pld_enabled      rl   c                 C   ,   t |  v rt| t  }|t |S dS re   )rf   rg   copypoprh   )rk   
pld_paramsrC   rC   rD   get_pld_params   
   
rr   c                 C   rd   re   )AMPrg   r   AMP_ENABLEDAMP_ENABLED_DEFAULTrj   rC   rC   rD   get_amp_enabled   rm   rw   c                 C   rn   re   )rt   rg   ro   rp   ru   )rk   
amp_paramsrC   rC   rD   get_amp_params   rs   ry   c                 C   rd   re   )FP16rg   r   FP16_ENABLEDFP16_ENABLED_DEFAULTrj   rC   rC   rD   get_fp16_enabled   rm   r}   c                 C   2   t tfD ]}||  v rt| | tt  S qdS re   )BFLOAT16BFLOAT16_OLDrg   r   BFLOAT16_ENABLEDBFLOAT16_ENABLED_DEFAULTrk   keyrC   rC   rD   get_bfloat16_enabled   s
   r   c                 C   r~   re   )r   r   rg   r   BFLOAT16_IMMEDIATE_GRAD_UPDATE&BFLOAT16_IMMEDIATE_GRAD_UPDATE_DEFAULTr   rC   rC   rD   "get_bfloat16_immediate_grad_update   s   
r   c                 C   s   t | rt| t ttS dS re   )r}   r   rz   FP16_MASTER_WEIGHTS_AND_GRADS%FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULTrj   rC   rC   rD   )get_fp16_master_weights_and_grads_enabled   s   r   c                 C   s   t | rt| t ttS d S N)r}   r   rz   FP16_AUTO_CASTFP16_AUTO_CAST_DEFAULTrj   rC   rC   rD   get_fp16_auto_cast   s   r   c                 C   s(   t | rt| t ttS t| rdS tS )Ng      ?)r}   r   rz   FP16_LOSS_SCALEFP16_LOSS_SCALE_DEFAULTr   rj   rC   rC   rD   get_loss_scale   s
   r   c                 C   s@   t | rt| t tt}d| S t| rd}d| S t}d| S )Nr   r   )r}   r   rz   FP16_INITIAL_SCALE_POWER FP16_INITIAL_SCALE_POWER_DEFAULTr   )rk   initial_scale_powerrC   rC   rD   get_initial_dynamic_scale   s   
r   c              
      s   d }t | rH| t  tttttg}t fdd|D rHt tt	}t tt
}t tt}t tt}t tt}td| t|t|t|t|i}|S )Nc                 3   s     | ]}|t   v V  qd S r   )listrg   )rZ   arg	fp16_dictrC   rD   	<genexpr>   s    z.get_dynamic_loss_scale_args.<locals>.<genexpr>r   )r}   rz   r   FP16_LOSS_SCALE_WINDOWFP16_MIN_LOSS_SCALEFP16_HYSTERESISFP16_CONSECUTIVE_HYSTERESISanyr   r   FP16_LOSS_SCALE_WINDOW_DEFAULTFP16_HYSTERESIS_DEFAULT#FP16_CONSECUTIVE_HYSTERESIS_DEFAULTFP16_MIN_LOSS_SCALE_DEFAULTr   r   r   r	   r
   )rk   loss_scale_argsdynamic_loss_args
init_scalescale_windowdelayed_shiftconsecutive_hysteresismin_loss_scalerC   r   rD   get_dynamic_loss_scale_args   s0   r   c                 C      t | ttS r   )r   GRADIENT_ACCUMULATION_STEPS#GRADIENT_ACCUMULATION_STEPS_DEFAULTrj   rC   rC   rD   get_gradient_accumulation_steps      r   c                 C   r   r   )r   SPARSE_GRADIENTSSPARSE_GRADIENTS_DEFAULTrj   rC   rC   rD   get_sparse_gradients_enabled   r   r   c                 C   sd   t | ||}|d ur| n|}|d u r|S |dkrtjS |dkr$tjS |dkr+tjS td| )NrI   rF   rM   zVInvalid communication_data_type. Supported data types: ['fp16', 'bf16', 'fp32']. Got: )r   lowerrc   rJ   rG   rN   
ValueError)rk   	comm_typecomm_data_type_defaultvalrC   rC   rD   get_communication_data_type   s   r   c                 C   r   r   )r   PRESCALE_GRADIENTSPRESCALE_GRADIENTS_DEFAULTrj   rC   rC   rD   get_prescale_gradients  r   r   c                 C   r   r   )r   GRADIENT_PREDIVIDE_FACTOR!GRADIENT_PREDIVIDE_FACTOR_DEFAULTrj   rC   rC   rD   get_gradient_predivide_factor  r   r   c                 C   r   r   )r   STEPS_PER_PRINTSTEPS_PER_PRINT_DEFAULTrj   rC   rC   rD   get_steps_per_print  r   r   c                 C   r   r   )r   DISABLE_ALLGATHERDISABLE_ALLGATHER_DEFAULTrj   rC   rC   rD   get_disable_allgather  r   r   c                 C   r   r   )r   
DUMP_STATEDUMP_STATE_DEFAULTrj   rC   rC   rD   get_dump_state  r   r   c                 C   r   r   )r   GRADIENT_CLIPPINGGRADIENT_CLIPPING_DEFAULTrj   rC   rC   rD   get_gradient_clipping"  r   r   c                 C   r   r   )r   GRAPH_HARVESTINGGRAPH_HARVESTING_DEFAULTrj   rC   rC   rD   get_graph_harvesting&  r   r   c                 C   s   t |  v r>| t  }t|}|tkrt|S |tkrt|S |tkr&t|S |t	kr.t
|S |tkr6t|S td| dd S )NzGiven sparsity mode, z, has not been implemented yet!)SPARSE_ATTENTIONrg   get_sparse_attention_modeSPARSE_DENSE_MODEget_sparse_dense_configSPARSE_FIXED_MODEget_sparse_fixed_configSPARSE_VARIABLE_MODEget_sparse_variable_configSPARSE_BIGBIRD_MODEget_sparse_bigbird_configSPARSE_BSLONGFORMER_MODEget_sparse_bslongformer_configNotImplementedError)rk   sparsitymoderC   rC   rD   get_sparse_attention*  s   r   c                 C   s   t | tt}ttt|iS r   )r   SPARSE_BLOCKSPARSE_BLOCK_DEFAULTSPARSE_MODEr   )r   blockrC   rC   rD   r   @  s   r   c                 C   sx   t | tt}t | tt}t | tt}t | tt}t | t	t
}t | tt}t | tt}ttt|t|t|t|t	|t|t|iS r   )r   r   r    SPARSE_DIFFERENT_LAYOUT_PER_HEAD(SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULTSPARSE_NUM_LOCAL_BLOCKSSPARSE_NUM_LOCAL_BLOCKS_DEFAULTSPARSE_NUM_GLOBAL_BLOCKS SPARSE_NUM_GLOBAL_BLOCKS_DEFAULTSPARSE_ATTENTION_TYPESPARSE_ATTENTION_TYPE_DEFAULT"SPARSE_HORIZONTAL_GLOBAL_ATTENTION*SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT$SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS,SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULTr   r   )r   r   different_layout_per_headnum_local_blocksnum_global_blocks	attentionhorizontal_global_attentionnum_different_global_patternsrC   rC   rD   r   E  s8   r   c           	      C   s   t | tt}t | tt}t | tt}t | tt}t | t	t
}t | tt}t | tt}t | tt}ttt|t|t|t|t	|t|t|t|i	S r   )r   r   r   r   r   SPARSE_NUM_RANDOM_BLOCKS SPARSE_NUM_RANDOM_BLOCKS_DEFAULTSPARSE_LOCAL_WINDOW_BLOCKS"SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULTSPARSE_GLOBAL_BLOCK_INDICES#SPARSE_GLOBAL_BLOCK_INDICES_DEFAULTSPARSE_GLOBAL_BLOCK_END_INDICES'SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULTr   r   r   r   r   r   )	r   r   r   num_random_blockslocal_window_blocksglobal_block_indicesglobal_block_end_indicesr   r   rC   rC   rD   r   f  s<   r   c                 C   X   t | tt}t | tt}t | tt}t | tt}t | t	t
}ttt|t|t|t|t	|iS r   )r   r   r   r   r   r   r    SPARSE_NUM_SLIDING_WINDOW_BLOCKS(SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULTr   r   r   r   )r   r   r   r   num_sliding_window_blocksr   rC   rC   rD   r     s(   r   c                 C   r  r   )r   r   r   r   r   r  r  r   r   r   r   r   r   )r   r   r   r  r   r  rC   rC   rD   r     s0   r   c                 C      t |  v r
| t  S tS r   )r   rg   SPARSE_MODE_DEFAULTrj   rC   rC   rD   r        r   c                 C   r  r   )r   rg   r   rj   rC   rC   rD   get_sparse_attention_type  r  r	  c                 C   s<   ddddddd}|}|  di  D ]\}}|||< q|S )z&Parses pipeline engine configuration. autobestFr   T)stages	partitionseed_layersactivation_checkpoint_intervalpipe_partitionedgrad_partitionedpipeline)getitems)rk   default_pipelineconfigr   r   rC   rC   rD   get_pipeline_config  s   
r  c                 C   ,   t |  v rt| t   v r| t  t S tS r   )	OPTIMIZERrg   TYPEOPTIMIZER_TYPE_DEFAULTrj   rC   rC   rD   get_optimizer_name     r  c                 C   ,   t | d urt| t  v r| t t S d S r   )r  OPTIMIZER_PARAMSr  rg   rj   rC   rC   rD   get_optimizer_params  r  r   c                 C   s(   t | }|d urt| v r|t S d S r   )r   MAX_GRAD_NORMrg   )rk   optimizer_paramsrC   rC   rD   get_optimizer_gradient_clipping  s   r#  c                 C   r  r   )r  rg   LEGACY_FUSIONLEGACY_FUSION_DEFAULTrj   rC   rC   rD   get_optimizer_legacy_fusion  r  r&  c                 C   r   r   )r   ZERO_ALLOW_UNTESTED_OPTIMIZER%ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULTrj   rC   rC   rD   !get_zero_allow_untested_optimizer  r   r)  c                 C   r   r   )r   ZERO_FORCE_DS_CPU_OPTIMIZER#ZERO_FORCE_DS_CPU_OPTIMIZER_DEFAULTrj   rC   rC   rD   get_zero_force_ds_cpu_optimizer  r   r,  c                 C   r  r   )	SCHEDULERrg   r  SCHEDULER_TYPE_DEFAULTrj   rC   rC   rD   get_scheduler_name  r  r/  c                 C   r  r   )r/  SCHEDULER_PARAMSr-  rg   rj   rC   rC   rD   get_scheduler_params	  r  r1  c                 C   r   r   )r   TRAIN_BATCH_SIZETRAIN_BATCH_SIZE_DEFAULTrj   rC   rC   rD   get_train_batch_size  r   r4  c                 C   r   r   )r   TRAIN_MICRO_BATCH_SIZE_PER_GPU&TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULTrj   rC   rC   rD   "get_train_micro_batch_size_per_gpu  s
   r7  c                 C   r   r   )r   WALL_CLOCK_BREAKDOWNWALL_CLOCK_BREAKDOWN_DEFAULTrj   rC   rC   rD   get_wall_clock_breakdown  r   r:  c                 C   r   r   )r   MEMORY_BREAKDOWNMEMORY_BREAKDOWN_DEFAULTrj   rC   rC   rD   get_memory_breakdown   r   r=  c                   @   sV   e Zd ZU dZeed< dZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dS )HybridEngineConfigFenabledi   max_out_tokensr   inference_tp_sizerelease_inference_cacheTpin_parametersr1   tp_gather_partition_sizeN)r@   rA   rB   r?  bool__annotations__r@  intrA  rB  rC  rD  rC   rC   rC   rD   r>  $  s   
 r>  c                 C   s   |  di }tdi |}|S )Nhybrid_enginerC   )r  r>  )rk   hybrid_engine_config_dicthybrid_engine_configrC   rC   rD   get_hybrid_engine_config-  s   rK  c                 C   r   r   )r   USE_DATA_BEFORE_EXPERT_PARALLEL'USE_DATA_BEFORE_EXPERT_PARALLEL_DEFAULTrj   rC   rC   rD   get_expert_data_topo_config3  r   rN  c              	   C   sh   t | r*| t } t| rJ dt| t| t| t| t| t| t| t	| fS t
tttttttfS )Nz,Eigenvalue based MoQ is temporarily disabled)r)   QUANTIZE_TRAININGget_eigenvalue_enabledget_eigenvalue_verboseget_eigenvalue_max_iterget_eigenvalue_tolget_eigenvalue_stability&get_eigenvalue_gas_boundary_resolutionget_eigenvalue_layer_nameget_eigenvalue_layer_numEIGENVALUE_ENABLED_DEFAULTEIGENVALUE_VERBOSE_DEFAULTEIGENVALUE_MAX_ITER_DEFAULTEIGENVALUE_TOL_DEFAULTEIGENVALUE_STABILITY_DEFAULT*EIGENVALUE_GAS_BOUNDARY_RESOLUTION_DEFAULTEIGENVALUE_LAYER_NAME_DEFAULTEIGENVALUE_LAYER_NUM_DEFAULTrj   rC   rC   rD   get_eigenvalue_config7  s*   r`  c                 C       t |  v rt| t  ttS tS r   )
EIGENVALUErg   r   EIGENVALUE_ENABLEDrX  rj   rC   rC   rD   rP  R  rm   rP  c                 C   ra  r   )rb  rg   r   EIGENVALUE_VERBOSErY  rj   rC   rC   rD   rQ  Y  rm   rQ  c                 C   ra  r   )rb  rg   r   EIGENVALUE_MAX_ITERrZ  rj   rC   rC   rD   rR  `  rm   rR  c                 C   ra  r   )rb  rg   r   EIGENVALUE_TOLr[  rj   rC   rC   rD   rS  g  rm   rS  c                 C   ra  r   )rb  rg   r   EIGENVALUE_STABILITYr\  rj   rC   rC   rD   rT  n  rm   rT  c                 C   ra  r   )rb  rg   r   "EIGENVALUE_GAS_BOUNDARY_RESOLUTIONr]  rj   rC   rC   rD   rU  u  s   rU  c                 C   ra  r   )rb  rg   r   EIGENVALUE_LAYER_NAMEr^  rj   rC   rC   rD   rV    rm   rV  c                 C   ra  r   )rb  rg   r   EIGENVALUE_LAYER_NUMr_  rj   rC   rC   rD   rW    rm   rW  c                 C      |  ti S r   )r  
CHECKPOINTrj   rC   rC   rD   get_checkpoint_params  r   rm  c                 C   rk  r   )r  
DATA_TYPESrj   rC   rC   rD   get_data_types_params  r   ro  c                 C   s4   |  tt}| }|tv r|S td| dt )Nz;Checkpoint config contains invalid tag_validation value of z, expecting one of )r  CHECKPOINT_TAG_VALIDATION!CHECKPOINT_TAG_VALIDATION_DEFAULTupperCHECKPOINT_TAG_VALIDATION_MODESr?   )checkpoint_paramstag_validation_moderC   rC   rD   "get_checkpoint_tag_validation_mode  s   rv  c                 C   s4   |  ti }| tt}|dv r|S td| d)N)TFz5checkpoint::parallel_write::pipeline_stage value of 'z&' is invalid, expecting: true or false)r  CHECKPOINT_PARALLEL_WRITE(CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE0CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULTr?   )rt  par_write_paramspar_write_pipelinerC   rC   rD   &get_checkpoint_parallel_write_pipeline  s   
r|  c                 C   r   r   )r   DATALOADER_DROP_LASTDATALOADER_DROP_LAST_DEFAULTrj   rC   rC   rD   get_dataloader_drop_last  r   r  c                   @   s.   e Zd Zd
ddZdd Zdd Zdd	 ZdS )DeepSpeedConfigWriterNc                 C   s   |d ur	|| _ d S i | _ d S r   data)ra   r  rC   rC   rD   __init__  s   zDeepSpeedConfigWriter.__init__c                 C   s   || j |< d S r   r  )ra   r   valuerC   rC   rD   
add_config  s   z DeepSpeedConfigWriter.add_configc                 C   s   t jt|dtd| _d S )Nrobject_pairs_hook)jsonloadopenr   r  )ra   filenamerC   rC   rD   load_config  s   z!DeepSpeedConfigWriter.load_configc                 C   s>   t |d}t| j| W d    d S 1 sw   Y  d S )Nw)r  r  dumpr  )ra   r  outfilerC   rC   rD   write_config  s   "z"DeepSpeedConfigWriter.write_configr   )r@   rA   rB   r  r  r  r  rC   rC   rC   rD   r    s
    
r  c                       st   e Zd Zddeeef f fddZdd Zdd Zd	d
 Z	dd Z
dd Zdd Zdd Zdd Zdd Z  ZS )DeepSpeedConfigNr  c              
      s  t t   t|tr| _n3tj|r"t	j
t|dtd _n!zt|d}t	| _W n ttfyB   td| w z%t  _|d urS|  _n|d urbt|jdd _nt  _W n
   d _d _Y td	| d
 j  t j _ jrOtd t jt jd\}}} jt  }t!|d |"t#t$ _% j%dk rt&d j% |"t't( _) j)dk rt&d j) |"t*t+}	|	st,t-t.g}
t/t0 fdd|
rt&dt, dt- dt. dt* d	|| j  }t, jv rt1d jt,  d|  t- jv r$t1d jt-  d|  t. jv r8t1d jt.  d|  td|  | jt,< | jt-< | jt.<  2t33 j  4   5  d S )Nr  r  zutf-8zeExpected a string path to an existing deepspeed config, or a dictionary or a valid base64. Received: data_parallel)mesh_dimr   r   zConfig mesh_device z world_size = z$DeepSpeed elasticity support enabled)	ds_configtarget_deepspeed_version
world_size)runtime_elastic_config_dictzFModel-Parallel size cannot be less than 1, given model-parallel size: zNNUmber of GPUs per node cannot be less than 1, given number of GPUs per node: c                    s
   |  j v S r   )_param_dict)tr`   rC   rD   <lambda>  s   
 z*DeepSpeedConfig.__init__.<locals>.<lambda>zCOne or more batch related parameters were found in your ds_config (rX   z	, and/or z). These parameters *will not be used* since elastic training is enabled, which takes control of these parameters. If you want to suppress this error (the parameters will be silently ignored) please set z!':true in your elasticity config.z-[Elasticity] overriding training_batch_size: z -> z8[Elasticity] overriding train_micro_batch_size_per_gpu: z5[Elasticity] overriding gradient_accumulation_steps: z[Elasticity] valid GPU counts: )6superr  r  
isinstancedictr  ospathexistshjsonr  r  r   base64urlsafe_b64decodedecodeloadsUnicodeDecodeErrorAttributeErrorr   distget_rankglobal_rankget_data_parallel_world_sizer  get_world_size	get_groupr   infor   r   __version__r   r   r  r!   r"   elastic_model_parallel_sizer   r#   r$   num_gpus_per_noder   r    r2  r5  r   r   mapwarning_initialize_paramsro   _configure_train_batch_size_do_sanity_check)ra   r  mpumesh_deviceconfig_decodedfinal_batch_size
valid_gpusmicro_batch_sizeelastic_dictignore_non_elastic_batch_infobatch_paramsgradient_accu_stepsr]   r`   rD   r    s   








	


zDeepSpeedConfig.__init__c              	   C   sX  t || _t|| _t|| _t|| _t|| _	t
|| _t|| _t|tt| _t|| _t|| _t|| _t|| _| jj| _| jj| _| jj| _| jdk| _t|| _ t!|| _"t#|| _$t%|| _&t'|| _(t)|| _*t+|| _,t-|| _.| j(r| j,rJ dt/|| _0t1|| _2t3|| _4t5|| _6t7|| _8t9|| _:t;|| _<t=|| _>t?|| _@| j@d ur| j@A tBv r| j@A | _@tC|| _DtE|| _FtG|| _HtI|| _JtK|| _LtM|| _NtO|| _PtQ|| jPjRB | _StT|| _UtV|| _WtX|\| _Y| _Z| _[| _\| _]| _^| __| _`ta|| _btc|| _dte|| _ftg|| _hti|| _jtk|| _ltm|| _nto|| _ptq|| _rts|| _ttu|}tv|}|twjxk| _y|twjzk| _{||t}t~| _||tt| _t|}||tt| _t|}|| _t|| _t|| _t|| _d|v rtdi |d nd | _tdi ||di | _t|| _t|| _d S )Nr   z8bfloat16 and fp16 modes cannot be simultaneously enabledweight_quantizationcompilerC   )r4  train_batch_sizer7  train_micro_batch_size_per_gpur   gradient_accumulation_stepsr   steps_per_printr   
dump_stater   disable_allgatherr   communication_data_type$SEQ_PARALLEL_COMMUNICATION_DATA_TYPE,SEQ_PARALLEL_COMMUNICATION_DATA_TYPE_DEFAULT$seq_parallel_communication_data_typer   prescale_gradientsr   gradient_predivide_factorr   sparse_gradients_enabledr   zero_configmics_shard_sizemics_hierarchical_params_gathermics_hierarchial_params_gatherstagezero_optimization_stagezero_enabledr   activation_checkpointing_configr   comms_configr   monitor_configr   gradient_clippingr}   fp16_enabledr   fp16_auto_castr   bfloat16_enabledr   bfloat16_immediate_grad_updater   !fp16_master_weights_and_gradientsrw   amp_enabledry   rx   r   
loss_scaler   initial_dynamic_scaler   dynamic_loss_scale_argsr(   compression_configr   graph_harvestingr  optimizer_namer   DEEPSPEED_OPTIMIZERSr   r"  r&  optimizer_legacy_fusionr)  zero_allow_untested_optimizerr,  zero_force_ds_cpu_optimizerr/  scheduler_namer1  scheduler_paramsr%   flops_profiler_configr:  r?  wall_clock_breakdownr=  memory_breakdownr&   autotuning_configr`  eigenvalue_enabledeigenvalue_verboseeigenvalue_max_itereigenvalue_toleigenvalue_stability"eigenvalue_gas_boundary_resolutioneigenvalue_layer_nameeigenvalue_layer_numrN   use_data_before_expert_parallel_rK  rH  r   sparse_attentionr  r  rl   pld_enabledrr   rq   r.   curriculum_enabled_legacyr/   curriculum_params_legacyr,   data_efficiency_enabledr-   data_efficiency_configrm  rv  ValidationModeIGNORE!checkpoint_tag_validation_enabledFAILcheckpoint_tag_validation_failr  LOAD_UNIVERSAL_CHECKPOINT!LOAD_UNIVERSAL_CHECKPOINT_DEFAULTload_universal_checkpoint!USE_NODE_LOCAL_STORAGE_CHECKPOINT)USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULTuse_node_local_storagero  GRAD_ACCUM_DTYPEGRAD_ACCUM_DTYPE_DEFAULTgrad_accum_dtyper|  "checkpoint_parallel_write_pipeliner*   
aio_configr  dataloader_drop_lastr'   nebula_configr   weight_quantization_configr   compile_configr0   timers_configr+   tensor_parallel_config)ra   rk   rt  validation_modedata_types_paramspar_write_piperC   rC   rD   r  #  s   






















































z"DeepSpeedConfig._initialize_paramsc              	   C   s   | j }| j}| j}|dksJ d| d|dks!J d| d|dks-J d| d||| | j ksGJ d| d| d| d| j d S )	Nr   zTrain batch size: z has to be greater than 0zMicro batch size per gpu: zGradient accumulation steps: zvCheck batch related parameters. train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size z != z * r  r  r  r  )ra   train_batchmicro_batchgrad_accrC   rC   rD   _batch_assertion  s"   z DeepSpeedConfig._batch_assertionc                 C   s   | j }| j}| j}|d ur|d ur|d urd S |d ur-|d ur-|| }|| j }|| _d S |d urC|d urC|| j }|| }|| _d S |d urY|d urY|| }|| j9 }|| _ d S |d urhd| _|| j | _d S |d urw|| j | _ d| _d S J d)Nr   FzNEither train_batch_size or train_micro_batch_size_per_gpu needs to be providedr  )ra   r  r  r  r  rC   rC   rD   _set_batch_related_parameters  s4   






z-DeepSpeedConfig._set_batch_related_parametersc                 C      |    |   d S r   )r   r  r`   rC   rC   rD   r    s   z+DeepSpeedConfig._configure_train_batch_sizec                 C   r!  r   )_do_error_check_do_warning_checkr`   rC   rC   rD   r    s   z DeepSpeedConfig._do_sanity_checkc                 C   s&   t dtj| jddtdd d S )Nz  json = {}T   ),:)	sort_keysindentrT   
separators)r   r  formatr  dumpsr  r   r`   rC   rC   rD   print_user_config  s   
z!DeepSpeedConfig.print_user_configc              
   C   s`   t d| tt| D ]}|dkr)ddt|  }t d||t| | q|   d S )Nz{}:r  .   z
  {} {} {})r   r  r*  sortedvarslengetattrr,  )ra   namer   dotsrC   rC   rD   print  s   zDeepSpeedConfig.printc                 C   sr   | j s
J dt| jsJ dt| jr%| jtjks%J dtj| j	r5| jr1| jtj
ks7J dd S d S )Nz"DeepSpeedConfig: {} is not definedz3DeepSpeedConfig: Maximum supported ZeRO stage is {}zJFp16_master_weights_and_grads is only supported with ZeRO Stage 2 for now.)r  r*  r5  r  r   r  r  r   	max_stager  	gradientsr`   rC   rC   rD   r"    s&   zDeepSpeedConfig._do_error_checkc                 C   s   | j }| jtt}|r|t dkrtd|t | j	d ur\t
| j	 v r^| j	t
 dkr`|rE| jdkrCtdt
| j	t
  d S d S | jdkrUtd| j	t
  d| j	t
< d S d S d S d S )Nr   z]DeepSpeedConfig: vocabulary size {} is not aligned to {}, may import tensor core utilization.zHDeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapperz`DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zerog        )r  r  r  VOCABULARY_SIZEVOCABULARY_SIZE_DEFAULTTENSOR_CORE_ALIGN_SIZEr   r  r*  r"  r!  rg   r  )ra   r  vocabulary_sizerC   rC   rD   r#    s0   



z!DeepSpeedConfig._do_warning_check)NN)r@   rA   rB   r   strr  r  r  r  r   r  r  r,  r5  r"  r#  __classcell__rC   rC   r  rD   r    s    ]v-
	r  )r  typingr   enumr   rc   r  r  ro   r  	constantsfp16.loss_scalerr   r   r   r	   r
   config_utilsr   r   r   zero.configr   r   activation_checkpointing.configr   comm.configr   monitor.configr   inference.configr   compile.configr   	deepspeedr   r  deepspeed.runtime.config_utilsr   git_version_infor   r  utilsr   
elasticityr   r   r   elasticity.configr   elasticity.constantsr   r   r    r!   r"   r#   r$   profiling.configr%   autotuning.configr&   nebula.configr'   compression.configr(   r)   compression.constantsswap_tensor.aio_configr*   tensor_parallelr+   data_pipeline.configr,   r-   r.   r/   data_pipeline.constantsutils.configr0   r:  ADAGRAD_OPTIMIZERADAM_OPTIMIZERADAMW_OPTIMIZERLAMB_OPTIMIZERONEBIT_ADAM_OPTIMIZERZERO_ONE_ADAM_OPTIMIZERONEBIT_LAMB_OPTIMIZERMUADAM_OPTIMIZERMUADAMW_OPTIMIZERMUSGD_OPTIMIZERLION_OPTIMIZERr  TORCH_ADAM_PARAMADAM_W_MODEADAM_W_MODE_DEFAULT	Exceptionr?   rE   rl   rr   rw   ry   r}   r   r   r   r   r   r   r   r   r   COMMUNICATION_DATA_TYPECOMMUNICATION_DATA_TYPE_DEFAULTr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	  r  r  r   r#  r&  r)  r,  r/  r1  r4  r7  r:  r=  r>  rK  rN  r`  rP  rQ  rR  rS  rT  rU  rV  rW  rm  ro  rv  r|  r  r  rO   r  rC   rC   rC   rD   <module>   s   $

			
!#	