o
    Ti                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZ d dl	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZmZmZ d dlZd dlm Z! d dl"m#Z#m$Z$ ddl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z<m=Z= d dl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZM d dlNmOZO d dlPmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[ d dl1m\Z\ d dl]m^Z^ d dl_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZj d dlkmlZlmmZm d d lnmoZo d d!lpmqZq d d"lrmsZs d d#lrmtZtmuZumvZv d d$lwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZ d d%lmZmZ d d&lmZ d d'lmZ d d(l"mZmZ d d)lmZ d d*lmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ d d+lmZ d d,lmZ d d-lmZ d d.lmZ d d/lmZ d d0lmZ dd1lmZ dd2lmZ dd3lmZ d4d5lmZ d4d6lmZmZ d4d7lmZ d4d8lmZmZ d4d9lmZ d d:lmZ d d;lmZmZ d d<lmZ d d=l>mZ d d>lmZmZmZ d d?lmZmZ d d@lmZmZmZmZ d dAlmZ d dBlmZ dCZeeee eeef f gef Zeegef Zzd dlZd dDlmZ dEZW n eݐyt   dFZY nw dGdH ZG dIdJ dJe߃ZG dKdL dLeZdS )M    N)defaultdictOrderedDictdeque)copyfile)Module)	Parameter)	Optimizer)_LRScheduler)_flatten_dense_tensors_unflatten_dense_tensors)contextmanager)CallableDictUnionIterable	Container)comm)see_memory_usage
DummyOptim   )OffloadDeviceEnumOffloadStateTypeEnum)DeepSpeedZeroOptimizer)ZeroParamStatus)is_zero_supported_optimizerZeRORuntimeException)DeepSpeedZeRoOffload)ZERO_OPTIMIZATION)FP16_Optimizer)FP16_UnfusedOptimizer)BF16_Optimizer)LoRAOptimizedLinear)GatherReplacedLayerParams!configure_tensor_parallel_runtime)DEEPSPEED_OPTIMIZERSADAGRAD_OPTIMIZERADAM_OPTIMIZERADAMW_OPTIMIZERLAMB_OPTIMIZERONEBIT_ADAM_OPTIMIZERONEBIT_LAMB_OPTIMIZERTORCH_ADAM_PARAMADAM_W_MODEADAM_W_MODE_DEFAULTZERO_ONE_ADAM_OPTIMIZERMUADAM_OPTIMIZERMUADAMW_OPTIMIZERMUSGD_OPTIMIZERLION_OPTIMIZER)DeepSpeedDataLoader)ROUTE_TRAINROUTE_PREDICT
ROUTE_EVAL	PLD_THETA	PLD_GAMMABFLOAT16FP16AMPGRADIENT_ACCUMULATION_STEPSDATA_PARALLEL_GROUPGLOBAL_RANK)ZeroStageEnum)compression_scheduler)"WEIGHT_QUANTIZE_IN_FORWARD_ENABLEDWEIGHT_QUANTIZATIONSHARED_PARAMETERSWEIGHT_QUANTIZE_ENABLEDWEIGHT_QUANTIZE_GROUPS#WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZEWEIGHT_QUANTIZE_CHANGE_RATIOWEIGHT_QUANTIZE_TYPEWEIGHT_QUANTIZE_ROUNDINGWEIGHT_QUANTIZE_VERBOSEWEIGHT_QUANTIZE_KERNEL)OPTIMIZER_STATE_DICTFROZEN_PARAM_FRAGMENTS)SparseTensor)lr_schedules)groups)loggerlog_distinstrument_w_nvtx)	NoopTimerThroughputTimerSynchronizedWallClockTimerFORWARD_MICRO_TIMERBACKWARD_MICRO_TIMERBACKWARD_INNER_MICRO_TIMERBACKWARD_REDUCE_MICRO_TIMERSTEP_MICRO_TIMERFORWARD_GLOBAL_TIMERBACKWARD_GLOBAL_TIMERBACKWARD_INNER_GLOBAL_TIMERBACKWARD_REDUCE_GLOBAL_TIMERSTEP_GLOBAL_TIMER)$debug_extract_module_and_param_names"debug_clear_module_and_param_names)MonitorMaster)ProgressiveLayerDrop)clip_grad_norm_compare_tensors_in_structures)
Eigenvalue)DATA_SAMPLINGDATA_ROUTINGDATA_SAMPLING_ENABLEDCURRICULUM_LEARNINGCURRICULUM_LEARNING_ENABLEDDATA_SAMPLING_NUM_WORKERS
RANDOM_LTDRANDOM_LTD_ENABLEDRANDOM_LTD_LAYER_IDRANDOM_LTD_LAYER_NUM"RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE!RANDOM_LTD_LAYER_TOKEN_LR_ENABLEDRANDOM_LTD_GLOBAL_BATCH_SIZERANDOM_LTD_MICRO_BATCH_SIZEDATA_EFFICIENCY)CurriculumSchedulerRandomLTDScheduler)remove_random_ltd_state_dict)RandomLayerTokenDrop)TorchCheckpointEngine)(get_fp32_state_dict_from_zero_checkpoint)PipelineModule)get_ma_status)is_compile_supported   	FusedAdam)TopKGateMOELayer)MoE)is_moe_paramconfigure_moe_param_groups)version)FlopsProfiler)print_json_distprint_configuration)get_accelerator)	DtypeEnum)is_deepcompile_supportedget_deepcompile_handledeepcompile_backward_prologue)register_compile_pass
opt_passes)zero3_compileprefetchselective_gatheroffload_adam_states)init_z1)init_z3 e)ampTFc           
      C   s   t   }t   }| D ]}|j|v sJ d|j qg g }}t|D ]5\}}g g }}	| D ]}|j|krGt|trB|| q0|	| q0|rQ|||f |	rZ|||	f q%||fS )Nz/attempting to reduce an unsupported grad type: )r   device_namesupported_dtypesdtype	enumerate
isinstancerN   append)
tensorsdevice_typesupported_typestsparse_tensor_bucketsdense_tensor_bucketsir   sparse_bucketdense_bucket r   L/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/engine.pysplit_half_float_double_sparse   s&   






r   c                   @   s   e Zd ZdZdd ZdS )EngineTimersz$Wallclock timers for DeepSpeedEnginec                 C   s
  g | _ g | _g | _g | _g | _g | _g | _|rK|  j tg7  _ |  jtg7  _|  jt	g7  _|  jt
g7  _|  jtg7  _|  jttt	t
tg7  _|r|  j tg7  _ |  jtg7  _|  jtg7  _|  jtg7  _|  jtg7  _|  jtttttg7  _d S d S N)forward_timersbackward_timersbackward_inner_timersbackward_reduce_timersstep_timersglobal_timersmicro_timersrW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   )selfenable_micro_timersenable_global_timersr   r   r   __init__   s8   zEngineTimers.__init__N)__name__
__module____qualname____doc__r   r   r   r   r   r      s    r   c                       s  e Zd ZdZ											d fdd	Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd ZdefddZdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Zd7d8 Zd9d: Zd;d< Z d=d> Z!d?d@ Z"dAdB Z#dCdD Z$dEdF Z%dGdH Z&dIdJ Z'dKdL Z(dMdN Z)dOdP Z*dQdR Z+dSdT Z,dUdV Z-dWdX Z.dYdZ Z/d[d\ Z0d]d^ Z1d_d` Z2dadb Z3dcdd Z4dedf Z5dgdh Z6didj Z7dkdl Z8dmdn Z9dodp Z:dqdr Z;dsdt Z<dudv Z=dwdx Z>dydz Z?d{d| Z@d}d~ ZAdd ZBdd ZCdd ZDdd ZEdd ZFdd ZGdd ZHdd ZIdd ZJdd ZKdd ZLdd ZMdd ZNdd ZOdd ZPdd ZQdd ZRdd ZSdd ZTdd ZUdd ZVdd ZWdd ZXdd ZYdd ZZdd Z[dd Z\dd Z]dd Z^dd Z_dd Z`dd Zadd Zbdd ZcddĄ ZdddƄ ZeddȄ Zfddʄ Zgdd̄ Zhdd΄ ZiddЄ Zjdd҄ ZkddԄ Zlddք Zmdd؄ Znddڄ Zodd܄ Zpddބ Zqdd Zrdd Zsdd Zteudd Zvevjwdd Zvdd Zxdd Zydd Zzdd Z{dd Z|dd Z}dd Z~dd Zdd Zdd Zdd Zdd Zd d Zdd Zdd Zdd Zdd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zed&ed'ejddfd(d)Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Zd<d= Zd>d? Zd@dA ZdBdC ZdDdE ZdFdG ZdHdI ZedJdK ZedLdM ZdNdO ZdefdPdQZdedRdddfdSdTZddUdVZdWdX ZddYdZZd[d\ Zd]d^ Zd_d` Zdadb Zedcdd Zdedf Zdgdh ZeefdidjZddkdlZdmdn Zdodp ZeÐdqdr ZeddsdtZŐdudv ZƐdwdx Zǐdydz ZȐd{d| Zi fd}d~ZʐdddZːdd Z̐dd Z͐dd Zΐdd Zϐdd ZАdd Zѐdd ZҐdd ZӐdd ZԐdd ZՐdddZ֐dddZאdddZؐdd Zِdd Zڐdd ZېdddZܐdddZݐdddZސdddZߐdd Zdd ZdddZeddde fddZdddZdd Zdd Zdd ZdddZdd ZedddZddÄ Z		R	R	R		ddĐdńZ	R	R	R		ddƐdǄZddȐdɄZdʐd˄ Zd̐d̈́ Zdΐdτ ZdАdф ZdҐdӄ Zdi dRdfdԐdՄZd֐dׄ Zi dfdؐdلZdڐdۄ Zdܐd݄ Zi dfdސd߄Zdd Zdd Zdd Zdd Zdd Zdd Z dd Zdd Zdd Zdd ZdddZdddZdddZdddZdd Z	e
  i dfd dd Zdd ZdededdfddZdd Zeudefd	d
ZdejdRdfdee dedededdf
ddZddeddfddZ  ZS (!  DeepSpeedEnginezDeepSpeed engine for training.NFc                    s  t t|   || _|| _|| _|| _|	| _|| _d | _	d | _
d| _d| _d| _d| _d| _d| _|
| _|| _d | _d | _d| _d| _d | _d | _d | _d| _t  | _d| _g | _g | _ g | _!d| _"d | _#d| _$d | _%d | _&d | _'d | _(|| _)t*| | j)r| j)t+_)| ,| | -|| | .  | / dkr| 0|| 1  t2d| 3 d |d ur| 4 r| 5 s| 4 rJ d| 6| t78| j t9| jj:| _;t2d| 3 d t<|t=| _>| ?| | @ s| A | _B| C | _Dd	d
 |E D | _F| G  t2d tH | _ItJ| jjK| L | M dd| _NtOd| P  dgd | P r0tQ| jR| | S | _T|r:| U|| _Vnd | _Vd | _Wd | _Xd | _Yd}|sP| Z rRd}|d u r\| jR[ }t<|t\sft\|}|ry| ]|| | ^  | _d n| ` r| jad d| _Wn| b r| jcd d| _Wtd|drddlemf} |jg| jW_g|| jW th | _i| jRj D ]&\}}t<|tkjljmtkjljnfr| o r| jip|d  tqrds| q| t  d| _ud| _vt<| jWtws| x| | y r| z | _| { r| | | _| } r| ~ | _|  r'|  }| L |t< |  |t< | || _t|  |  p3| P d| _| jdkrM| jd |  rMt| d t| _t| _d| _t r|| tjtj | tjtj | tjtj | tjtj d S d S )Nr   TFr   z(DeepSpeed Engine: After args sanity testforcez=Elasticity is not currently supported with model parallelism.z4DeepSpeed Engine: Before configure distributed modelc                 S   s   i | ]\}}||qS r   r   ).0nameparamr   r   r   
<dictcomp>      z,DeepSpeedEngine.__init__.<locals>.<dictcomp>z3DeepSpeed Engine: After configure distributed model)
batch_sizesteps_per_outputmonitor_memoryz"DeepSpeed Flops Profiler Enabled: ranks)	optimizerprunersr   )rewrite_optimizer_stepz.weightz0Will convert {} to sparse tensor during training)r   r   zDeepSpeedEngine configurationr   )superr   r   dont_change_deviceclient_optimizerclient_lr_schedulertraining_data
collate_fnmpuall_to_all_groupdata_parallel_groupglobal_stepsglobal_samplesmicro_stepsskipped_stepsgradient_averagewarn_unscaled_lossconfig_configloaded_checkpoint_mp_world_sizeloaded_checkpoint_dp_world_sizeenable_backward_allreduceinside_no_sync_ctxtprogressive_layer_drop
eigenvalueblock_eigenvaluegas_boundary_ctrr   communication_backend_namedist_backendhas_moe_layersnum_expertsgate_modules
moe_layers_step_applied_global_grad_normuse_ds_commcheckpoint_engine"_is_gradient_accumulation_boundaryscale_wrt_gaslossesmesh_devicera   rP   _do_args_sanity_check_configure_with_arguments_do_sanity_checkautotp_size_configure_tensor_paralleltensor_parallel_configr   memory_breakdownelasticity_enabled#is_elastic_model_parallel_supported_set_distributed_varsdist	configurerc   monitor_configmonitorr   r~   pipeline_parallelism_configure_distributed_modelis_deepcompile_enabled_create_module_forward_pre_hookmodule_forward_pre_hook _create_module_forward_post_hookmodule_forward_post_hooknamed_parametersparam_names_get_model_parametersrV   timersrU   timers_configtrain_batch_sizesteps_per_print
tput_timerrR   flops_profiler_enabledr   module#flops_profiler_recompute_fwd_factorflops_profilerdeepspeed_iotraining_dataloaderr   basic_optimizerlr_scheduleroptimizer_name
parameterslist_configure_optimizer_configure_lr_scheduler_report_progresszero_optimization_configure_zero_optimizerbfloat16_enabled_configure_bf16_optimizerhasattrcompression.helperr   r   setsparse_tensor_module_namesnamed_modulestorchnn	EmbeddingEmbeddingBagsparse_gradients_enabledaddrQ   infoformat_optimized_linear_offload_setupsave_non_zero_checkpointsave_zero_checkpointr   _configure_checkpointingeigenvalue_enabled_configure_eigenvaluepld_enabled!_configure_progressive_layer_dropcurriculum_enabled_legacy&_configure_curriculum_scheduler_legacycurriculum_scheduler_legacyrandom_ltd_enabledrandom_ltd_configrt   train_micro_batch_size_per_gpuru   _configure_random_ltd_schedulerrandom_ltd_schedulerr   wall_clock_breakdownengine_timersglobal_rankprint
dump_stater   r
   flattenr   	unflatten_is_compiledr   r   r   NAMEadd_z3_gather_releaser   schedule_prefetchr   r   move_opt_states)r   argsmodelr   model_parametersr   r  r   dist_init_requiredr   r   config_classr   r   has_optimizerr   r   r  r>  	__class__r   r   r      s  











"










zDeepSpeedEngine.__init__c                 C   s  d| _ d| _d }| j D ](\}}t|tr5d| _d }|d ur)||jjks)J d|jj}|jdkr5d| _ q|d u r<d S d}| j	 D ]\}}t
|drR|| 7 }qC|| }td|d  d	| d
 d}| j	 D ]\}}t
|dr||k r|| 7 }d|_|  qld|_qld S )NFTzBall lora_config offload ratios should be the same across the modelr   r   ds_optim_paramzoffloading d   z#% of eligible params, specifically z params)%optimized_linear_base_weight_shardingoptimized_linear_lora_enabledr  r)  r   r!   lora_configoffload_ratiozero_shardsr  r%  numelrQ   r0  
ds_offloadoffload)r   r[  _r  total_paramspoffload_limittotal_offloadedr   r   r   r2    sD   




z/DeepSpeedEngine._optimized_linear_offload_setupc                 C   s   |  | t| d S r   )!_configure_tensor_parallel_statesr#   )r   rO  	tp_configr   r   r   r     s   
z*DeepSpeedEngine._configure_tensor_parallelc                    s`     |   dksJ dt _ jj  d d _ fdd} jj|ddd _dS )	a  
        Configures the tensor parallel states for the model.
        This includes setting up the tensor parallel groups, initializing the TP mesh,
        and registering a pre-hook to ensure that the Dataloader inputs are consistent across ranks.
        r   zYCurrently, the compatibility between 'autotp' and 'zero_stage = 3' has not been validated)tensor_model_parallel_sizeNc                    sP    fdd} j  } j  }|||| |||| td  j  d S )Nc                    s   t | tr	t| } t| dkrg j dkr(| g}tj|||t 	 d d}nd g}tj|||t 	 d t
| |d }tj| jt 	 d}tj||d t|tjt  jt 	 dsiJ dd S d S )Nr   )object_listsrcgroupdeviceTr   rk  rj  ziData inconsistency within the TP group. Please check the Dataloader implementation to ensure consistency.)r   tupler  lenr   get_tensor_model_parallel_rankr   broadcast_object_listr   current_devicerf   r*  tensorcommunication_data_type
all_reduceequalrP   $get_tensor_model_parallel_world_size)rN  
bcast_rankbcast_group	_src_argsis_equalequal_tensorr   r   r   broadcast_and_check  sD   

zDeepSpeedEngine._configure_tensor_parallel_states.<locals>.check_dataloader_inputs_same_across_ranks.<locals>.broadcast_and_checkz::The Dataloader has passed the TP group consistency check.)r   "get_tensor_model_parallel_src_rankget_tensor_model_parallel_grouprQ   r0  first_dataloader_checkremove)r  rN  kwargsr~  rx  ry  r}  r   r   )check_dataloader_inputs_same_across_ranks  s   
 

zdDeepSpeedEngine._configure_tensor_parallel_states.<locals>.check_dataloader_inputs_same_across_ranksTprependwith_kwargs)	_set_client_modelzero_optimization_stagerP   r   _init_tp_mesh_devicer   r  r  register_forward_pre_hook)r   rO  r  r   r}  r   re    s   
+z1DeepSpeedEngine._configure_tensor_parallel_statesc                 C   s<   | j d urt| j dr| j   |  rt   t  d S )Ndestroy)r   r%  r  r  r   cleanuprb   r}  r   r   r   r    s
   


zDeepSpeedEngine.destroyc                 C   s   |   rPi | _d}d}| j D ] }d}t|dr||j7 }n|| 7 }||7 }|jr0||7 }q| jdkrF|| j	 | jd< || j	 | jd< t
d|  d S d S )Nr   	ds_tensor
num_paramstrainable_num_paramszmodel parameter = )autotuning_profile_model_infoautotuning_model_infor  r  r%  ds_numelr]  requires_gradrD  mp_world_sizerQ   r0  )r   r  r  rb  nr   r   r   r    s$   

z%DeepSpeedEngine._get_model_parametersc                 C   s   | j | j| jfS )a  Get all training batch related settings.
        Returns:
            train_batch_size (int): The effective training batch size. This is the amount of data
                samples that leads to one step of model update.
            train_micro_batch_size_per_gpu (int): Batch size to be processed by one GPU in one
                step (without gradient accumulation).
            gradient_accumulation_steps (int): Number of training steps to accumulate gradients
                before averaging and applying them.
        )r  r?  gradient_accumulation_stepsr}  r   r   r   get_batch_info  s   zDeepSpeedEngine.get_batch_infoc                 C   sD   ||   | j  dkrtd||   | j  }|| j_|| j_dS )a  Adjust the global batch size by increasing or decreasing the number of
        micro-batches (i.e., gradient accumulation steps). The size of each micro-batch
        (i.e., ``train_micro_batch_size_per_gpu``) is not changed.
        Args:
            train_batch_size (int): The new global batch size for training.
        Raises:
            ValueError: if ``train_batch_size`` is not divisible by the
                configured micro-batch size and data parallelism.
        r   zBTrain batch size must be divisible by micro-batch data parallelismN)r?  dp_world_size
ValueErrorr   r  r  )r   r  new_gasr   r   r   set_train_batch_size   s
   
z$DeepSpeedEngine.set_train_batch_sizec                 C   s&   || j j | j }|| j _|| j _dS )zAdjust the micro batch size(i.e., the micro batch size in every data parallel group),
        while keep the gradient accumulation steps the same.
        Args:
            micro_batch_size (int): The new micro batch size for training.
        N)r   r  r  r  r?  )r   micro_batch_sizenew_global_batch_sizer   r   r   set_train_micro_batch_size2  s   z*DeepSpeedEngine.set_train_micro_batch_sizec                 C   s   | j d ur|| j _d S d S r   )r  post_process_func)r   r  r   r   r   set_data_post_process_func=  s   
z*DeepSpeedEngine.set_data_post_process_funcc                 C   s,   | j d ur|  r| j j| d S d S d S r   )r  curriculum_learning_enableddata_sampler'set_custom_curriculum_learning_schedule)r   schedule_func_dictr   r   r   r  A  s   z7DeepSpeedEngine.set_custom_curriculum_learning_schedulereturnc                 C      | j S )a  Return the 2-norm of all gradients. If there is model parallelism,
        the norm will be global.
        The computed norm will be cached and reused until the next step() pass.
        .. note::
            In the presence of model parallelism, this is a collective call
            and acts as a barrier among ``mpu.get_model_parallel_group()``.
        Returns:
            float: norm
        )r   r}  r   r   r   get_global_grad_normE  s   
z$DeepSpeedEngine.get_global_grad_normc                 C   s`   i }d| j v r| j d }|t| v rt| |S |t|v r"t||S tdt| j d| d)zg
        Pass through attributes defined in the model if they are not overridden by ds-engine.
        r  'z' object has no attribute ')__dict__dirgetattrAttributeErrortyper   )r   r   _moduler   r   r   __getattr__Q  s   



zDeepSpeedEngine.__getattr__c                 C      | j jS r   )r   !checkpoint_tag_validation_enabledr}  r   r   r   r  `     z1DeepSpeedEngine.checkpoint_tag_validation_enabledc                 C   r  r   )r   checkpoint_tag_validation_failr}  r   r   r   r  c  r  z.DeepSpeedEngine.checkpoint_tag_validation_failc                 C   r  r   )r   r   r}  r   r   r   r   f  r  z"DeepSpeedEngine.elasticity_enabledc                 C   s(   |   r| jj| jj dkrdS dS d S Nr   TF)r   r   num_gpus_per_nodeelastic_model_parallel_sizer}  r   r   r   r   i  s
   z3DeepSpeedEngine.is_elastic_model_parallel_supportedc                 C   r  r   )r   r8  r}  r   r   r   r8  q  r  zDeepSpeedEngine.pld_enabledc                 C   r  r   )r   
pld_paramsr}  r   r   r   r  t  r  zDeepSpeedEngine.pld_paramsc                 C      |   t S r   )r  r7   r}  r   r   r   	pld_thetaw     zDeepSpeedEngine.pld_thetac                 C   r  r   )r  r8   r}  r   r   r   	pld_gammaz  r  zDeepSpeedEngine.pld_gammac                 C   r  r   )r   r6  r}  r   r   r   r6  }  r  z"DeepSpeedEngine.eigenvalue_enabledc                 C   r  r   )r   eigenvalue_verboser}  r   r   r   r    r  z"DeepSpeedEngine.eigenvalue_verbosec                 C   r  r   )r   eigenvalue_max_iterr}  r   r   r   r    r  z#DeepSpeedEngine.eigenvalue_max_iterc                 C   r  r   )r   eigenvalue_tolr}  r   r   r   r    r  zDeepSpeedEngine.eigenvalue_tolc                 C   r  r   )r   eigenvalue_stabilityr}  r   r   r   r    r  z$DeepSpeedEngine.eigenvalue_stabilityc                 C   r  r   )r   "eigenvalue_gas_boundary_resolutionr}  r   r   r   r    r  z2DeepSpeedEngine.eigenvalue_gas_boundary_resolutionc                 C   r  r   )r   eigenvalue_layer_namer}  r   r   r   r    r  z%DeepSpeedEngine.eigenvalue_layer_namec                 C   r  r   )r   eigenvalue_layer_numr}  r   r   r   r    r  z$DeepSpeedEngine.eigenvalue_layer_numc                 C   r  r   )r   r:  r}  r   r   r   r:    r  z)DeepSpeedEngine.curriculum_enabled_legacyc                 C   r  r   )r   curriculum_params_legacyr}  r   r   r   r    r  z(DeepSpeedEngine.curriculum_params_legacyc                 C   r  r   )r   data_efficiency_enabledr}  r   r   r   r    r  z'DeepSpeedEngine.data_efficiency_enabledc                 C   r  r   )r   data_efficiency_configr}  r   r   r   r    r  z&DeepSpeedEngine.data_efficiency_configc                 C      | j jt t S r   )r   r  rh   rj   r}  r   r   r   data_sampling_enabled     z%DeepSpeedEngine.data_sampling_enabledc                 C   s   | j jt S r   )r   r  rh   r}  r   r   r   data_sampling_config  r  z$DeepSpeedEngine.data_sampling_configc                 C      | j jt t t S r   )r   r  rh   rk   rl   r}  r   r   r   r       z+DeepSpeedEngine.curriculum_learning_enabledc                 C   r  r   )r   r  rh   rk   r}  r   r   r   curriculum_learning_config  r  z*DeepSpeedEngine.curriculum_learning_configc                 C   r  r   )r   r  ri   rn   ro   r}  r   r   r   r=    r  z"DeepSpeedEngine.random_ltd_enabledc                 C   r  r   )r   r  ri   rn   r}  r   r   r   r>    r  z!DeepSpeedEngine.random_ltd_configc                 C   s   |   sJ |  }tdd t|t D }d}| j D ]'\}}t|trEt	|dkrEt
|d |v rE||| j| |  |d7 }q|t |krXtd|t  d| |t t ri| jd u seJ tdd S )Nc                 S   s   g | ]}|qS r   r   )r   xr   r   r   
<listcomp>  s    z9DeepSpeedEngine.random_ltd_initialize.<locals>.<listcomp>r   r   zrandom_ltd_layer_num zF must be                 equivalent to the len of random_ltd_layer_id znot yet support)r=  r>  r   sortedrp   r  r)  r   r{   ro  strinit_configrA  popleftrq   r  rr   rs   r   )r   r>  random_ltd_queuecountr   layerr   r   r   random_ltd_initialize  s&   
z%DeepSpeedEngine.random_ltd_initializec                 C   r  r   )seq_parallel_groupr}  r   r   r   get_sequence_parallel_group  s   z+DeepSpeedEngine.get_sequence_parallel_groupc                 C   r  r   )r   rB  r}  r   r   r   rB    r  z$DeepSpeedEngine.wall_clock_breakdownc                 C   s   | j jjp|  S r   )r   flops_profiler_configenabledautotuning_enabledr}  r   r   r   r       z&DeepSpeedEngine.flops_profiler_enabledc                 C   
   | j jjS r   )r   r  recompute_fwd_factorr}  r   r   r   r       
z3DeepSpeedEngine.flops_profiler_recompute_fwd_factorc                 C   s    | j jj}| j jjr|  }|S r   )r   r  profile_stepautotuning_configr  autotuning_start_profile_step)r   stepr   r   r   flops_profiler_profile_step  s   

z+DeepSpeedEngine.flops_profiler_profile_stepc                 C   r  r   )r   r  module_depthr}  r   r   r   flops_profiler_module_depth  r  z+DeepSpeedEngine.flops_profiler_module_depthc                 C   r  r   )r   r  top_modulesr}  r   r   r   flops_profiler_top_modules  r  z*DeepSpeedEngine.flops_profiler_top_modulesc                 C   s   | j jjrdS | j jjS NF)r   r  r  r  detailedr}  r   r   r   flops_profiler_detailed  s   

z'DeepSpeedEngine.flops_profiler_detailedc                 C   r  r   )r   r  output_filer}  r   r   r   flops_profiler_output_file  r  z*DeepSpeedEngine.flops_profiler_output_filec                 C   r  r   )r   r   r}  r   r   r   r     r  z DeepSpeedEngine.memory_breakdownc                 C   r  r   )r   r  r  r}  r   r   r   r    r  z"DeepSpeedEngine.autotuning_enabledc                 C   r  r   )r   r  start_profile_stepr}  r   r   r   r    r  z-DeepSpeedEngine.autotuning_start_profile_stepc                 C   r  r   )r   r  end_profile_stepr}  r   r   r   autotuning_end_profile_step  r  z+DeepSpeedEngine.autotuning_end_profile_stepc                 C   $   | j jj}|stjt d}|S )Nzautotuning_metric.json)r   r  metric_pathospathjoingetcwdr   r  r   r   r   autotuning_metric_path     
z&DeepSpeedEngine.autotuning_metric_pathc                 C   r  )Nzautotuning_model_info.json)r   r  model_info_pathr  r  r  r  r  r   r   r   autotuning_model_info_path  r  z*DeepSpeedEngine.autotuning_model_info_pathc                 C   r  r   )r   r  metricr}  r   r   r   autotuning_metric   r  z!DeepSpeedEngine.autotuning_metricc                 C   s$   |   o| jjjo| jjjddS )NprofileF)r  r   r  
model_infogetr}  r   r   r   r    s   
z-DeepSpeedEngine.autotuning_profile_model_infoc                 C   r  r   )r   r.  r}  r   r   r   r.    r  z(DeepSpeedEngine.sparse_gradients_enabledc                 C   r  r   )r   r  r}  r   r   r   r    r  z DeepSpeedEngine.train_batch_sizec                 C   r  r   )r   r?  r}  r   r   r   r?    r  z.DeepSpeedEngine.train_micro_batch_size_per_gpuc                 C   s   | j r| j jjS | jjS r   )r   rU  r   r   r  r}  r   r   r   r    s   zDeepSpeedEngine.optimizer_namec                 C   r  r   )r   optimizer_paramsr}  r   r   r   r    r  z DeepSpeedEngine.optimizer_paramsc                 C   r  r   )r   optimizer_legacy_fusionr}  r   r   r   r    r  z'DeepSpeedEngine.optimizer_legacy_fusionc                 C   r  r   )r   scheduler_namer}  r   r   r   r    r  zDeepSpeedEngine.scheduler_namec                 C   r  r   )r   scheduler_paramsr}  r   r   r   r    r  z DeepSpeedEngine.scheduler_paramsc              
   C   s   | j jt t t | j jt t t | j jt t t | j jt t t | j jt t t | j jt t t	 | j jt t t
 | j jt t t | j jt t t f	S r   )r   compression_configrB   rC   rA   rD   rE   rF   rG   rH   rI   rJ   rK   r}  r   r   r   quantize_training   s   z!DeepSpeedEngine.quantize_trainingc                 C   r  r   )r   zero_enabledr}  r   r   r   r!  /  r  z!DeepSpeedEngine.zero_optimizationc                 C   r  r   )r   zero_allow_untested_optimizerr}  r   r   r   r	  2  r  z-DeepSpeedEngine.zero_allow_untested_optimizerc                 C   r  r   )r   zero_force_ds_cpu_optimizerr}  r   r   r   r
  5  r  z+DeepSpeedEngine.zero_force_ds_cpu_optimizerc                 C   r  r   )r   zero_configreduce_scatterr}  r   r   r   zero_reduce_scatter8  r  z#DeepSpeedEngine.zero_reduce_scatterc                 C   r  r   )r   r  overlap_commr}  r   r   r   zero_overlap_comm;  r  z!DeepSpeedEngine.zero_overlap_commc                 C   r  r   )r   r  offload_optimizerr}  r   r   r   zero_offload_optimizer>  r  z&DeepSpeedEngine.zero_offload_optimizerc                 C   r  r   )r   r  offload_paramr}  r   r   r   zero_offload_paramA  r  z"DeepSpeedEngine.zero_offload_paramc                 C   s*   | j jjd ur| j jjjtjtjfv S dS r  )r   r  r  rk  r   cpunvmer}  r   r   r   zero_use_cpu_optimizerD  s   z&DeepSpeedEngine.zero_use_cpu_optimizerc                 C   s$   | j jjd ur| j jjjtjkS dS r  )r   r  r  rk  r   r  r}  r   r   r   zero_cpu_offloadI  s   z DeepSpeedEngine.zero_cpu_offloadc                 C   s   t | jjjddS )Nratio      ?)r  r   r  r  r}  r   r   r   zero_partial_offloadN  r  z$DeepSpeedEngine.zero_partial_offloadc                 C   r  r   )r   r  sub_group_sizer}  r   r   r   zero_sub_group_sizeQ  r  z#DeepSpeedEngine.zero_sub_group_sizec                 C   r  r   )r   r  r}  r   r   r   r  T  r  z'DeepSpeedEngine.zero_optimization_stagec                 C   r  r   )r   mics_shard_sizer}  r   r   r   r  W  r  zDeepSpeedEngine.mics_shard_sizec                 C   r  r   )r   r  reduce_bucket_sizer}  r   r   r   zero_reduce_bucket_sizeZ  r  z'DeepSpeedEngine.zero_reduce_bucket_sizec                 C   r  r   )r   r  use_multi_rank_bucket_allreducer}  r   r   r    zero_multi_rank_bucket_allreduce]  r  z0DeepSpeedEngine.zero_multi_rank_bucket_allreducec                 C   r  r   )r   r  allgather_bucket_sizer}  r   r   r   zero_allgather_bucket_size`  r  z*DeepSpeedEngine.zero_allgather_bucket_sizec                 C      |   tjkS r   )r  r?   	gradientsr}  r   r   r   %zero_optimization_partition_gradientsc     z5DeepSpeedEngine.zero_optimization_partition_gradientsc                 C   r$  r   )r  r?   weightsr}  r   r   r   #zero_optimization_partition_weightsf  r'  z3DeepSpeedEngine.zero_optimization_partition_weightsc                 C   s>   |   dk r|  rdnd}|   dkr| j|   k rd}|S r  )r  r)  rD  )r   retr   r   r    is_first_weights_partition_groupi  s   z0DeepSpeedEngine.is_first_weights_partition_groupc                 C   r  r   )r   r  contiguous_gradientsr}  r   r   r   zero_contiguous_gradientsp  r  z)DeepSpeedEngine.zero_contiguous_gradientsc                 C   r  r   )r   r  load_from_fp32_weightsr}  r   r   r   zero_load_from_fp32_weightss  r  z+DeepSpeedEngine.zero_load_from_fp32_weightsc                 C   r  r   )r   r  elastic_checkpointr}  r   r   r   zero_elastic_checkpointv  r  z'DeepSpeedEngine.zero_elastic_checkpointc                 C   s   t | jddS )Nswap_optimizerF)r  r   r}  r   r   r   zero_nvme_offload_optimizery  r'  z+DeepSpeedEngine.zero_nvme_offload_optimizerc                 C   r  r   )r   r  max_live_parametersr}  r   r   r   zero_max_live_parameters|  r  z(DeepSpeedEngine.zero_max_live_parametersc                 C   r  r   )r   r  max_reuse_distancer}  r   r   r   zero_max_reuse_distance  r  z'DeepSpeedEngine.zero_max_reuse_distancec                 C   r  r   )r   r  prefetch_bucket_sizer}  r   r   r   zero_prefetch_bucket_size  r  z)DeepSpeedEngine.zero_prefetch_bucket_sizec                 C   r  r   )r   r  module_granularity_thresholdr}  r   r   r   !zero_module_granularity_threshold  r  z1DeepSpeedEngine.zero_module_granularity_thresholdc                 C   r  r   )r   r  param_persistence_thresholdr}  r   r   r    zero_param_persistence_threshold  r  z0DeepSpeedEngine.zero_param_persistence_thresholdc                 C   r  r   )r   r  model_persistence_thresholdr}  r   r   r    zero_model_persistence_threshold  r  z0DeepSpeedEngine.zero_model_persistence_thresholdc                 C   r  r   )r   r  "gather_16bit_weights_on_model_saver}  r   r   r   'zero_gather_16bit_weights_on_model_save  r  z7DeepSpeedEngine.zero_gather_16bit_weights_on_model_savec                 C   r  r   )r   r  
grad_hooksr}  r   r   r   zero_grad_hooks  r  zDeepSpeedEngine.zero_grad_hooksc                 C   r  r   )r   r  legacy_stage1r}  r   r   r   zero_legacy_stage1  r  z"DeepSpeedEngine.zero_legacy_stage1c                 C   r  r   )r   r  ignore_unused_parametersr}  r   r   r   zero_ignore_unused_parameters  r  z-DeepSpeedEngine.zero_ignore_unused_parametersc                 C   r  r   )r   r   r}  r   r   r   r     r  z&DeepSpeedEngine.tensor_parallel_configc                 C   r  r   )r   r   r   r}  r   r   r   r     r  zDeepSpeedEngine.autotp_sizec                 C   r  r   )r   graph_harvestingr}  r   r   r   rH    r  z DeepSpeedEngine.graph_harvestingc                 C   r  r   )r   fp16_enabledr}  r   r   r   rI    r  zDeepSpeedEngine.fp16_enabledc                 C   r  r   )r   r#  r}  r   r   r   r#    r  z DeepSpeedEngine.bfloat16_enabledc                 C   r  r   )r   !fp16_master_weights_and_gradientsr}  r   r   r   rJ    r  z1DeepSpeedEngine.fp16_master_weights_and_gradientsc                 C   r  r   )r   amp_enabledr}  r   r   r   rK    r  zDeepSpeedEngine.amp_enabledc                 C   r  r   )r   
amp_paramsr}  r   r   r   rL    r  zDeepSpeedEngine.amp_paramsc                 C   r  r   )r   fp16_auto_castr}  r   r   r   rM    r  zDeepSpeedEngine.fp16_auto_castc                 C   r  r   r   
loss_scaler}  r   r   r   rO    r  zDeepSpeedEngine.loss_scalec                 C   r  r   )r   r  r}  r   r   r   r    r  z+DeepSpeedEngine.gradient_accumulation_stepsc                 C   r  r   )r   use_node_local_storager}  r   r   r   rP    r  z&DeepSpeedEngine.use_node_local_storagec                 C   r  r   )r   load_universal_checkpointr}  r   r   r   rQ    r  z)DeepSpeedEngine.load_universal_checkpointc                 C   s6   | j j}|d ur
|S |  rtjS |  rtjS tjS r   )r   rt  rI  r*  float16r#  bfloat16float32)r   resr   r   r   rt    s   z'DeepSpeedEngine.communication_data_typec                 C   s   || j _d S r   )r   rt  )r   valuer   r   r   rt    s   c                 C   s
   | j j S r   )r   prescale_gradientsr}  r   r   r   postscale_gradients  r  z#DeepSpeedEngine.postscale_gradientsc                 C   r  r   )r   gradient_predivide_factorr}  r   r   r   rY    r  z)DeepSpeedEngine.gradient_predivide_factorc                 C   r  r   )r   r  r}  r   r   r   r    r  zDeepSpeedEngine.steps_per_printc                 C   r  r   )r   r  allgather_partitionsr}  r   r   r   zero_allgather_partitions  r  z)DeepSpeedEngine.zero_allgather_partitionsc                 C   r  r   )r   r  round_robin_gradientsr}  r   r   r   zero_round_robin_gradients  r  z*DeepSpeedEngine.zero_round_robin_gradientsc                 C   r  r   )r   r  zero_hpz_partition_sizer}  r   r   r   r^    r  z'DeepSpeedEngine.zero_hpz_partition_sizec                 C   r  r   )r   r  zero_quantized_weightsr}  r   r   r   r_    r  z&DeepSpeedEngine.zero_quantized_weightsc                 C   r  r   )r   r  #zero_quantized_nontrainable_weightsr}  r   r   r   r`    r  z3DeepSpeedEngine.zero_quantized_nontrainable_weightsc                 C   r  r   )r   r  zero_quantized_gradientsr}  r   r   r   ra    r  z(DeepSpeedEngine.zero_quantized_gradientsc                 C   r  r   )r   r  zeropp_loco_paramr}  r   r   r   rb    r  z!DeepSpeedEngine.zeropp_loco_paramc                 C   r  r   )r   r  log_trace_cache_warningsr}  r   r   r   zero_log_trace_cache_warnings  r  z-DeepSpeedEngine.zero_log_trace_cache_warningsc                 C   r  r   )r   rF  r}  r   r   r   rF    r  zDeepSpeedEngine.dump_statec                 C   r  r   )r   gradient_clippingr}  r   r   r   re    r  z!DeepSpeedEngine.gradient_clippingc                 C   s   | j jdkS Nr   rN  r}  r   r   r   dynamic_loss_scale  r  z"DeepSpeedEngine.dynamic_loss_scalec                 C   r  r   )r   initial_dynamic_scaler}  r   r   r   rh    r  z%DeepSpeedEngine.initial_dynamic_scalec                 C   r  r   )r   dynamic_loss_scale_argsr}  r   r   r   ri     r  z'DeepSpeedEngine.dynamic_loss_scale_argsc                 C   r  r   )r   swap_tensor_configr}  r   r   r   rj    r  z"DeepSpeedEngine.swap_tensor_configc                 C   r  r   )r   
aio_configr}  r   r   r   rk    r  zDeepSpeedEngine.aio_configc                 C   sr   t j}|  rt j}n|  rt j}| jjd u r.|t jkr(|  s(t j}||fS |}||fS t	| jjj
}||fS r   )r*  rT  rI  rR  r#  rS  r   grad_accum_dtyper!  r   rV  )r   model_dtyperl  r   r   r   get_data_types	  s   zDeepSpeedEngine.get_data_typesc                 C      | j d uo
t| j dS )Ncheckpoint_event_prologuer   r%  r}  r   r   r   "_optimizer_has_ckpt_event_prologue     z2DeepSpeedEngine._optimizer_has_ckpt_event_prologuec                 C   ro  )Ncheckpoint_event_epiloguerq  r}  r   r   r   "_optimizer_has_ckpt_event_epilogue  rs  z2DeepSpeedEngine._optimizer_has_ckpt_event_epiloguec                 C   s   | j r$t| j trtddgd |  | j| _n!tddgd | j | _n| | j}td|   dgd || _td| j dgd d S )Nz6DeepSpeed using client callable to create LR schedulerr   r   z#DeepSpeed using client LR schedulerz*DeepSpeed using configured LR scheduler = zDeepSpeed LR Scheduler = )	r   r   r   rR   r  r  _scheduler_from_configr   r  )r   r  r   r   r   r     s   
z'DeepSpeedEngine._configure_lr_schedulerc              
   C   s   t  | _| jd ur=| jjjr=zddlm} || jjd| _W n ty< } zt	d|  t  | _W Y d }~nd }~ww t
 }|  rH| jn|}|dkpU|  oU|  | _|  s_|  rntj| jjd}||k| _d S d S )Nr   )NebulaCheckpointEngine)config_paramszBNo torch_nebula was found! Will fall back to torch.save. Details: rm  )r|   r   r   nebula_configr  <deepspeed.runtime.checkpoint_engine.nebula_checkpoint_enginerw  ImportErrorrQ   errorrP    _get_sequence_data_parallel_rankrP  
local_rankr)  r+  r3  r!  r#  r   get_rankr   dp_process_groupr4  )r   rQ  rw  errdp_rankrank
param_rankr   r   r   r5  0  s&   z(DeepSpeedEngine._configure_checkpointingc                 C   sp   |   }|d ur6tt|rtt|}nttjj|s!J d| ttjj|}|  }||fi |}|S d S )Nz*DeepSpeed does not recognize LR scheduler )r  r%  rO   r  r*  optimr  r  )r   r   r  	schedulerr  instantiated_schedulerr   r   r   rv  M  s   
z&DeepSpeedEngine._scheduler_from_configc                 C   sz   |d urt |dr|jn| j}|dkr/t | tt || _t	 | _
t | _d S d| _
d| _t  | _d S )Ndevice_rankr   r   )r%  r  r~  r   
set_devicer*  rk  r   r   get_world_size
world_sizer  rD  )r   rN  r  r   r   r   r   ^  s   
z%DeepSpeedEngine._set_distributed_varsc                 C   sv   dt jv r&t jd}t jd|}||ks!J d| d| d|t jd< tt jd | _t|dr9| j|_d S d S )NOMPI_COMM_WORLD_LOCAL_RANK
LOCAL_RANKzLOCAL_RANK (z!) != OMPI_COMM_WORLD_LOCAL_RANK (zG), not sure how to proceed as we're seeing conflicting local rank info.r~  )r  environr  intr~  r%  )r   rN  r   ompi_local_rankr~  r   r   r   r   k  s   


z)DeepSpeedEngine._configure_with_argumentsc                 C   s   dt jv sdt jv sJ dt|drJ|jd urLt|jts,J d|j dt|j |jdkrNtt jd}||jksPJ d|j d	| d
d S d S d S d S )Nr  r  zDeepSpeed requires the LOCAL_RANK environment variable, it is set by the deepspeed launcher, deepspeed.init_distributed, or the torch's launcher. If using a different launcher please ensure LOCAL_RANK is set prior to initializing deepspeed.r~  zargs.local_rank of z is an unknown type r   z0Mismatch in local rank setting, args.local_rank=z but env['LOCAL_RANK']=.)r  r  r%  r~  r   r  r  r  )r   rN  env_local_rankr   r   r   r   }  s   
z%DeepSpeedEngine._do_args_sanity_checkc                 C   s   |t v pttj|d d uS r   )r$   r  r*  r  )r   r  r   r   r   _is_supported_optimizer  s   z'DeepSpeedEngine._is_supported_optimizerc                 C   s@   d }zddl m} W n	 ty   Y nw tg}|r|| |S )Nr   )FairseqOptimizer)fairseq.optim.fairseq_optimizerr  r{  r   r   )r   r  expected_optim_typesr   r   r   _supported_optims  s   
z!DeepSpeedEngine._supported_optimsc                 C   s
  |   rt  std|  rt  std|  }|td tg7 }t	| j
t|s8J dt| j
 | j
sQ|  d urQ| |  sQJ d|  |  tks]|  tkrj|  sjJ d|  t	| jtrt	| j
tsJ dt| j
 dd S d S )Nz*Type fp16 is not supported on your device.z*Type bf16 is not supported on your device.z'Client Optimizer is of unexpected type z){} is not a supported DeepSpeed Optimizerz4DeepSpeed {} optimizer requires dynamic loss scalingzClient Optimizer (type = z< is not instantiated but Client LR Scheduler is instantiated)rI  r   is_fp16_supportedr  r#  is_bf16_supportedr  r  r   r   r   rn  r  r  r1  r(   r*   rg  r   r	   r   )r   r  r   r   r   r     s4   z DeepSpeedEngine._do_sanity_checkc                 C   s   dd }| j  D ]8\}}t|r,t|r+||r+tj|jt	|j
| j|j
 d q	t|rA||rAtj|jt | jd q	d S )Nc                 S   s,   t | dr| jtjurdS t | drdS dS )N	ds_statusFrV  T)r%  r  r   	AVAILABLE)rb  r   r   r   is_replicated  s
   
z7DeepSpeedEngine._broadcast_model.<locals>.is_replicatedrm  )r  r  r   r*  	is_tensorr   	broadcastdatarP   _get_expert_broadcast_src_rank
group_nameexpert_data_parallel_group_get_broadcast_src_rankseq_data_parallel_group)r   r  r  rb  r   r   r   _broadcast_model  s   

z DeepSpeedEngine._broadcast_modelrO  r   c                    s   d S r   )allr  r   r  r  r  )rO  r   r   )r   r   __check_params  s   zDeepSpeedEngine.__check_paramsc                 C   s"   | j d}||d< || j d< d S )N_modulesr  )r  r  )r   rO  modulesr   r   r   r    s   z!DeepSpeedEngine._set_client_modelc                 C   s>  |  | |  otdd | j D }|  r)|r#| | jtj | j  n| 	 r=|r7| | jtj
 | j
  n| | jtj | jsQ|sQ| j| j | j D ]\}}t|trid| _| j|j qV| jr| j D ](\}}t|tr| j| |  rd|_t|tr| j| |  rd|_qr| jd ur| jt_| j D ]\}}t|dr|| jj qd | _ | ! r| " rdnd}t#|dgd t$ | _ t% | _&t' | _(t) | _*t+ | _,t- | _.t/ | _0t1 | _2t3 | _4| j4d	kr| jj5| _6t7 | _8| 9 s|s| :  d S d S d S )
Nc                 S   s   g | ]}t |d qS ds_idr%  r   r   r   r   r   r    r   z@DeepSpeedEngine._configure_distributed_model.<locals>.<listcomp>Tset_deepspeed_parallelismzUsing LoCo quantized gradientszUsing quantized gradientsr   r   r   );r  r)  anyr  r  rI  _DeepSpeedEngine__check_paramsr*  halfr#  rS  floatr   tork  r)  r   r   r   r   r   r   r   rB  r   r   r   rP   r%  r  r    use_data_before_expert_parallel_local_all_to_all_groupra  rb  rR   _get_local_all_to_all_group_get_data_parallel_groupr   _get_data_parallel_world_sizer  !_get_sequence_data_parallel_groupr  &_get_sequence_data_parallel_world_sizeseq_dp_world_size_get_model_parallel_world_sizer  _get_expert_parallel_group_dictexpert_parallel_group$_get_expert_data_parallel_group_dictr  !_get_sequence_parallel_world_sizesequence_parallel_size$seq_parallel_communication_data_typert  _get_sequence_parallel_groupr  rK  r  )r   rO  is_zero_init_modelr`  r  messager   r   r   r    sp   


















z,DeepSpeedEngine._configure_distributed_modelc                    sZ   | j  D ]%\}}t|dd  t fdd|jD }|dks*J d| dqd S )Nc                 S   s   dd | D S )Nc                 S   s   g | ]}t |qS r   )idr  r   r   r   r  #      zKDeepSpeedEngine._check_for_duplicates.<locals>.ids_list.<locals>.<listcomp>r   rm  r   r   r   ids_list"  r'  z7DeepSpeedEngine._check_for_duplicates.<locals>.ids_listc                    s2   g | ]} |d  v r |d   ndqS paramsr   )r  )r   rj  r  param_idr   r   r  %  s    $z9DeepSpeedEngine._check_for_duplicates.<locals>.<listcomp>r   zParameter with name: zo occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behavior.)r  r  r  sumparam_groups)r   r   r   r   
occurrencer   r  r   _check_for_duplicates  s   z%DeepSpeedEngine._check_for_duplicatesc                 C   sL  |   \}}|  }|  }|r|rJ d|rFt|s.|  s$J d| jdkr.td |tj	krD|tj
krD|  dkrD|  sDtS tS |rv||krPtd|tj	ksZ|tjkr^tdztdtj W tS  tyu   td	w ||kr|tj	kr| jrtd
 tS td|tjkrtS d S |tj	kr|tj
krtS td)NzrAmp and ZeRO are not currently compatible, please use (legacy) fp16 mode which performs similar to amp opt_mode=O2zYou are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.r   zN**** You are using ZeRO with an untested optimizer, proceed with caution *****r   zLModel data type and gradient accumulation data type must be equal to use Ampz:Cannot enable both amp with (legacy) fp16 or bfloat16 modezInitializing Apex amp from: {}z;Unable to import apex/amp, please make sure it is installedz{**** BF16 gradient accumulation is not safe numerically with large number of accumulation steps, proceed with caution *****zqBfloat16 wrapper must use a gradient accumulation type of fp32, enable ZeRO to use Bfloat16 gradient accumulationz=unsupported mix of model dtype and gradient accumulation type)rn  r!  rK  r   r	  rD  rQ   warningr*  rS  rT  r  r  r9   r   NotImplementedErrorrR  r0  r1  r   __path__	NameErrorRuntimeErrorr;   r  r:   )r   r  rm  rl  r  rK  r   r   r   _do_optimizer_sanity_check+  sj   



z*DeepSpeedEngine._do_optimizer_sanity_checkc                 C   s  |d u r| j rt|}| |}td|   ddgd n:t|t|  r1|}tddgd n||}tddgd |  rXt|t	j
jjsX|  rXdt| d}t|d	d
 |jD |jd d < tddgd | | || _td|jjdgd | |}|tkr| || _nC|tkr|  }td| dgd tj| j|fi |\}| _| | |    n|t!kr| "|| _n|t#kr| $|| _n|| _td| jjjdgd | % | _&| ' | _(d S )Nz%Using DeepSpeed Optimizer param name z as basic optimizerr   r   z)Using client Optimizer as basic optimizerz/Using client callable to create basic optimizerz=You are using ZeRO-Offload with a client provided optimizer (a  ) which in most cases will yield poor performance. Please either use deepspeed.ops.adam.DeepSpeedCPUAdam or set an optimizer in your ds-config (https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). If you really want to use a custom optimizer w. ZeRO-Offload and understand the performance impacts you can also set <"zero_force_ds_cpu_optimizer": false> in your configuration file.c                 S   s    g | ]}t |d  dkr|qS r  )ro  )r   pgr   r   r   r  u       z8DeepSpeedEngine._configure_optimizer.<locals>.<listcomp>z@Removing param_group that has no 'params' in the basic OptimizerzDeepSpeed Basic Optimizer = {}z$Initializing AMP with these params: zDeepSpeed Final Optimizer = {}))r   r   _configure_basic_optimizerrR   r  r   rn  r  r  	deepspeedopsadamDeepSpeedCPUAdamr
  r  r   r  r  r  r1  rU  r   r  r   r"  r   r;   rL  r   
initializer  r  r  r:   _configure_fp16_optimizerr9   r$   _configure_compression_schedulerr@   _configure_quantization	quantizer)r   r   rP  r  msgoptimizer_wrapperrL  rO  r   r   r   r  b  sH   





z$DeepSpeedEngine._configure_optimizerc                 C   s  |   }|d u r
i }d| v rtd|  ttfv rx|td}|tt	}|  tkp/|}|rL|s@t
jj|fi |}|S t
jj|fi |}|S |  rdddlm} ||fi |d|i}|S ddlm} ||fi |d|i}|S |  tkr|  rdd	lm}	 |	|fi |}|S t
jj|fi |}|S |  tkrdd
lm}
 |
|fi |}|S |  tkr|  rJ dddlm} ||| fi |}|  std |S |  t kr|  rJ dddl!m"} ||| fi |}|  std |S |  t#kr1|  rJ dddl$m%} ||| fi |}|  s/td |S |  t&kr]|  rMddl'm(} ||fi |}|S ddl'm)} ||fi |}|S |  t*krzddl+m,} W n t-y{   t.d Y nw ||fi |}|S |  t/krzddl+m0} W n t-y   t.d Y nw ||fi |}|S |  t1krzddl+m2} W n t-y   t.d Y nw ||fi |}|S t3t
j|  }||fi |}|S )Nmax_grad_normz'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more detailsFr   )r  
adamw_moder   adam_w_mode)DeepSpeedCPUAdagrad)	FusedLambz%1bit-Adam is not compatible with ZeRO)
OnebitAdamzCCurrently the convergence of 1-bit Adam is only verified under FP16z$0/1 Adam is not compatible with ZeRO)ZeroOneAdamzACurrently the convergence of 0/1 Adam is only verified under FP16z%1bit-Lamb is not compatible with ZeRO)
OnebitLambzCCurrently the convergence of 1-bit Lamb is only verified under FP16)DeepSpeedCPULion)	FusedLion)MuAdamz#Install mup to use MuAdam optimizer)MuAdamWz$Install mup to use MuAdamW optimizer)MuSGDz"Install mup to use MuSGD optimizer)4r  keysr  r  r&   r'   popr+   r,   r-   r*  r  AdamAdamWr  deepspeed.ops.adamr  r   r%   deepspeed.ops.adagradr  Adagradr(   deepspeed.ops.lambr  r)   r!  "deepspeed.runtime.fp16.onebit.adamr  rI  rQ   r  r.   $deepspeed.runtime.fp16.onebit.zoadamr  r*   "deepspeed.runtime.fp16.onebit.lambr  r2   deepspeed.ops.lionr  r  r/   mupr  r{  r|  r0   r  r1   r  r  )r   rP  optimizer_parameters
torch_adamr  effective_adam_w_moder   r  r   r  r  r  r  r  r  r  r  r  r  torch_optimizerr   r   r   r    s   MKGA862
+

$



z*DeepSpeedEngine._configure_basic_optimizerc                 C   s   t | j| jjS r   )r@   r  r   r  r}  r   r   r   r    r  z0DeepSpeedEngine._configure_compression_schedulerc                 C   s   t |S r   rx   )r   configsr   r   r   r@    r  z/DeepSpeedEngine._configure_random_ltd_schedulerc                 C   sz   |   \	}}}}}}}}}	|r|s|  sJ dd }
|r;|s;ddlm} ||||||||  |	|  r8|  nd	}
|
S )NzRMoQ (quantize in optimization step) weight quantization is only supported for FP16r   )	Quantizer)r  rI  deepspeed.runtime.quantizer  r6  r  )r   quantize_weight_in_forwardquantize_enabledq_groupsq_mixed_fp16q_change_ratioq_type
q_rounding	q_verboseuse_quantizer_kernelr  r  r   r   r   r    s<   
z'DeepSpeedEngine._configure_quantizationc                 C   s  |   }|  }|  }trtjjtf}nt}t||s$|  t	t
fv rl|  rMtddgd |  r6| jnt }t|| d||| j||  || jd
}|S td|   dgd t|| |  | j||  | jd}|S tddgd t|| |  |  || j||  tkd	}|S )
Nz/Creating fp16 optimizer with dynamic loss scaler   r   T)	r  rg  rh  dynamic_loss_argsr   	clip_gradfused_adam_legacyr  r   z0Creating fp16 optimizer with static loss scale: )r  static_loss_scaler   r  r  r   z7Creating fp16 unfused optimizer with dynamic loss scale)r  r  rg  r  r   r  fused_lamb_legacy)rh  ri  re  APEX_INSTALLEDapex
optimizersr   r   r  r)   r.   rg  rR   rB  r  rT   r   r   r  r   rO  r   r(   )r   r   rh  r  r  
fused_optsr  r   r   r   r    s^   
$
z)DeepSpeedEngine._configure_fp16_optimizerc                 C   s   |   }|d u rtt| j }tddgd |  r| jnt }t	|| j
| j||  | j||  d |  | jj| jd}|S )NzCreating BF16 optimizerr   r   r   )	r   r  r"  r  r  grad_acc_dtyperH  immediate_grad_updater   )re  r   r  r  r  rR   rB  r  rT   r    r  r   r#  r  rn  rH  r   bfloat16_immediate_grad_updater   )r   r   r  r  r   r   r   r$  O  s$   
z)DeepSpeedEngine._configure_bf16_optimizerc                 C   s
  |   }|  }|  \}}|  r| jnt }|d u r%tt| j	 }| 
 r-td|tjkr<|  }|  }|  }	t|trKJ d|td| d| ddgd t| jtrh|rhtd d	}t|| jfi d
|d|  d|  d|  d|  d|d|  d|  d|  d| j d| j!r| j"nd d| j!r| j#nId d| $ d|d| % d| j&d| ' d| ( d| ) d| * d|tjkd|	d | j!d!| + d"|d#| j,d$| - }|S d| $ d|d| % d| j&d| ' d| ( d| ) d| * d|tjkd|	d | j!d!| + d"|d#| j,d$| - }|S |tj.kr~| j!rJJ d%t|trtd&dgd t/0 }
| 1 d'kro|
d u ro| 2  t/0 }
t3| jf|| j4|  | 5 | 6 | 7 | 8 | 9 | : | j&|
| ; | < | = | > d(}|S td)| d*|dk d+| j?j@ dgd |dkr| A||S td| d| ddgd dd,lBmC} || j|fi d
|d-| j4d|  d|  d|  d|  d|  d|  d.| 5 d/| 6 d0| 7 d1| 8 d2| 9 d| j d3| jDd| $ d|  d| % d4| : d5| E d6| F d| j&d| ' d| ( d| ) d7| G d"|d#| j,d8| 1 d9| ; d:| < d;| = d<| H d=| > }|S tId>|)?NzThe deprecated version of ZeRO Stage 1 is not supported in deepspeed >= 0.5.9. Please downgrade to a version less than 0.5.9 if you need to use this deprecated version of ZeRO.z#zero stage {} requires an optimizerz	Creating z ZeRO stage z
 optimizerr   r   zQPipeline parallelism does not support overlapped communication, will be disabled.Fr  r  rg  r  r  r,  r  r   r"  r  r  r  r  r  offload_optimizer_configr   rX  rY  r  rF  partition_gradsr\  r   rJ  gradient_accumulation_dtypert  r0  zMoE not supported with Stage 3zCreating ZeRO Offloadr   )r  	ds_configr  r8  r6  r4  r<  r>  offload_param_configr   zero_param_parallel_groupr_  r`  r;  rc  zCreating fp16 ZeRO stage z optimizer, MiCS is enabled z, Hierarchical params gather )DeepSpeedZeroOptimizer_Stage3r  r8  r6  r4  r<  r>  all2all_process_groupr  r  r[  rk  r^  r_  r`  r;  rb  rc  zZeRO stage {} not implemented)Jr  r  rn  rB  r  rT   r   r  r  r  rE  	Exceptionr?   r%  r  r-  r]  r   r1  rR   r~   rQ   r  r   r  rO  rg  ri  re  r  r!  r#  r  r   r  r  r  r  r   rX  rY  r  rG  rJ  rt  r1  r(  rP   $_get_zero_param_intra_parallel_groupr^  _set_zero_group_parallelismr   r   r9  r7  r5  r=  r?  r  r_  r`  r;  rd  r   mics_hierarchial_params_gather_return_mics_optimizerdeepspeed.runtime.zero.stage3r!  r  r  r  rk  rb  r  )r   r   
zero_stager  rm  r  r  r  r,  r\  r   r!  r   r   r   r"  f  s  
	

n
nG
	
 !"#$*z)DeepSpeedEngine._configure_zero_optimizerc                 C   s   ddl m} |  \}}|| j|fi d|d| jd|  d|  d|  d|  d	| 	 d
| 
 d|  d|  d|  d|  d|  d| jd|  d|  d|  d|  d|  d| jd|  d|  d|  d|  d|d| j}|S )Nr   )MiCS_Optimizerr  r  r  rg  r  r  r,  r  r8  r6  r4  r<  r>  r  r  r  r  r  r  r   rX  rY  r  rk  r  rt  )deepspeed.runtime.zero.micsr*  rn  r  r   rO  rg  ri  re  r-  r  r9  r7  r5  r=  r?  r  r  r  r  r  r  r   rX  rY  r  rk  rt  )r   r  r  r*  rm  r  r   r   r   r   r'    st   	
z&DeepSpeedEngine._return_mics_optimizerc              	   C   s6   t |  |  |  |  |  |  |  d}|S )N)verbosemax_itertol	stabilitygas_boundary_resolution
layer_name	layer_num)rg   r  r  r  r  r  r  r  )r   r   r   r   r   r7    s   
z%DeepSpeedEngine._configure_eigenvaluec                 C   s   t |  |  d}|S )N)thetagamma)rd   r  r  )r   pldr   r   r   r9     s   z1DeepSpeedEngine._configure_progressive_layer_dropc                 C   s   t |  }|S r   )rw   r  )r   r  r   r   r   r;  %     z6DeepSpeedEngine._configure_curriculum_scheduler_legacyc                 C   s   t | do	t | dS )N__getitem____len__r  objr   r   r   is_map_style_dataset)  s   z$DeepSpeedEngine.is_map_style_datasetc                 C   s   t | tjjjS r   )r   r*  utilsr  IterableDatasetr9  r   r   r   is_iterable_style_dataset-  s   z)DeepSpeedEngine.is_iterable_style_datasetc                 C   r  r   )r   dataloader_drop_lastr}  r   r   r   r?  1  r  z$DeepSpeedEngine.dataloader_drop_lastc                 C   r  )a`  Returns True if the latest ``step()`` produced in parameter updates.
        Note that a ``False`` return is not an error condition. Steps are frequently
        no-ops, such as between gradient accumulation boundaries or when overflows
        occur.
        Returns:
            bool: Whether the latest ``step()`` modified model parameters.
        )r   r}  r   r   r   was_step_applied4  s   z DeepSpeedEngine.was_step_appliedTc                 C   s
  |  |s| |std|d u r|  }|d u r| j}d }|tkr&| j}| j}	| j}
| j	d ur;| j	
 }	| j	 }
|d u rR|tksG|tkrRtjjj||	|
dd}i }|  rrt|  t|  t| jt|  t| jt|  t i}t||||| j||||	|
|  |dS )Nz%Training data must be a torch DatasetF)num_replicasr  shuffle)datasetr   
pin_memoryr   r~  r  num_local_io_workersr  data_parallel_world_sizedata_parallel_rankr?  deepspeed_dataloader_config) r;  r>  r  r?  r   r4   r  r  rD  r   get_data_parallel_world_sizeget_data_parallel_rankr5   r6   r*  r<  r  DistributedSamplerr  rk   rv   r  r=   r   r<   r  r>   rm   r  r3   r~  r?  )r   rC  r   routerD  r  r   rE  deepspeed_io_timerrF  rG  rH  r   r   r   r  >  sV   


zDeepSpeedEngine.deepspeed_ioc                 C   s   d| _ | j| dS ) TNr   r  train)r   moder   r   r   rP  z     zDeepSpeedEngine.trainc                 C   s   d| _ | jd dS )rN  TFNrO  r}  r   r   r   eval  rR  zDeepSpeedEngine.evalc                 C   s   |d u r|   n|}t|tjr|| }|S t|ts t|tr<g }|D ]}t|tjr4|||  q$|| q$|S |}| jrNt	dt
|  d| _|S )Nz0DeepSpeed unable to scale loss because of type: F)r  r   r*  Tensorrn  r  r   r   rQ   r  r  )r   prescaled_losseval_micro_batchesscaling_factorscaled_losslr   r   r   _scale_loss_by_gas  s    z"DeepSpeedEngine._scale_loss_by_gasc                    s    fdd} j j|dddS )Nc                    s     ||S r   )_forward_prologue)r  inputsr  r}  r   r   _module_forward_pre_hook  r  zQDeepSpeedEngine._create_module_forward_pre_hook.<locals>._module_forward_pre_hookFTr  )r  r  )r   r]  r   r}  r   r    s   z/DeepSpeedEngine._create_module_forward_pre_hookc                    s    fdd} j |S )Nc                    s       d S r   )_forward_epilogue)r  inputoutputr}  r   r   _module_forward_post_hook  r  zSDeepSpeedEngine._create_module_forward_post_hook.<locals>._module_forward_post_hook)r  register_forward_hook)r   ra  r   r}  r   r	    s   z0DeepSpeedEngine._create_module_forward_post_hookc                 C   s  d}|   std|  d |  o| j|  ko| jdk}| jdkrYt| drY| jj	dd | j
rY|  dkr<| jjn| jj}| jjrY| j
||  rP| jjnd|  d  d}|rb| jjd d	 |d ur| jjru| jru|| j  | jjd
kr| jjr|  r| j| jd  |   d dkr|d| j! i d}| jjr| " r| j#$| j | j%d u r| j&'  | (| j)j* | + r| j, D ]}d|j-_.q| / r| 0|}d}|r||fS d S )NFzEngine before forwardr   r   r@   T)step_zero_checkr   )ignore_listPipelineEnginer   curriculum_typeseqlencurriculum_seqlen)1r  r   r   r  r   r  rD  r%  r@   r  r  r  r   bit16_groupsfp16_groupsweight_quantization_enabledquantizerI  overflowr6  r  start_profiler  trainingr   update	get_staterU  r   r:  r<  update_difficultyr  get_current_difficultyr=  rA  
update_seqr  r  start_start_timersrC  r   r)  r  _parameters_in_forwardrM  _cast_inputs_half)r   r\  r  return_modifiedflops_profiler_activetensor_to_quantizer  r   r   r   r[    sd   



z!DeepSpeedEngine._forward_prologuec                 C   s|   |   r| j D ]}d|j_q	| | jj |  o&| j	| 
 ko&| jdk}|r.| j  |  s<td|  d d S d S )NFr   zEngine after forwardr   )r)  r  r  rw  rx  _stop_timersrC  r   r  r   r  rD  r  stop_profiler  r   r   )r   r  r{  r   r   r   r^    s   

z!DeepSpeedEngine._forward_epiloguec                 O   sx   |   rt }|  rt| dr| | j | j|i |}|   r:t | }|| jd< t| jdg| 	 d t
  |S )zExecute forward propagation
        Arguments:
            *inputs: Variable length input list
            **kwargs: variable length keyword arguments
        launch_compile_passesactivation_mem_per_gpur   r  )r  r   r  r%  r  r   r  r  r   r  exit)r   r\  r  malossactivation_memr   r   r   forward  s   

zDeepSpeedEngine.forwardc                 C   s   t |ttfrg }|D ]
}|| | q||S t |tr4i }| D ]\}}| |||< q&|S t|drA|	 rA|
 S |S )Nr  )r   r  rn  r   ry  rU  dictitemsr%  is_floating_pointr  )r   r\  
new_inputsvkr   r   r   ry    s   

z!DeepSpeedEngine._cast_inputs_halfc                 C   s   d}d}d}d}| j D ]}||j7 }q| jD ]}||j7 }||j7 }||j7 }qtd|dd|dd|dd|dd|ddd	gd
 d S )N        ztime (ms) | fwd: z.2fz (fwd_moe: z, 1st_a2a: z, 2nd_a2a: z	, top_k: )r   r   )r   	gate_timer   time_moetime_falltoalltime_salltoallrR   )r   fwd_timer  moe_time	falltoall	salltoallgaterY  r   r   r   print_forward_breakdown   s   



,
z'DeepSpeedEngine.print_forward_breakdownc                 C   sr   |   | j_ |  r| j  d S |   r7|  tjkr,t| jdr,| jj| j	d d S d }| j
||d d S d S )Nreduce_gradients)pipeline_parallel)gradselements_per_buffer)!is_gradient_accumulation_boundaryr   r&  /overlapping_partition_gradients_reduce_epiloguer  r?   optimizer_statesr%  r  r  buffered_allreduce_fallback)r   bucket_sizer  r   r   r   allreduce_gradients7  s   z#DeepSpeedEngine.allreduce_gradientsc                 C   s   t d|  d | jd ur| j}| jo| j o|   }|r-|  dkr-|r-| | }|	 
 }| jd u r:|n| j| | _| jjr_|  r_| jdkr_d| j | jfg| _| j| j |  rit|   |S )NzEngine before backwardr   r   r   zTrain/Samples/train_loss)r   r   r   r   r   r  r  rZ  r  meandetachr   r  r  r  rD  itemr   summary_eventswrite_eventsr   )r   r  r   do_gradient_reduction	mean_lossr   r   r   _backward_prologueH  s&   

z"DeepSpeedEngine._backward_prologuec                 C   sD   |  | jj | jr| js|   | | jj td|  d d S )NzEngine after backwardr   )	rv  rC  r   r   r   r  r}  r   r   r}  r   r   r   _backward_epiloguee  s
   z"DeepSpeedEngine._backward_epiloguec                 C   s  |  | jj |  r|  | j_| jj||d nc|  rC|   }tj	|| j|d}|j|d W d    n1 s=w   Y  n:| 
 r^|  rU| jj|ddd n(| jj||d n|  rk| jj||d n|  rw|jddd n|j|d | | jj d S )N)retain_graph)delay_unscaleT)create_graphr  )rv  rC  r   r!  r  r   backwardrK  r   
scale_lossrI  r6  r#  r}  )r   r  r  r  rX  r   r   r   _do_optimizer_backwardn  s(   
z&DeepSpeedEngine._do_optimizer_backwardc                 c   sL    |   rJ d|   | jrJ dd| _z	dV  W d| _dS d| _w )a  
            Context manager to disable gradient reduction during backward pass.
            This context manager has the following effects on other DeepSpeed features:
            1. Incompatible with ZeRO stage 2/3 which rely on reduction for gradient partitioning.
            2. It is illegal to call engine.step() within the context manager.
            3. Tracking of gradient accumulation steps is disabled.
        zWno_sync context manager is incompatible with gradient partitioning logic of ZeRO stage z.no_sync context manager reentry is unsupportedTNF)r&  r  r   r}  r   r   r   no_sync  s   
	zDeepSpeedEngine.no_syncc                 C   s^   | j durt| j trJ d| | jj | ||}| || |   | 	| jj |S )zExecute backward pass on the loss
        Arguments:
            loss: Torch tensor on which to execute backward propagation
            retain_graph: bool, default: false
                forward on user defined choice of retain_graph
        Nz;must provide optimizer during init in order to use backward)
r   r   r   rv  rC  r   r  r  r  r}  )r   r  r  r   r   r   r   r    s   zDeepSpeedEngine.backwardc                 C   s&   | j du r| jd |   dkS | j S )a  
        Query whether the current micro-batch is at the boundary of
        gradient accumulation, and thus will trigger gradient reductions and
        an optimizer step.

        Returns:
            bool: if the current step is a gradient accumulation boundary.

        Nr   r   )r   r   r  r}  r   r   r   r    s   

z1DeepSpeedEngine.is_gradient_accumulation_boundaryc                 C   s   || _ || j_dS )a  
        Manually overrides the DeepSpeed engine's gradient accumulation boundary state, this is an optional
        feature and should be used with care. The state should be set before to the intended
        value before each forward/backward. The final forward/backward should have the
        boundary state set to True. This style allows client code to only call engine.step() once after all
        the gradient accumulation passes are complete. See example below:
        .. code-block:: python
        engine.set_gradient_accumulation_boundary(False)
        for _ in range(gradient_accumulation_steps - 1):
            micro_batch = next(data_loader)
            loss = engine(micro_batch)
            engine.backward(loss)
        engine.set_gradient_accumulation_boundary(True)
        micro_batch = next(data_loader)
        loss = engine(micro_batch)
        engine.backward(loss)
        engine.step()
        Arguments:
            is_boundary (bool): are we at a gradient accumulation boundary or not?
        N)r   r   r  )r   is_boundaryr   r   r   "set_gradient_accumulation_boundary  s   z2DeepSpeedEngine.set_gradient_accumulation_boundaryc                 C   s   | j  D ]\}}d|_qdS )z'
        Zero parameter grads.
        N)r  r  grad)r   
param_namer   r   r   r   	zero_grad  s   zDeepSpeedEngine.zero_gradc                 C   s   t | j |  | jd d S )Nr  max_normr   )re   r  r  re  r   r}  r   r   r   clip_fp32_gradients  s   z#DeepSpeedEngine.clip_fp32_gradientsc                 C   s   |   dkr/|  s|  s|  s|  s|   n|  r/t| j}t	||   | j
d | j  t| jdr?| jj| _| jrg|  dkrL| jjn| jj}| jjrg| j||  r`| jjnd|  | |  r}|  r{t| jdr{| j  n	 n|  s|  s|  r| j  n|   d}t| jdr| jj}| | _|r|  jd7  _n)| j  | jd urz| jjdi |pi  W n ty   | j|   Y nw |  d ur| jr| jd	knd
}|r| jd |   d	kr|  | jd  d | _!|  jd7  _|  j"|  7  _"d S )Nr  r  r   r   Fr  rm  r   r   Tr   )#re  rI  r#  rK  r!  r  r   master_paramsr   re   r   r  r%  r   r  r  ri  rj  r@   rk  rl  rm  r6  r  r   r   r  	TypeErrorr  r  rD  r   r   r   r   )r   	lr_kwargsr   r  r|  rm  report_progressr   r   r   _take_model_step  sb    




z DeepSpeedEngine._take_model_stepc                 C   sV  | j rJ dtd|  d |  o| j|  ko| jdk}| | jj	 | j
dur1t| j
tr5J dd}d| _|  r|  jd7  _|  rk| j|   dkrk| j rktd	dgd
 | j| j| j| j
j| _| jru| j| j |  r| j|   s| j r| || j n| | | jr| jdknd}| jj|  |d |  | jj	 | j!j"r|  r| jdkrd| # d | j$fg| _%| & rt'| j
dr| j%(d| j
j| j$f |  r| j|   s| j) }t*t+|D ]}| j%(d| | j,| d | j$f q| j!-| j% |rC| . r*| j/0 d | _1| j/2 | _3n| j/j4| j| 5 | 6 | 7 | 8 d | j/9  | . rV| j| : d krV| ;  | < rg| j=j>| jj?|  d | < sq|  r|  r| j!j"r| @  | jAr| =tBjCdd}| jD|d | j=>| jjE |  jFd7  _Ftd|  d dS )zpExecute the weight update step after forward and backward propagation
        on effective_train_batch.
        zBIt is illegal to call Engine.step() inside no_sync context managerzEngine before stepr   r   Nz7must provide optimizer during init in order to use stepFr   zcomputing eigenvalue...r   T)global_stepreport_speedzTrain/Samples/lr	cur_scalezTrain/Samples/loss_scalez"Train/Eigenvalues/ModelBlockParam_   )r  r  r  r  r  )namesr   reset)r  zEngine after step)Gr   r   r   r  r   r  rD  rv  rC  r   r   r   r   r   r  r   r6  r  r  any_precision_switchrR   r   compute_eigenvaluer  rk  r  r   r   update_stater  r  stopr}  r  r  get_lrr   r  rI  r%  r   valuesrangero  	ev_valuesr  r  r  get_total_flopsflopsget_total_durationfwd_durationprint_model_profiler  r  r  r  end_profiler  _autotuning_exitrB  r  logr   _write_monitorr   r\   elapsedr  r   r   )r   r  r{  r  r  r   r  r   r   r   r  "	  s   









zDeepSpeedEngine.stepc                 C   s   |D ]	}|  |  qd S r   )r  ru  )r   timer_namesr   r   r   r   rv  	  s   zDeepSpeedEngine._start_timersc                 C   s>   |   o|  o| j|  k}|D ]}| |j|d qd S )N)record)r  r  r   r  r  r  )r   r  r  r   r   r   r   r}  	  s   zDeepSpeedEngine._stop_timersc                 C   s  | j dkr| jjtttgdd}d}|t|v r|t nd7 }|t|v r'|t nd7 }|t|v r3|t nd7 }||  9 }||d< | jd |   | |d< |  d |d  |d< t	|dg| 
 d	 td
| 
  dtj| 
  dgd dd l}|td t  d S )Nr   Fr  r  latencyi@B FLOPS_per_gpu
throughputr  zWrote metrics to , r   z0Autotuning: done with running current ds config.)rD  r  get_meanr\   r]   r`   r  r  r  r   r  rR   r  r  abspathatexitregisterrE  r  )r   r  titerr  r   r   r   r  	  s4   


z DeepSpeedEngine._autotuning_exitc                 C   s   | j dkrMd| tjdd| jfd| tjdd| jfd| tjdd| jfd| tjdd| jfd| tjdd| jfg| _	| j
| j	 d S d S )	Nr   z%Train/Samples/elapsed_time_ms_forwardFr  z&Train/Samples/elapsed_time_ms_backwardz,Train/Samples/elapsed_time_ms_backward_innerz0Train/Samples/elapsed_time_ms_backward_allreducez"Train/Samples/elapsed_time_ms_step)rD  r  r\   r  r   r]   r^   r_   r`   r  r  r  r}  r   r   r   r  	  s0   
zDeepSpeedEngine._write_monitorc                 C   sB   g }| j s|S | j jD ]}||v r|||  q|d q|S )Nr  )r   r  r   )r   r  resultrj  r   r   r   _get_optimizer_param	  s   z$DeepSpeedEngine._get_optimizer_paramc                 C   
   |  dS )Nlrr  r}  r   r   r   r  	  r  zDeepSpeedEngine.get_lrc                 C   r  )Nr  r  r}  r   r   r   get_type	  r  zDeepSpeedEngine.get_typec                 C   s    |   dv r| dS | dS )N)SGDRMSpropmomentumbetas)r  r  r}  r   r   r   get_mom	  s   

zDeepSpeedEngine.get_momc                 C   s   | j r| j  S d S r   )r   	get_thetar}  r   r   r   get_pld_theta	  s   
zDeepSpeedEngine.get_pld_thetac              	   C   s<   |   }|  }td| d| j d| d| dgd d S )Nzstep=z
, skipped=z, lr=z, mom=r   r   )r  r  rR   r   )r   r  r  momr   r   r   r   	  s   ,z DeepSpeedEngine._report_progressc                 C   s   |  |}|}| j|jkr|| j}|d u rtj|d}|  rJ|  dkr0|d|    tj	||d | j
rI|  |krI||  |  n|d|  tj	||d | j|jkrg||urg|| |S Nrm  r  )rG  rt  r   r  r   r  rX  rY  mul_ru  r   copy_)r   bucketdp_groupr  rs  tensor_to_allreducer   r   r   allreduce_bucket	  s&   

z DeepSpeedEngine.allreduce_bucketc                 C   s8   |  |||}t|| ||D ]	\}}|| qd S r   )r  ziprH  r  )r   small_bucketr  r  
allreducedbufsyncedr   r   r   allreduce_and_copy
  s   z"DeepSpeedEngine.allreduce_and_copyr   c                 C   sh   g }d}|D ]}| | ||  }||kr"| ||| g }d}qt|dkr2| ||| d S d S rf  )r   r]  r  ro  )r   r  r  numel_per_bucketr  r  r]  rs  r   r   r   allreduce_no_retain
  s   
z#DeepSpeedEngine.allreduce_no_retainc                 C   s   g }i }| j r| j D ]}g ||< q| j D ]=\}}|js q|jd u r2tj|	 |j
|jd|_|jj}|| jv s>|jrCt|j}t|rP||j | q|| q||fS )Nrl  )r   r  r  r  r  r  r  r*  zerossizer   rk  r  r(  	is_sparserN   r   r  r   )r   non_expert_gradsexpert_gradskeyr  r   	grad_datar   r   r   _get_gradients_for_reduction 
  s"   


z,DeepSpeedEngine._get_gradients_for_reductionc                 C   s   t |\}}| jr| j }t|}nt }t|t| j	 }t
|D ]\}}|r8|\}	}
| j|
||d q&t
|D ]\}}|rP|\}	}| j||||d q=d S N)r  r  )r  r  r  )r   r  r   get_data_parallel_groupr   r  rP   r  r  r  r   sparse_allreduce_no_retainr   )r   r  r  split_sparse_tensor_bucketssplit_dense_tensor_bucketsr  r  r`  sparse_bucket_tuplebucket_typer   dense_bucket_tupler   r   r   r   _reduce_non_expert_gradients?
  s*   
z,DeepSpeedEngine._reduce_non_expert_gradientsc                 C   s   t t }| D ]>\}}t|}t|\}}t|D ]\}	}
|
r0|
\}}| j|||d qt|D ]\}	}|rH|\}}| j	||||d q5qd S r	  )
r   r  rP   r  r  _get_expert_data_parallel_groupr   r   r  r   )r   r  r  r  ep_nameexpert_grads_groupep_dp_groupr  r  r`  r  r  r   r  r   r   r   r   _reduce_expert_gradientsT
  s,   
z(DeepSpeedEngine._reduce_expert_gradientsc                 C   sj   |d u rt | jdr| j \}}n|  \}}n	| jr J d|}| || | jr3| || d S d S )Nget_grads_for_reductionz8attempting to reduce grads in unsupported way w.r.t. MoE)r%  r   r  r  r   r  r  )r   r  r  r  r  r   r   r   r  k
  s   z+DeepSpeedEngine.buffered_allreduce_fallbackc                 C   s@   |  |||}|D ]}|jr| |j_q	|j|  q	d S r   )sparse_allreduce_bucketr  to_coo_tensororig_dense_tensorr  r  to_dense)r   r  r  r  allreduced_sparsesrs  r   r   r   r  {
  s   z*DeepSpeedEngine.sparse_allreduce_no_retainc                 C   s&   g }|D ]}| | ||| q|S r   )r   sparse_allreduce)r   r  r  r  sparse_listsparser   r   r   r  
  s   z'DeepSpeedEngine.sparse_allreduce_bucketc           	      C   s   |j j}| j|j jkr'| jtjtjfv r|jtj}n|j}|j | j}n|j}|j }|d u r7t	j
|d}|  rH| jrG||  |  n|d|  | ||}| ||}t|tj|_t|||_ |S r  )r  r   rt  r*  rR  rS  indicesr  int32r   r  rX  r   r  rY  sparse_all_gathercatlong)	r   r  r  r  original_data_typer   r  indices_device_listvalues_device_listr   r   r   r  
  s(   z DeepSpeedEngine.sparse_allreducec                    s6  t  d g| j}| ||}t |   | } dv s'J  dkrL|dkr;t 	|g fddt
tj|dD }n#|dkr_t 	| d g fddt
tj|dD }tj||d g }t|D ]\}}	|| d }
||	dt j|
t j| jd q}|S )	Nr   )r   r   r   c                    s   g | ]}  qS r   )	new_emptyr   r`  max_sizerV  r   r   r  
  r   z5DeepSpeedEngine.sparse_all_gather.<locals>.<listcomp>rm  c                    s    g | ]}   d  qS )r   )r(  r  r)  r*  r   r   r  
  s    
rl  )r*  
LongTensorr  r  rk  all_gather_scalarr#  maxdimr(  r  r   r  
all_gatherr   r   index_selectaranger$  )r   rV  r  my_size	all_sizes	fill_sizetensor_listr   dev_idxr   r  r   r*  r   r"  
  s(   "$z!DeepSpeedEngine.sparse_all_gatherc                    s2    fddt tj|dD }tj| |d |S )Nc                    s   g | ]	}    qS r   )	new_zerosr  r)  rV  r   r   r  
  s    z5DeepSpeedEngine.all_gather_scalar.<locals>.<listcomp>rm  )r  r   r  r0  )r   rV  r  r6  r   r9  r   r-  
  s   z!DeepSpeedEngine.all_gather_scalarrN  c                 C   sR   | j j|||d}|r| j  D ]\}}|js||v r||= q|  r't|}|S )N)destinationprefix	keep_vars)r  
state_dictr  r  r=  rz   )r   r:  r;  r<  exclude_frozen_parameterssdr  rb  r   r   r   module_state_dict
  s   z!DeepSpeedEngine.module_state_dictr   c              
   C   sz  |rYt t  }t|t t   }	t|	D ]>}
||	 |
 }|jt| d|||t	
dd}d}t| D ]}|| | | |
 }||||< q9|| qd S d}| D ][\}}t|tr|j}|j}	t |}t|	D ]>}
||	 |
 }|jt| ||||t	
dd}d}t| D ]}|| | | |
 }||||< q|| qw|d7 }q_d S )Nr  map_location).deepspeed_moe.experts.deepspeed_experts.r   r   )rP   _get_expert_data_parallel_rank_get_max_expert_size_namer.  _get_expert_parallel_world_sizer  loadr   _get_expert_ckpt_namer*  rk  r  r  replacer  rp  r)  r   r   expert_group_namenum_local_experts_get_expert_parallel_rank)checkpoint_pathtagr=  old_moe_loadrO  r   r   r   	expp_rankrL  local_expert_idglobal_expert_idexpert_state_dictmoe_str_prefixr  	local_keymoe_layer_idn_moduler  r  r   r   r   load_moe_state_dict
  s`   	






z#DeepSpeedEngine.load_moe_state_dictc           
      C   s  |rdd | j  D }ng }tjj|dd |d }|r&||| j d n| j j||d W d    n1 s8w   Y  |td d ur|t }| j  D ]3}|jrTqN|| j	vrat
d| d	| j	| }	t|d
rv|jj||	 j qN|j||	 j qNd S d S )Nc                 S   s&   g | ]}t |d r|jtjkr|qS r  )r%  r  r   NOT_AVAILABLEr   rb  r   r   r   r    s    z:DeepSpeedEngine.load_module_state_dict.<locals>.<listcomp>r   modifier_rankr  )ri  dst)strictfailed to find frozen  in named paramsr  )r  r  r  zeroGatheredParametersload_state_dictr  rM   r  r  r  r%  r  r  r  )
r   
checkpointr_  custom_load_fnfetch_z3_paramsparams_to_fetchr@  saved_frozen_paramsr   r   r   r   r   load_module_state_dict
  s8   	


z&DeepSpeedEngine.load_module_state_dictc                 C   s   |rdnd d| S )Nbf16_rN  zero_pp_rank_r   )r   r  	bf16_moder   r   r   _get_zero_ckpt_prefix)  rs  z%DeepSpeedEngine._get_zero_ckpt_prefixc                 C   s4   | j ||d}tj|t|| d|dd}|S )N)rm  	_mp_rank_02d_optim_states.pt)rn  r  r  r  r  )r   checkpoints_pathrO  mp_rankr  rm  file_prefixzero_ckpt_namer   r   r   _get_rank_zero_ckpt_name,  s   z(DeepSpeedEngine._get_rank_zero_ckpt_namec                 C   sB   | j d u rdn| j  }tj| jjd}|  }| |||||S )Nr   rm  )r   get_model_parallel_rankr   r  r   r  r#  rv  )r   rr  rO  rs  pp_rankrm  r   r   r   _get_zero_ckpt_name5  s   z#DeepSpeedEngine._get_zero_ckpt_namec                 C   s   |d ur|}n| j d u rdn| j  }|d}|  r?|  r"d}ndtj| jjd}t	j
|t|| d| d}|S t	j
|t|d| d }|S )	Nr   rp  zero_pp_rank_0zzero_pp_rank_{}rm  ro  _model_states.ptmp_rank_)r   rw  r)  rQ  r1  r   r  r   r  r  r  r  r  )r   rr  rO  mp_placeholdermp_rank_strrs  filename	ckpt_namer   r   r   _get_ckpt_name;  s(   
zDeepSpeedEngine._get_ckpt_namec              	   C   s@   | j d u rdn| j  }tj|t|d| d|dd}|S )Nr   
expp_rank_ro  rp  rq  )r   rw  r  r  r  r  )r   rr  rO  rQ  rs  r  r   r   r   _get_optimizer_ckpt_nameT  s
   z(DeepSpeedEngine._get_optimizer_ckpt_namec                 C   s   |d u rdn|  }|dkr(tj| |d u rdnt|d| d|dd}|S tj| |d u r2dnt|d| d	| d|dd}|S )
Nr   rA  rN  expert_ro  rp  r{  layer__expert_)rw  r  r  r  r  )rr  layer_id	expert_idrO  r   rs  r  r   r   r   rI  Z  s   z%DeepSpeedEngine._get_expert_ckpt_namec                 C   s.   | j ||dd}dd l}||}|  |S )N*)r}  r   )r  globsort)r   rr  rO  ckpt_file_patternr  
ckpt_filesr   r   r   _get_all_ckpt_namesg  s
   
z#DeepSpeedEngine._get_all_ckpt_namesc              	   C   s  |du rL|   r
dnd}tj||}	tj|	r5t|	d}
|
  }W d   n1 s/w   Y  n|   rAtd|	 dt	
d|	 d d	S |  rU| j  | j|||||||d
\}}|duon|  pn|  }|r|ru|ry|   r| j|||d}nd}|s| j  |  rddlm}m} | jjj}tj||d}||\}}}t	d| d| d|d dd |||dd ||\}}}t	d|d dd | j  |  r| j  |   r|  s| j  ||fS )a  
        Load training checkpoint

        Arguments:
            load_dir: Required. Directory to load the checkpoint from
            tag: Checkpoint tag used as a unique identifier for checkpoint, if not provided will attempt to load tag in 'latest' file
            load_module_strict: Optional. Boolean to strictly enforce that the keys in state_dict of module and checkpoint match.
            load_optimizer_states: Optional. Boolean to load the training optimizer states from Checkpoint. Ex. ADAM's momentum and variance
            load_lr_scheduler_states: Optional. Boolean to add the learning rate scheduler states from Checkpoint.
            load_module_only: Optional. Boolean to load only the model weights from the checkpoint. Ex. warmstarting.
            custom_load_fn: Optional. Custom model load function.

        Returns:
            A tuple of ``load_path`` and ``client_state``.
            *``load_path``: Path of the loaded checkpoint. ``None`` if loading the checkpoint failed.
            *``client_state``: State dictionary used for loading required training states in the client code.

        Important: under ZeRO3, one cannot load checkpoint with ``engine.load_checkpoint()`` right
        after ``engine.save_checkpoint()``. It is because ``engine.module`` is partitioned, and
        ``load_checkpoint()`` wants a pristine model. If insisting to do so, please reinitialize engine
        before ``load_checkpoint()``.

        Nlatest_universallatestrz"Invalid for universal checkpoint: z does not existzUnable to find latest file at z, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint.NN)load_module_strictload_optimizer_statesload_lr_scheduler_statesload_module_onlyrf  r  Fr   copytree
disk_usageoffloaded_tensorsz%Copying NVMe offload checkpoint from  to r      eA,.2f  GB free on target filesystem...T)dirs_exist_okCopying complete!  GB free on target filesystem)rQ  r  r  r  isfileopenreadstripr  rQ   r  rr  r   rp  _load_checkpointr!  r#  _load_zero_checkpoint_restore_from_bit16_weightsr3  shutilr  r  optimizer_swapperswap_folderr0  reset_swap_buffersru  rt  r)  update_lp_params)r   load_dirrO  r  r  r  r  rf  
latest_taglatest_pathfd	load_pathclient_statesload_zero_checkpointsuccessr  r  offload_diroffload_ckpt_dirr`  freer   r   r   load_checkpointp  s`    







zDeepSpeedEngine.load_checkpointc              
      sZ  ddl m} | ||}	|j|	| jd}
t| jt}| jd u r!dn| j	 }|
j
| j||d\}}}|d u r8dS d}|  rH|sHt||d< d}|rRtj||| _| jrsd}t|d	 ts`d}tj|||d || j| j| j| jd
 |  s| j||||d |d | _d }|rdg | jd urt| jdr| j  n|  p|  }|r| jd ur|s| jrt  }t!|}| "|||}| jj
|t#$dd}n|}| % s|  r| jj&|d |d n|}| j&|d  |r| j'd ur| j'&|d  | ( r| j)d urd|v r| j)&|d  | j*d ur,| + r,d|v r,| j*j,&|d  dd }d|v r:|d }nd|v rD|d }nd }|d urb|rR|| _-n|| j-|t.| j/ |d | _-|d | _0|1d| j0| 2  | _3|d | _4|d | _5g d i }|r 6d |r 6d  fdd|7 D }|d ur|d |d< ||fS ) Nr   )SDLoaderFactory)r   )is_pipe_parallelr  Fr  Tr   )r=  rP  rO  r   r   r   )re  r_  rf  rg  r  refresh_fp32_paramsr  rB  r   r  r  
random_ltdr  c                 S   sL   t  }| D ]}||v r||vrq|| q|D ]}||v r#|| q|S r   )r'  r/  )original_set
loaded_setoriginal_parametersloaded_parametersr  r   r   r   r   get_sparse_tensor_module_names   s   
zHDeepSpeedEngine._load_checkpoint.<locals>.get_sparse_tensor_module_namesr(  csr_tensor_module_namesr   r   r   r  )r  r(  r   r   r  r  r  r  c                    s   i | ]\}}| vr||qS r   r   )r   r  rV  deepspeed_statesr   r   r   K  s    z4DeepSpeedEngine._load_checkpoint.<locals>.<dictcomp>)8$deepspeed.runtime.state_dict_factoryr  r  get_sd_loaderr   r   r  r~   r   rw  rH  r  r)  r}   r  r  r  _curr_ckpt_pathr   r  r   rY  r   rQ  rj  r   r   r%  r  r!  r#  rP   rF  rM  r  r*  rk  rI  rd  r  r=  rA  r  r  r  r(  r  r  r   r  r  r   r   r   r   r  )r   r  rO  r  r  r  r  rf  r  	ckpt_list	sd_loaderr  rs  r  re  r`  rg  rP  optim_checkpointhas_zero_optimizer_statelargest_group_namerQ  optim_load_pathr  r(  client_stater   r  r   r    s   	


 











z DeepSpeedEngine._load_checkpointc                 C   s$  d }| j jjr,|  tjksJ dtd| j	}t
 dkr,t
j|t
 d d |  r;d }tj|| }n#|rP| j| jkrPtd| j d| j dd }| ||}|d u r^dS |  }| jj|||  |||d	 |  rtd
| d| j  dS tdt| d| j  dS )Nz3Only stage3 support for pipeline checkpoint loadingr   r   )rs  ri  z4The checkpoint being loaded used a DP world size of z but the current world size is zo. Automatic adjustment of ZeRO's optimizer state partitioning with a new world size is not currently supported.F)state_dict_listr  r.  checkpoint_folderload_serialparam_shapesz'loaded universal zero checkpoints from z
 for rank zloading z% zero partition checkpoints for rank T)r   r  pipeline_loading_checkpointr  r?   r(  r*  r  r  rk  r   get_local_rankrecvr  rQ  r  r  r  r  r   r   _get_all_zero_checkpoints_get_zero_param_shapesr   rd  r/  rQ   r0  rD  ro  )r   r  rO  r  r  zero_sd_listr  r  r   r   r   r  R  sH   

z%DeepSpeedEngine._load_zero_checkpointc           	      C   s4   g }t |D ]}| j|||||d}|| q|S )N)rr  rO  rs  r  rm  )r  rv  r   )	r   r  rO  rs  r  rm  zero_ckpt_namesr  r  r   r   r   "_get_mp_rank_zero_checkpoint_namesz  s   z2DeepSpeedEngine._get_mp_rank_zero_checkpoint_namesc           	      C   sz   | j d u rdn| j  }| j|||| j|d}t|D ]\}}tj|s:d|v r:|dd}tj|r:|||< qq|S )Nr   )r  rO  rs  r  rm  zoptim_states.ptrq  )	r   rw  r  r   r   r  r  existsrJ  )	r   r  rO  rm  rs  r  r   r  ckpt_name_tryr   r   r   _get_all_zero_checkpoint_names  s    z.DeepSpeedEngine._get_all_zero_checkpoint_namesc                 C   s   g }t |D ]/\}}d }|d u rtd i}n|  s#tj| jjd|kr,| jj|dd}ntd i}|	| qdd |D }t
dt| d| j  |S )Nrm  r  rB  c                 S   s   g | ]}|t  qS r   )rL   )r   r?  r   r   r   r    r  zHDeepSpeedEngine._get_all_zero_checkpoint_state_dicts.<locals>.<listcomp>zsuccessfully read z ZeRO state_dicts for rank )r   rL   r1  r   r  r   r  r   rH  r   rQ   r0  ro  rD  )r   r  r  r   r  _statezero_optimizer_sdr   r   r   $_get_all_zero_checkpoint_state_dicts  s   
z4DeepSpeedEngine._get_all_zero_checkpoint_state_dictsc                 C   s   |   |    fD ]4}| |||}|d ur=||   ur6|r tnt}|   r(tnt}td| d| d | |  S q	d S )NzLoading z zero checkpoints into z training engine)r#  r  r9   r:   rQ   r  r  )r   r  rO  rm  r  checkpoint_bit16engine_bit16r   r   r   r    s   z)DeepSpeedEngine._get_all_zero_checkpointsc                 C   s   |   r_t| }t| g | j	}|
 }|
 }tj|tjjd tj|tjjd t||ko>t||k}dt  d| d}|  rV|sTJ |d S |sat| d S d S d S )N)opz[rank=z] The checkpoint tag name 'z' is not consistent across all ranks. Including rank unique information in checkpoint tag could cause issues when restoring with different world sizes.)r  hashlibsha1encoder*  
ByteTensordigestrG  r  rk  cloner   ru  ReduceOpMAXMINr  r  r  rQ   r  )r   rO  s_hashbhash	max_bhash	min_bhashvalidr  r   r   r   _checkpoint_tag_validation  s   z*DeepSpeedEngine._checkpoint_tag_validationc              	   C   s  |   r	| j  |  r| jn| j}|dkr| jj|dd t	  |du r-d| j
 }t|}| j| | | | jrRd| _| ||d | j||||d | jse| ||d | j||||d | jrt| || | || |  rddlm}m} | jjj}	tj||d	}
||\}}}t d
|	 d|
 d|d dd ||	|
dd dd ||\}}}t d|d dd | ! r| j"  | j#| |r|dkrt$tj|dd}|%| W d   n1 sw   Y  t	  dS )a  Save training checkpoint

        Arguments:
            save_dir: Required. Directory for saving the checkpoint
            tag: Optional. Checkpoint tag used as a unique identifier for the checkpoint, global step is
                used if not provided. Tag name must be the same across all ranks.
            client_state: Optional. State dictionary used for saving required training states in the client code.
            save_latest: Optional. Save a file 'latest' pointing to the latest saved checkpoint.
            exclude_frozen_parameters: Optional. Exclude frozen parameters from checkpointed state.
        Important: all processes must call this method and not just the process with rank 0. It is
        because each process needs to save its master weights and scheduler+optimizer states. This
        method will hang waiting to synchronize with other processes if it's called just for the
        process with rank 0.

        r   Texist_okNr  F)r  r>  r  r  z Copying NVMe offload files from r  r  r  r  r  c                 S   s   t tdd |S )Nc                 S   s   d| v S )Ngradientr   )r  r   r   r   <lambda>  s    zCDeepSpeedEngine.save_checkpoint.<locals>.<lambda>.<locals>.<lambda>)r  filter)r`  dir_listr   r   r   r    s    z1DeepSpeedEngine.save_checkpoint.<locals>.<lambda>)ignorer  r  r  r  w)&rr  r   rp  rP  r~  rD  r   makedirsr   barrierr   r  creater  r   r3  _create_checkpoint_file_save_moe_checkpoint_save_checkpointr4  _create_zero_checkpoint_files_save_zero_checkpointr3  r  r  r  r  r  r  r  r  rQ   r0  ru  rt  commitr  write)r   save_dirrO  r  save_latestr>  r  r  r  r  r  r`  r  r  r   r   r   save_checkpoint  sj   

	

zDeepSpeedEngine.save_checkpointc                 C   s0   t | D ]}d|v rd|vr|| q|S )z>
            Get the state dict of the non-moe layers
        expertmoe.gate.wg.weight)r  r  r  )r   full_state_dictr  r   r   r   _get_non_moe_state_dict%  s
   
z'DeepSpeedEngine._get_non_moe_state_dictc                 C   s  |  ||}d}| j D ]\}}t|tr|j}	|j}
t|	}t	|	}|dkr/|d7 }qi }|
  D ]\}}d|v rKd|vrK|||d | < q7d}tt}t| D ]F}td| d|}d }|srtd	| d n|d}||
 t| }|| | | | }||  }||t| |< qX| D ]\}}| ||||| j}|  rt|}| j || q|d7 }qt!j"#||| _$t% }t|}t	|}|dkrd
| j&r| ' s| j&
 nd i}| (|||}| j || t) dkr`| *t+j,| |d}|| j-d ur| j-
 nd | j.d ur-| / r-| j.j0
 nd |  r8| j1
 nd | j2| j3| j4| j5| j6| j7| j8d}|9| t:d|  | j || d S d S )Nr   r   r
  r  r  rD  z.*z
([0-9]+).*zNo expert found in key r   r>  )r  r  r  r  r(  r   r   r   r  r  r   Saving model checkpoint: );r  r  r)  r   r   rK  rL  rP   rM  rE  r=  r  r   r  r  r  rematchrQ   r  rj  r  rJ  r  r  r  r  rI  r   r=  rz   r   saver  r  r  r  rF  r   r!  r  _get_data_parallel_rankr  r   r@  r  r  r  r  rA  r(  r   r   r   r  r  r   rp  r0  )r   r  rO  r  r>  	save_pathrW  rX  r  r  rL  rQ  exp_dp_rankmoe_state_dictr  rb  rU  experts_state_dictr  mrR  rS  
expert_key	truncatedrT  moe_save_pathr  optimizer_state	file_pathmodel_state_dictstater   r   r   r  /  s   







z$DeepSpeedEngine._save_moe_checkpointc              	   C   s`   |r| j n| j}z|||}tj|}| jj|dd W dS    td| d|  Y dS )NTr  z"Failed saving model checkpoint to z
 with tag F)	ry  r  r  r  dirnamer   r  rQ   r|  )r   r  rO  zero_checkpointname_functioncheckpoint_namer  r   r   r   r     s   
z'DeepSpeedEngine._create_checkpoint_filec                 C   sH   d}t t| jjD ]}|| jkr| ||d}tj| jjd q|S )NTrm  )r  r   r  r   r  rD  r   r  )r   r  rO  r  r  r   r   r   r    s   
z-DeepSpeedEngine._create_zero_checkpoint_filesc           
      C   s  |  ||}|  p|  }|  o| }tj||| _| j|d}d | _t	di d|d| 
 d| jr<|s<| j nd d| jrH|rH|  nd d|rS| | jnd d| jr_|r_|  nd d|rj| | jnd d	| jd urw| j nd d
| jd ur|  r| jj nd d|  r| j nd d| jd| jd| jd| jd| jd| jd| jdt}	|	 | | j!rt"d| ddgd | j#$|	| d S d S )Nr  r  buffer_namesr   r  frozen_param_shapesshared_paramsfrozen_param_fragmentsr  r  r  r(  r   r   r   r  r  r  
ds_versionr  r   r   )r  r   r   )%r  r!  r#  r&  r  r  r  r  r@  r  _get_buffer_namesr   r=  r  !_get_zero_frozen_param_attributes_get_param_shape_func_get_shared_params_get_param_fragment_funcr  r  r  r  r=  rA  r(  r   r   r   r  r  r   r   rp  r3  rR   r   r  )
r   r  rO  r  r>  r  zero_optimizer_statesave_frozen_paramr  r  r   r   r   r    sx   	
z DeepSpeedEngine._save_checkpointc                    s&   g  d fdd	| j dd  S )NrN  c                    sf   | j ddD ]\}}|d ur|| jvr ||  q|  D ]\}}|d ur0||| d  qd S )NFrecurser  )named_buffers_non_persistent_buffers_setr   named_children)r  r;  r   r  childr$  get_layer_named_buffersr   r   r7    s   zBDeepSpeedEngine._get_buffer_names.<locals>.get_layer_named_buffersr;  rN  )r  r}  r   r6  r   r)    s   	z!DeepSpeedEngine._get_buffer_namesc                 C   s   t |dr|jS |jS Nr  )r%  ds_shapeshaper   r   r   r   r   r+    rs  z%DeepSpeedEngine._get_param_shape_funcc                 C   s$   t |dr|j  S |  S r:  )r%  r  r  r  r=  r   r   r   r-    s   $z(DeepSpeedEngine._get_param_fragment_funcc                 C   sR   t  }| j D ]}|jrq|| jvrtd| d| j| }||||< q|S )Nr`  ra  )r   r  r  r  r  r  )r   	attr_funcr'  r   r   r   r   r   r*    s   

z1DeepSpeedEngine._get_zero_frozen_param_attributesc           
      C   s   g }d}d}t | jdr| jj}n|  r t | jdr | jj}n|  dkr*| jjn| jj}|D ]>}t }|D ]1}|d7 }|t |drF|j	n|
 7 }t |drS|jn|j}|| jvr_td| j| }	|||	< q7|| q0|S )	a  Returns a dict of name to shape mapping, only for the flattened fp32 weights saved by the
        optimizer. the names are exactly as in state_dict. The order is absolutely important, since
        the saved data is just flattened data with no identifiers and requires reconstruction in the
        same order it was saved.
        We can't rely on self.module.named_parameters() to get the saved tensors, as some params
        will be missing and others unsaved and then it'd be impossible to reconstruct state_dict
        from the flattened weights.
        optimizer.bit16_groups seems to be the easiest to use as it's in all zeroX versions.
        r   round_robin_bit16_groupsbf16_groupsr   r   r  r;  z.failed to find optimizer param in named params)r%  r   r?  r#  r@  r  ri  rj  r   r  r]  r;  r<  r  r  r   )
r   param_group_shapescntr]  ri  bit16_groupr  r   r<  r   r   r   r   r    s0   





z&DeepSpeedEngine._get_zero_param_shapesc                    sZ   i i |   otdd | j D d	 fdd	 t dkr+ | jdd S )
a0  
        Returns a dict of shared params, which can later be used to reconstruct the original state dict,
        e.g. in `zero_to_fp32`. Each dict entry is a pair of param names, where the key is the name
        of the variable that isn't stored and the value is the actual param holding data.
        c                 s   s    | ]}t |d V  qdS )r  Nr  r  r   r   r   	<genexpr><  s    z5DeepSpeedEngine._get_shared_params.<locals>.<genexpr>rN  c                    s   | j ddD ],\}}|d u srt|dsq|| }r|jn| }|v r.| |< q||< q|  D ]\}}|d urH ||| d  q7d S )NFr0  r  r  )r  r%  r  data_ptrr4  )r  r;  r   r   r  r  r5  get_layer_state_dictis_zero3_modelshared_indexshared_params_by_full_namer   r   rG  >  s   
z@DeepSpeedEngine._get_shared_params.<locals>.get_layer_state_dictr   r8  Nr9  )r)  r  r  r  r   r  r}  r   rF  r   r,  2  s   z"DeepSpeedEngine._get_shared_paramsc                 C   sN   t jt jt}d}t j|d|}t j||}t|| | | d S )Nzzero_to_fp32.pyr<  )r  r  r   __file__r  r   #_change_recovery_script_permissions)r   r  base_dirscriptri  r^  r   r   r   _copy_recovery_scriptZ  s   
z%DeepSpeedEngine._copy_recovery_scriptc              
   C   sd   zt |t |jtjB  W d S  ttfy1 } ztd| d| d W Y d }~d S d }~ww )Nz*Warning: Could not change permissions for z due to error: z*. Continuing without changing permissions.)	r  chmodstatst_modeS_IEXECFileNotFoundErrorPermissionErrorrQ   r0  )r   r^  er   r   r   rL  c  s    z3DeepSpeedEngine._change_recovery_script_permissionsc                 C   sl   |  ||}t| j | jtd}| j|| | jdkr"| 	| | 
 r(dnd}t| d|  d S )N)optimizer_state_dictr  r(  r   rb  	bf16_zeroz checkpoint saved )ry  r  r   r=  r   r   r   r  rD  rO  r!  rQ   r0  )r   r  rO  zero_checkpoint_namezero_sd	ckpt_typer   r   r   r  m  s   

z%DeepSpeedEngine._save_zero_checkpointc                    sB   t  dkr	t ndd fdd	  | jdd t   S )a  
        Get a full non-partitioned state_dict with fp16 weights on cpu.
        Important: this function must be called on all ranks and not just rank 0.
        This is similar to nn.Module.state_dict (modelled after _save_to_state_dict)
        This method is used for tensor parallel training.

        Returns:
        OrderedDict: The consolidated state dictionary if the current process rank is 0, otherwise None.
        r   NrN  c                    s   t t| jdd| dd* | jddD ]\}}|d u rq|| }t dkr/|  |< qW d    n1 s:w   Y  |  D ]\}}|d urT ||| d  qCd S )NFr0  T)r  r   r  )	r"   r  r  r  r   r  r  r  r4  )r  r;  r   r   r  r5  rG  r=  r   r   rG    s   	zUDeepSpeedEngine._replace_module_consolidated_state_dict.<locals>.get_layer_state_dictr8  r9  )r   r  r   r  r   synchronizer}  r   r\  r   '_replace_module_consolidated_state_dictw  s
   
z7DeepSpeedEngine._replace_module_consolidated_state_dictc                 C   s4   |   tjkr| |S |  dkr|  S td)z:
        Consolidate the 16-bit state dictionary.
        r   zconsolidated_16bit_state_dict is only applicable to cases where weights are partitioned, including Zero Stage 3 and tensor parallelism.)r  r?   r(  $_zero3_consolidated_16bit_state_dictr   r^  r  r   r>  r   r   r   _consolidated_16bit_state_dict  s
   
z.DeepSpeedEngine._consolidated_16bit_state_dictc                    s   |   stdt dkrt ndi d fdd	|  r(| j  tddd	 | j	dd
 tddd	 | 
 rD| j  S )a  
        Get a full non-partitioned state_dict with fp16 weights on cpu.
        Important: this function must be called on all ranks and not just rank 0.
        This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but:
        1. consolidates the weights from different partitions on gpu0
        2. works on one layer at a time to require as little gpu0 memory as possible, by
        moving the already consolidated weights to cpu
        3. takes care to keep the shared params shared when gradually copying the params to cpu
        Returns:
            a consolidated fp16 ``state_dict`` on cpu on rank 0, ``None`` on other ranks
        z"this function requires ZeRO-3 moder   NrN  c                    s  t jjt| jdddda t dkrh| jddD ].\}}|d u s( r)|js)q|| }|j	v r<|j	  |< q|
  |< ||j	< q| jddD ]\}}|d urg|| jvrg|
  || < qPW d    n1 srw   Y  |  D ]\}}|d ur||| d  q{d S )NFr0  r   r\  r  )r  rb  rc  r  r  r   r  r  r  r  r  r  r2  r3  r4  )r  r;  r   r   r  r  r5  r>  rG  r&  r=  r   r   rG    s(   
zRDeepSpeedEngine._zero3_consolidated_16bit_state_dict.<locals>.get_layer_state_dictzbefore get_layer_state_dictFr   r8  zafter get_layer_state_dictr9  )r)  r  r   r  r   rr  r   rp  r   r  ru  rt  r`  r   rb  r   r_    s   "

z4DeepSpeedEngine._zero3_consolidated_16bit_state_dictpytorch_model.binc                 C   s   |  ||S )z]has been renamed to save_16bit_model, keeping this around for backwards
        compatibility)save_16bit_model)r   r  save_filenamer   r   r   save_fp16_model  s   zDeepSpeedEngine.save_fp16_modelc                 C   s   t j||}|  r!|  r| j|d}ntd| d dS | j|d}d| j	 }t
|}| j| t dkrW| jj|dd td	| d
|  | j|| | j| dS )ak  
        Save 16bit model weights

        This method saves the 16bit model weights at the desired destination.

        Arguments:
            save_dir: Required. Directory for saving the model
            save_filename: Optional. Filename to save to. Defaults to ``pytorch_model.bin``
            exclude_frozen_parameters: Optional. Exclude frozen parameters from checkpointed state.

        Returns:
            ``True`` when a model has been saved, ``False`` otherwise. It will not be saved if
            stage3_gather_16bit_weights_on_model_save is ``False``.

        Important: all processes must call this method and not just the process with rank 0. It is
        because the processes need to work in sync to gather the weights. This method will hang
        waiting to synchronize with other processes if it's called just for the process with rank 0.

        r  zDid not save the model z; because stage3_gather_16bit_weights_on_model_save is FalseFr  r   Tr  zSaving model weights to z, tag: )r  r  r  r)  rA  r_  rQ   r0  r@  r   r  r   r  r   r  r  r  r  )r   r  re  r>  r  r=  rO  r   r   r   rd    s(   
z DeepSpeedEngine.save_16bit_modelc                 C   s0   t | jdr| j  t  t   dS dS )zL
        Release GPU memory consumed by offloaded model parameters.
        empty_partition_cacheN)r%  r   rg  gccollectr   empty_cacher}  r   r   r   rg    s
   
z%DeepSpeedEngine.empty_partition_cachec                    s  dt jj_t std| jrdS d|v rtd t	d| 
  d|  | 
 r|  tjks<|  tjks<J dt| jtrFJ d	|durWd
d   fdd|D }|dv scJ d| d| jj}d| jv rd| jd v rd| jd v r| jjjjdkr| jjjjdkrd|_|  tjkrt| ||||}n|  tjkrt| ||||}| jjdi i |d|i d| _dS )zCompile the module using the specified backend and kwargs.
        If a compiler_fn is set, it will be used instead of torch.compile().
        Fz4compile is not supported in your version of PyTorch.NbackendzYThe `backend` in `compile_kwargs` will be overridden. Use the `backend` argument instead.zCompiling deepcompile=z	 backend=z1Currently DeepCompile supports stage 1 or 3 only.z<Currently DeepCompile is not supported without an optimizer.c                 S   s6   | D ]}t |s|tv sJ d| qdd | D S )NzUnknown pass c                 S   s    g | ]}t |r
|nt| qS r   callabler   r[  r   r   r   r  C  r  zFDeepSpeedEngine.compile.<locals>.passes_name_to_fn.<locals>.<listcomp>rl  )passesrb  r   r   r   passes_name_to_fn@  s   z2DeepSpeedEngine.compile.<locals>.passes_name_to_fnc                    s   g | ]
\}}| |fqS r   r   )r   r  rn  ro  r   r   r  E  s    z+DeepSpeedEngine.compile.<locals>.<listcomp>)inductoreagerzBackend z" is not supported for DeepCompile.r!  r  r  r  Tr   )r  r<  nvtxenable_nvtxr   r  is_compiledrQ   r  rE  r  r  r?   r  r(  r   r   r   r   compile_configr   r  r  rk  r  offload_parametersr   r   r  compilerI  )r   rk  compile_kwargsschedulerv  r   rp  r   rx  $  sD   


zDeepSpeedEngine.compilec                 C   s   ddl m} |S )Nr   )opt_pass_times)deepspeed.compile.backendr{  )r   r{  r   r   r   get_compile_timeY  r6  z DeepSpeedEngine.get_compile_time	pass_namepass_fnc                 C   s   t || d S r   )r   )r   r~  r  r   r   r   r   ]  r'  z%DeepSpeedEngine.register_compile_passc                 C   r  r   )r   rv  deepcompiler}  r   r   r   r  `  r  z&DeepSpeedEngine.is_deepcompile_enabledc                 C   r  r   )rI  r}  r   r   r   ru  c  s   zDeepSpeedEngine.is_compiledincluderk  rD  non_blockingc                 C   s   |   tjksJ d|  }|du s|jtjksJ d|  }|du s/|jtjks/J dt| j	t
r9J d|tjkrEtd dS |tjkrNtd| j	j||||d dS )	a  Offload the engine's states to the specified device.

        Arguments:
            include: Optional. The set of states to offload. If not provided, all states are offloaded.
            device: Optional. The device to move the ZeRO optimizer buffers to. Currently only `OffloadDeviceEnum.cpu` is supported.
            pin_memory: Optional. Whether to pin the memory of the offloaded states.
            non_blocking: Optional. Whether to offload the states asynchronously.
        zAMoving buffers across devices is supported only for ZeRO stage 3.NzMMoving states across devices is not supported for offloaded optimizer states.zGMoving states across devices is not supported for offloaded parameters.CMoving states across devices is not supported without an optimizer.z*No device specified for offloading states.z4NVMe offload is not supported for offloading states.)r  rk  rD  r  )r  r?   r(  r  rk  r   noner  r   r   r   rQ   r  r  r  offload_states)r   r  rk  rD  r  opt_offload_configparam_offload_configr   r   r   r  g  s*   


zDeepSpeedEngine.offload_statesc                 C   s<   |   tjksJ dt| jtrJ d| jj|d dS )zReload the engine states to the original device.

        Arguments:
            non_blocking: Optional. Whether to offload the states asynchronously.
        z7Moving buffers back is supported only for ZeRO stage 3.r  )r  N)r  r?   r(  r   r   r   reload_states)r   r  r   r   r   r    s   zDeepSpeedEngine.reload_states)NNNNNNNNNNF)Tr   )FT)r   N)Nr   )NrN  FF)TNF)NTTTFN)TTTFN)F)rc  )rc  F)r  N(  r   r   r   r   r   r2  r   re  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r8  r  r  r  r6  r  r  r  r  r  r  r  r:  r  r  r  r  r  r  r  r=  r>  r  r  rB  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r.  r  r?  r  r  r  r  r  r  r!  r	  r
  r  r  r  r  r  r  r  r  r  r  r  r!  r#  r&  r)  r+  r-  r/  r1  r3  r5  r7  r9  r;  r=  r?  rA  rC  rE  rG  r   r   rH  rI  r#  rJ  rK  rL  rM  rO  r  rP  rQ  propertyrt  setterrX  rY  r  r[  r]  r^  r_  r`  ra  rb  rd  rF  re  rg  rh  ri  rj  rk  rn  rr  ru  r  r5  rv  r   r   r   r  r  r   r  staticmethodr   r*  r   r  r  r  r  r  r  r  r  r@  r  r  r$  r"  r'  r7  r9  r;  r;  r>  r?  boolr@  r4   r  rP  rS  rZ  r  r	  r[  r^  rS   r  ry  r  MEMORY_OPT_ALLREDUCE_SIZEr  r  r  r  r   r  r  r  r  r  r  r  r  rv  r}  r  r  r  r  r  r  r  r   r  r  r   r  r  r  r  r  r  r  r"  r-  r@  r|   rY  rj  rn  rv  ry  r  r  rI  r  r  r  r  r  r  r  r  r  r	  r  r  r   r  r  r)  r+  r-  r*  r  r,  rO  rL  r  r^  ra  r_  rf  rd  rg  r   get_compile_backendrx  r}  r  r   r   r  ru  r   r  r   r   r  r  __classcell__r   r   rT  r   r      s    K%@

"D72b 3 !<>	Bm	7	] 	([
o+*(	

"@1$	5$
("r   )r  r  rQ  r*  r  collectionsr   r   r   r  r   rh  torch.nn.modulesr   torch.nn.parameterr   torch.optimr   torch.optim.lr_schedulerr	   torch._utilsr
   r   
contextlibr   typingr   r   r   r   r   r  r   r   deepspeed.runtime.utilsr   r   zero.offload_configr   r   $deepspeed.runtime.zero.stage_1_and_2r   +deepspeed.runtime.zero.partition_parametersr   deepspeed.runtime.zero.utilsr   r   (deepspeed.runtime.zero.parameter_offloadr   deepspeed.runtime.zero.configr   &deepspeed.runtime.fp16.fused_optimizerr   (deepspeed.runtime.fp16.unfused_optimizerr    deepspeed.runtime.bf16_optimizerr    !deepspeed.linear.optimized_linearr!   deepspeed.module_inject.layersr"   r#   deepspeed.runtime.configr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   deepspeed.runtime.dataloaderr3   deepspeed.runtime.constantsr4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   deepspeed.compressionr@   deepspeed.compression.constantsrA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   deepspeed.checkpoint.constantsrL   rM   deepspeed.runtime.sparse_tensorrN   deepspeed.runtimerO   deepspeed.utilsrP   rQ   rR   rS   deepspeed.utils.timerrT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   deepspeed.utils.debugra   rb   deepspeed.monitor.monitorrc   (deepspeed.runtime.progressive_layer_droprd   re   rf   deepspeed.runtime.eigenvaluerg   )deepspeed.runtime.data_pipeline.constantsrh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   4deepspeed.runtime.data_pipeline.curriculum_schedulerrw   6deepspeed.runtime.data_pipeline.data_routing.schedulerry   3deepspeed.runtime.data_pipeline.data_routing.helperrz   8deepspeed.runtime.data_pipeline.data_routing.basic_layerr{   ;deepspeed.runtime.checkpoint_engine.torch_checkpoint_enginer|   deepspeed.utils.zero_to_fp32r}   pipe.moduler~   r<  r   compilerr   ops.adamr   moe.sharded_moer   r   	moe.layerr   	moe.utilsr   r   git_version_infor   +deepspeed.profiling.flops_profiler.profilerr   deepspeed.utils.loggingr   r   deepspeed.acceleratorr   r   deepspeed.compile.utilr   r   r   r|  r   r   deepspeed.compile.passesr   r   r   r   deepspeed.compile.init_z1r   deepspeed.compile.init_z3r   r  r  DeepSpeedOptimizerCallableDeepSpeedSchedulerCallabler  r   r  r{  r   objectr   r   r   r   r   r   <module>   s   D44<D #