o
    Ti                     @   s  d Z ddlZddlZddlZddlmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ dadadadada da!g a"g a#g a$g a%da&da'da(da)da*da+dZ,dBddZ-dCddZ.G dd dZ/e/ Z0dd Z1dd Z2dd Z3dd Z4dd Z5dBd d!Z6d"d# Z7d$d% Z8d&d' Z9d(d) Z:d*d+ Z;d,d- Z<G d.d/ d/ej=j>Z?d0d1 Z@ejAd2d3 ZBd4d5 ZCd6d7 ZDd8d9 ZEdBd:d;ZFd<d= ZG							dDd>d?ZHd@dA ZIdS )EaV  
Use to partition the activations stored for backward propagation
Therefore reduces the memory consumption
Also implements CPU checkpointing and contiguous memory checkpointing
Reduces memory consumption and memory fragmentation

Code for rng checkpointing taken from NVIDIA Megatron-LM mpu/random.py
b886b7bb972afe72bac0f5de4f42a4a7bae8ebef
    N)comm)_C)DeepSpeedConfig)logger)copy_to_devicemove_to_devicesee_memory_usage)SynchronizedWallClockTimerFORWARD_GLOBAL_TIMER)bwc_tensor_model_parallel_rank)get_accelerator)compilerF   zmodel-parallel-rngc                 C   s   t | tr8g }| D ]*}t |tjs|| q	|j}|d ur%|j|d}n|}| }||_|| q	t|S tdt	| j
)Ndevicez@Only tuple of tensors is supported. Got Unsupported input type: )
isinstancetupletorchTensorappendrequires_gradtodetachRuntimeErrortype__name__)inputsr   outinpr   x r    l/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.pydetach_variableD   s   

r"   c                    s   t tdrttjr fdd}n- dkrtt   nt t	r*t  nt t
r8tt     fdd}t | dS )aU  Sets the random number generator state of the current GPU.

    Arguments:
        new_state (torch.ByteTensor): The desired state
    This function is adapted from PyTorch repo (torch.cuda.set_rng_state) #ignore-cuda
    with a single change: the input state is not cloned. Cloning caused
    major performance issues for +4 GPU cases.
    _cuda_setRNGStatec                      s<   t    t W d    d S 1 sw   Y  d S N)r   r   r   r$   r    r   	new_stater    r!   cbf   s   "z_set_cuda_rng_state.<locals>.cbr#   c                     s2    j } | d u rt  } t | }| d S r%   )indexr   current_devicedefault_generator	set_state)idxr+   r&   r    r!   r(   r   s
   
N)hasattrr   callabler$   r   r   r   device_namer   strint	lazy_call)r'   r   r(   r    r&   r!   _set_cuda_rng_state[   s   	

r4   c                   @   sJ   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Ze	j
efddZdS )CudaRNGStatesTrackera  Tracker for the cuda RNG states.

    Using the `add` method, a cuda rng state is initialized based on
    the input `seed` and is assigned to `name`. Later, by forking the
    rng state, we can perform operations and return to our starting
    cuda state.
    c                 C   s   i | _ t | _d S r%   states_setseeds_selfr    r    r!   __init__   s   zCudaRNGStatesTracker.__init__c                 C   s   i | _ t | _dS )z&Set to the initial state (no tracker).Nr6   r:   r    r    r!   reset   s   zCudaRNGStatesTracker.resetc                 C   s   t  | jS )z{Get rng states. Copy the dictionary so we have direct
        pointers to the states, not just a pointer to the dictionary.)copyr7   r:   r    r    r!   
get_states   s   zCudaRNGStatesTracker.get_statesc                 C   s
   || _ dS )zhSet the rng states. For efficiency purposes, we do not check
        the size of seed for compatibility.N)r7   )r;   statesr    r    r!   
set_states   s   
zCudaRNGStatesTracker.set_statesc                 C   sn   || j v rtd|| j | || jv rtd|t  }t | t  | j|< t| dS )zTrack the rng state.zseed {} already existsz cuda rng state {} already existsN)	r9   	Exceptionformataddr7   r   get_rng_statemanual_seedr4   )r;   nameseedorig_rng_stater    r    r!   rD      s   


zCudaRNGStatesTracker.addc              	   c   sr    || j vrtd|t  }t| j |  zdV  W t  | j |< t| dS t  | j |< t| w )zVFork the cuda rng state, perform operations, and exit with
        the original state.zcuda rng state {} is not addedN)r7   rB   rC   r   rE   r4   )r;   rG   orig_cuda_rng_stater    r    r!   fork   s   


zCudaRNGStatesTracker.forkN)r   
__module____qualname____doc__r<   r=   r?   rA   rD   
contextlibcontextmanager _MODEL_PARALLEL_RNG_TRACKER_NAMErK   r    r    r    r!   r5   |   s    r5   c                   C      t S )zGet cuda rng tracker.)_CUDA_RNG_STATE_TRACKERr    r    r    r!   get_cuda_rng_tracker   s   rT   c              	   C   sl   t t}| d }|| }| }t dkr$tdt |t || t	  t
 | tt| dS )ap  Initialize model parallel cuda seed.

    This function should be called after the model parallel is
    initialized. Also, no get_accelerator().manual_seed should be called
    after this function. Basically, this is replacement for that
    function.
    Two set of RNG states are tracked:
        default state: This is for data parallelism and is the same among a
                       set of model parallel GPUs but different across
                       different model parallel groups. This is used for
                       example for dropout in the non-model-parallel regions.
        model-parallel state: This state is different among a set of model
                              parallel GPUs, but the same across data parallel
                              groups. This is used for example for dropout in
                              model parallel regions.
    
  r   z> initializing model parallel cuda seeds on global rank {}, model parallel rank {}, and data parallel rank {} with model parallel seed: {} and data parallel seed: {}N)r   mpudistget_rankr   inforC   get_data_parallel_rankrS   r=   r   rF   rD   rQ   )rH   tp_rankoffsetmodel_parallel_seeddata_parallel_seedr    r    r!   model_parallel_cuda_manual_seed   s   
r_   c                 C   sN   t t}| d | }t  t | W d    d S 1 s w   Y  d S )NrU   )r   rV   rS   rK   r   rF   )rH   r[   r]   r    r    r!   "model_parallel_reconfigure_tp_seed   s
   
"r`   c                 C   s    |   }|t }|t }t|S r%   )numelmp_sizemp_rankr2   )itemsizepartition_sizestartr    r    r!   get_partition_start   s   rh   c                 C   s,   |   }|t dksJ d|t }t|S )Nr   zJDoesn't handle if partition activation if item is not divisible by mp size)ra   rb   r2   )rd   re   rf   r    r    r!   get_partition_size  s   ri   c                 C   sP  t | d dksJ dt |  g }tt | d }t|D ]}| d|  }| d| d  }t|s9|| qtd u sAtdkrY|t|	 }|d urS|
|}|| q| }|t }|d urptj|g|j|d}	ntj|g|j|jd}	|	d|t |}
|
| tj|	|
td |	t|	 }|j|_|| qt|S )N   r   z,Expected even count of tensors, instead got r   dtyper   )group)lenr2   rangeis_activation_to_checkpointr   mp_grouprb   viewlistnumpyr   ra   r   zerosrl   r   narrowrc   copy_rW   all_gather_into_tensordatar   )tensorsr   r   num_argsird   re   rf   tensor_sizeflat_tensorpartinput_tensorr    r    r!   gather_partitioned_activations
  s6   "



r   c                 C   sV   dd | D }dd | D }dd | D }t | tu r&t|t|t|fS |||fS )a  
    Separate objects in list/tuple into tensors and non-tensors and create a mapping to enable re-aggregation.
    The order of tensors and non-tensors is preserved in their respective output groups.

    Parameters:
        all_objects (list/tuple): Objects containing tensors and non-tensors to be split.

    Returns:
        tuple: Containing tensors, non-tensors, and bools of whether each position in original list/tuple was a tensor.

    c                 S   s   g | ]	}t |r|qS r    r   	is_tensor.0vr    r    r!   
<listcomp>=      z#extract_tensors.<locals>.<listcomp>c                 S   s   g | ]	}t |s|qS r    r   r   r    r    r!   r   >  r   c                 S   s   g | ]}t |qS r    r   r   r    r    r!   r   ?  s    )r   r   )all_objectstensor_objectsnon_tensor_objectstensor_flagsr    r    r!   extract_tensors1  s   
r   c           
      C   s   g }d}d}d}t r g }d}|D ]}|rd}q|}|| qn|}|D ]}	|	r4|| |  |d7 }q$|||  |d7 }q$t|S )a  
    Merge two lists (or tuples) of tensors and non-tensors using a mapping of positions in merged list (or tuple).

    Parameters:
        tensor_objects (list/tuple): Tensors to merge.
        non_tensor_objects (list/tuple): Non-tensors to merge.
        tensor_flags (list/tuple): Indicates whether each position in output is a tensor.

    Returns:
        tuple: Merge of tensors and non-tensors
    r   NFr   )PARTITION_ACTIVATIONSr   r   )
r   r   r   merged_objects
tensor_idxnon_tensor_idxreal_tensor_flagsprevious_flagflagr   r    r    r!   merge_tensorsE  s,   

r   c                 C   sB   t | d pt | do| jdk}t| o |  o |  tko |S )z1
        Is an activation to be checkpointed
    no_checkpointingF)r.   r   r   r   is_floating_pointra   rb   )rd   
extra_flagr    r    r!   rp   o  s   "rp   c           
         s  g }d}t | D ]\}}t|s|| |d7 }q|| }t||  ddt|	 |r;t
dnj |r|ttkr^ fddttD }t| td nt| d u ry fddttD }|t|< dt|< dt| t|  jtdt| t|  jjd ttjt| t|  j  < t| t|  jj}	t| d t|< ||	 qtr n| q|S )Nr   r   r#   cpuc                    &   g | ]}t d jgj dqS r    rk   r   tensor	new_emptyrl   r   _buffer_device	partitionrf   r    r!   r         z)partition_activations.<locals>.<listcomp>c                    r   r   r   r   r   r    r!   r     r   )	enumeraterp   r   ri   r   
contiguousrr   rv   rh   cloner   r   rn   contiguous_data_buffersro   
num_layersdata_offsetsry   shaper2   mmapPAGESIZEelement_sizerw   CPU_CHECKPOINTr   )
argscpu_checkpointcontiguous_checkpointr   num_non_fp_tensors	arg_indexrd   r|   tensor_listcontiguous_partitionr    r   r!   partition_activationsy  sF   
$
r   c                 C   sr  g }d}t t| |D ]\}\}}t|rt| nd }t|s2|| || |d7 }qtjg |j	dj
|_
|j
|_|| || }	|r| }
|	ttkrptd}t|j|
t g|j|j	d td nt|	 d u rtd}|j|
t g|j|j	dt|	< dt|	< t|	 dt|	 |
j
|j
}||}t|	 |
 t|	< || q|| q|S )Nr   r   r   r    rk   )r   zipr   r   r   re   rp   r   emptyr   ry   
saved_datara   rn   contiguous_size_buffersr   r   rl   size_offsetsrv   rw   view_as)r   r   r   new_argsr   r   argr   re   r|   ra   tmpcontiguous_sizer    r    r!   (get_partitioned_activations_for_backward  s>   




 
r   c                 C   s^   g }t t| |D ]#\}\}}t|s|| q	tjg |jdj|_|j|_|| q	|S )Nr   )	r   r   rp   r   r   r   r   ry   r   )r   r   r   r|   r   r   r    r    r!    get_cpu_activations_for_backward  s   
r   c                   @   s(   e Zd ZdZedd Zedd ZdS )CheckpointFunctiona  This function is adapted from torch.utils.checkpoint with
       two main changes:
           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`  #ignore-cuda
           2) the states in the model parallel tracker are also properly
              tracked/set/reset.
           3) Performance activation partitioning, contiguous memory optimization
           4) CPU Checkpointing
           5) Profile forward and backward functions
    c                    s   fdd}t rt   td u rtrt atrtt  | _t 	 }t j
|d}tr6t|tt}ntrBt|tdtd}t||td}t  _t   _t   _tddd t  || }	W d    n1 ssw   Y  td	dd ~trt||t}
t|
d
 dksJ dt|
 ||
  ntrt||}
||
  n||  trtt  ttg t rt   t |	r|	! s|	gng }ndd |	D } j"|  t |	r||	g7 }|	S ||	7 }t#|	d\}	}}t$|	S )Nc                     s&   t | d\}}}| _| _| _d S )Nr   )r   deepspeed_saved_tensorsnon_tensor_argsr   )all_argstensor_argsr   r   ctxr    r!   save_args_for_backward  s   
z:CheckpointFunction.forward.<locals>.save_args_for_backwardr   r   r   criterion_funcz#Before running forward on the layerFforce"After running forward on the layerrj   r   2save_for_backward called with odd number of args, c                 S   s"   g | ]}t |r| s|qS r    )r   r   r   )r   or    r    r!   r   9  s   " z.CheckpointFunction.forward.<locals>.<listcomp>r   )%SYNCHRONIZEr   synchronizetimersPROFILE_TIMETimersr
   rg   run_functioncurrent_device_nameStreamr   r   r   CONTIGUOUS_CHECKPOINTINGr   r   r   rp   rE   fwd_cpu_rng_statefwd_cuda_rng_staterT   r?   fwd_cuda_rng_state_trackerr   no_gradr   rn   r   stoplogr   r   mark_non_differentiabler   r   )r   r   all_outputsr   r   cuda_devicetransport_streamr   inputs_cudaoutputsr   non_grad_outputsr   r    r   r!   forward  s^   




"







zCheckpointFunction.forwardc                 G   s  t ddd trt   trtd  tr&tD ]}g }qg ag a	g a
g at ddd tj s5tdt  }t j|d}| jD ]}|d ur_t|dr_|jd ur_|j|j|_d |_qDtrrt| jtri|nd d}t|}ntrt| j|t}t|}n| j}t|}t|| j| j d	}t! }t ! }	t" # }
t$| j% t&| j' t" (| j) t d
dd t*  | j+| }W d    n1 sw   Y  t ddd t$| t&|	 t" (|
 t,|tj-r|f}t.|d\}}}g }g }t/||D ]\}}|j0r|1| |1| qt ddd tj2|| d | _d | _d | _ t ddd tr<td3  t4dg trDt   d d g}|D ]}t5|rZ|1|j6 qJ|1d  qJt7|S )NIn backwardFr   backwardIn backward checkpointing codePCheckpointing is not compatible with .grad(), please use .backward() if possibler   r   r   r   r   -In backward checkpointing code before forward,In backward checkpointing code after forwardr   z.In backward checkpointing code before backward0After backward checkpointing code after backward)8r   r   r   r   r   r   rg   r   r   r   r   r   r   autograd_is_checkpoint_validr   r   r   r   r.   r   r   r   ry   r   r   r   r"   r   rp   r   r   r   rE   rT   r?   set_rng_stater   r4   r   rA   r   enable_gradr   r   r   r   r   r   r   r   r   r   r   gradr   )r   gradsbuffersr   r   tr   detached_inputsbwd_cpu_rng_statebwd_cuda_rng_statebwd_cuda_rng_state_trackerr   r   output_tensorsgrad_tensorsr   r   ret_listr   r    r    r!   r   D  s   














zCheckpointFunction.backwardN)r   rL   rM   rN   staticmethodr   r   r    r    r    r!   r     s    

Pr   c              	      s<  ddd

fdd}t rt   tdu rtrt atr&tt  t  }t j	|d}t
r;t|tt}ntrGt|tdtd}t||td}t t  t  t
rzt||t}t|d dksuJ d	t| ||  ntrt||}||  n||  G d
d d t 	g g d fdd}	
f	dd}	fdd}
tjj||	 | }W d   n1 sw   Y  tst rD ]}||
 qtddd trtt  t tg t rt   g }t!|r||g7 }n||7 }t|dkr|d S t"|S )a  This function is union of `torch.utils.checkpoint._checkpoint_without_reentrant` and `CheckpointFunction` in this module

    This function is aim to solve the back probagation error raised from all input requires no grad.
    * has already been implemented in pytorch for a while, the solution is stable at most time except for jit module mode.
    * can help to solve the issue which is hacked by `deepspeed.runtime.pipe.module.PipelineModule._is_checkpointable`

    Main modifications compared to the implementation of torch:
    1. adapt to the signature of `checkpoint` function in this module
    2. solve the non-deterministic by random state management consistent with deepspeed `CheckpointFunction`
    3. when there is partition or cpu checkpointing, gather them in the unpack_hook during back probagation
    4. make all after backward blocks in the hook which will executed after all leaf nodes backward execution.
    5. above 4. is inspired by `torch.autograd.graph.register_multi_grad_hook`, which is only implemented after 2.0.0
    Nc                     s    t | d\}| dS )zJkeep this function to reduce the modification from original implementationr   N)r   )r   r   )r   r   r   r    r!   r     s   z8non_reentrant_checkpoint.<locals>.save_args_for_backwardr   r   r   rj   r   r   c                   @   s   e Zd ZdZdS )z(non_reentrant_checkpoint.<locals>.Holderz:the place holder object used as activations to save memoryN)r   rL   rM   rN   r    r    r    r!   Holder  s    r  c                    s0     } t| | jr| jr |  |S )zused to record the activation order in the `weak_holder_list`

        the activation order in holder list is consistent between the first forward and recomputing forward.
        * the jit compiled forward will break the order consistency *
        )r   weakrefrefr   is_leaf)tensor_from_forwardres)r  leaf_tensorsweak_holder_listr    r!   checkpoint_pack  s
   
z1non_reentrant_checkpoint.<locals>.checkpoint_packc              	      s  t dkrd  	fdd}dd }tddd tr!t   tr)td	  tr:t	D ]}g }q-g a	g a
g ag atd
dd tj sItdt  }t j|d}trfttr]|ndd}t|}ntrst|t}t|}n}t|}t|d}t }t  }	t  }
t t t  tddd t  $ tjj!"|| | }W d   n1 sw   Y  W d   n1 sw   Y  tddd t| t|	 t |
 ddd| vrtd|  S )z'retrieve the activations from recomputer   c                    s<   d7 d   du rdS |   }| d   < dS )zsave recompute activationsr   N)r   )tensor_from_replaydetached_activations)storageunpack_counterr
  r    r!   replay_pack)  s   zHnon_reentrant_checkpoint.<locals>.checkpoint_unpack.<locals>.replay_packc                 S   s   t d)z$recompute graph need not to backwardz<You are calling backwards on a tensor that is never exposed.)r   )
none_valuer    r    r!   replay_unpack6     zJnon_reentrant_checkpoint.<locals>.checkpoint_unpack.<locals>.replay_unpackr   Fr   r   r   r   r   Nr   r   r   zAttempt to retrieve a tensor saved by autograd multiple times without checkpoint recomputation being triggered in between, this is not currently supported.)#rn   r   r   r   r   r   r   rg   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r   rp   r   rE   rT   r?   r   r4   rA   r   graphsaved_tensors_hooks)holder_from_backwardr  r  r   r   r   r   r   r   r   r   _unused)	r   functionr   r   r   r   r  r   r
  )r  r!   checkpoint_unpack   sr   









 
z3non_reentrant_checkpoint.<locals>.checkpoint_unpackc                    sV    d7   t kr'tddd trtd  tdg tr)t   dS dS dS )z'the hook registered to all leaf tensorsr   r   Fr   r   N)	rn   r   r   r   r   r   r   r   r   )_nonuse_grads)backward_visited_leaf_nodesr	  r    r!   after_backward_hook  s   z5non_reentrant_checkpoint.<locals>.after_backward_hookr   Fr   r   )#r   r   r   r   r   r   r
   rg   r   r   r   r   r   r   r   r   r   rp   rE   rT   r?   r   rn   r   r  WeakKeyDictionaryr   r  r  register_hookr   r   r   r   r   )r  r   r   r   r   r   r   r   r  r  r  r   leaf_tensorr   r    )r  r  r   r  r   r   r   r	  r   r  r   r
  r!   non_reentrant_checkpoint  sn   



"


k

r   c                 G   s4   g }t j| |g|R   t|dkr|d S t|S )zhCheckpoint a model or part of the model.
    This has been directly copied from torch.utils.checkpoint. r   r   )r   applyrn   r   )r  r   r   r    r    r!   
checkpoint  s
   r"  c                 C   s*   | a t dkrtdt  d d S d S )Nr   z$**************Partition Activations z************)r   rW   rX   r   rY   )partition_activationr    r    r!   #partition_activations_in_checkpoint  s   r$  c                 C   s   | a d S r%   )r   )nlayersr    r    r!   set_num_layers  r  r&  c                  C   s*   t rtD ]} g } qg ag ag ag adS dS )a  Resets memory buffers related to contiguous memory optimizations.
    Should be called during eval when multiple forward propagations are
    computed without any backward propagation that usually clears these
    buffers.
    Arguments:
        None

    Return:
        None
    N)r   r   r   r   r   )r   r    r    r!   r=     s   r=   c                 C   sP   t | |dj} t dkrt|   | ja| j	a
| ja| ja| ja| jad S )NrV   r   )r   activation_checkpointing_configrW   rX   r   rY   reprr   r   contiguous_memory_optimizationr   number_checkpointsr   cpu_checkpointingr   synchronize_checkpoint_boundaryr   profiler   )configrV   r    r    r!   _configure_using_config_file  s   
r0  c                   C   s    da dadadadadadad S )NFT)r   r   r   r   r   r   deepspeed_checkpointing_enabledr    r    r    r!   _configure_defaults  s   r2  c                 C   sN  t   | dur	| a|durt|td |dur|a|dur|a|dur%|a|dur+|a|dur1|a|dur7|atr?ts?J dtrItdusIJ dtdurkt	tdr_t
 at at ant at at atddd t d	krtd
 tdt dt  tdt dt d tdt  tdt  dS dS )a  Configure DeepSpeed Activation Checkpointing.

    Arguments:
        mpu_: Optional: An object that implements the following methods
            get_model_parallel_rank/group/world_size, and get_data_parallel_rank/group/world_size

        deepspeed_config: Optional: DeepSpeed Config json file when provided will be used to
            configure DeepSpeed Activation Checkpointing

        partition_activations: Optional: Partitions activation checkpoint across model parallel
            GPUs when enabled. By default False. Will overwrite deepspeed_config if provided

        contiguous_checkpointing: Optional: Copies activation checkpoints to a contiguous memory
            buffer. Works only with homogeneous checkpoints when partition_activations is enabled.
            Must provide num_checkpoints. By default False. Will overwrite deepspeed_config if
            provided

        num_checkpoints: Optional: Number of activation checkpoints stored during the forward
            propagation of the model. Used to calculate the buffer size for contiguous_checkpointing
            Will overwrite deepspeed_config if provided

        checkpoint_in_cpu: Optional: Moves the activation checkpoint to CPU. Only works with
            partition_activation. Default is false. Will overwrite deepspeed_config if provided

        synchronize: Optional: Performs get_accelerator().synchronize() at the beginning and end of
            each call to deepspeed.checkpointing.checkpoint for both forward and backward pass.
            By default false. Will overwrite deepspeed_config if provided

        profile: Optional: Logs the forward and backward time for each
            deepspeed.checkpointing.checkpoint invocation. Will overwrite deepspeed_config
            if provided

    Returns:
        None
    Nr'  zContiguous Checkpointing is only available with partitioned activations. Set partitioned activations to true in deepspeed configzFMust specify the number of layers with contiguous memory checkpointingget_tensor_model_parallel_rankzAfter configurationFr   r   z$Activation Checkpointing Informationz----Partition Activations z, CPU CHECKPOINTING z$----contiguous Memory Checkpointing z with z total layersz----Synchronization z$----Profiling time in checkpointing )r2  rV   r0  r   r   r   r   r   r   r.   r3  rc   $get_tensor_model_parallel_world_sizerb   get_tensor_model_parallel_grouprq   get_model_parallel_rankget_model_parallel_world_sizeget_model_parallel_groupr   rW   rX   r   rY   )mpu_deepspeed_configr   contiguous_checkpointingnum_checkpointscheckpoint_in_cpur   r.  r    r    r!   	configure  sJ   2


r>  c                   C   rR   )zTrue if deepspeed activation checkpointing has been configured
        by calling deepspeed.checkpointing.configure, else returns false

    Arguments:
        None

    Return:
        True of configured, else False
    )r1  r    r    r    r!   is_configuredl  s   
r?  r%   )r#   )NNNNNNN)JrN   r>   r   rO   	deepspeedr   rW   r  r   r   deepspeed.runtime.configr   deepspeed.utilsr   deepspeed.runtime.utilsr   r   r   deepspeed.utils.timerr	   r   r
   deepspeed.utils.bwcr   deepspeed.acceleratorr   deepspeed.runtimer   r1  rV   rc   rb   rq   r   r   r   r   r   r   r   r   r   r   r   rQ   r"   r4   r5   rS   rT   r_   r`   rh   ri   r   r   r   rp   r   r   r   r   Functionr   r   disabler"  r$  r&  r=   r0  r2  r>  r?  r    r    r    r!   <module>   s   

!E)
'*
7* Y t


g