o
    TiN=                     @   sT   d dl Z d dlZd dlZd dlmZ d dlmZ d dlm	Z
 G dd dejjZdS )    N)get_accelerator)required_torch_version)commc                       sP   e Zd ZdZ														
d fdd	ZdddZ fddZ  ZS )
OnebitAdamaQ  Implements the 1-bit Adam algorithm. Currently GPU-only.
    For usage example please see https://www.deepspeed.ai/tutorials/onebit-adam/
    For technical details please read https://arxiv.org/abs/2102.02888

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        freeze_step (int, optional): Number of steps for warmup (uncompressed)
            stage before we start using compressed communication. (default 100000)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False) NOT SUPPORTED in 1-bit Adam!
        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
            adds eps to the bias-corrected second moment estimate before
            evaluating square root instead of adding it to the square root of
            second moment estimate as in the original paper. (default: False)
        cuda_aware (boolean, required): Set True if the underlying MPI implementation
            supports CUDA-Aware communication. (default: False)
        comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl')
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    NMbP?順 Tg?g+?:0yE>F        ncclc                    s  |rt dt|||||	|
d}tt| || |rdnd| _d| _d| _d| _d| _	|| _
d| _d| _|| _|| _d| _|| _t sIJ dd | _| jdkrotd	d
sZJ dddlm} t| j
d| _|| j
j| _nD| jdkrddlm} ||| _n3| jdkrddlm} t| j
d| _|| j
j| _n| jdkrddlm} t| j
d| _|| j
j| _| jj| _t | jd t!"| jd | _#d S )Nz01-bit Adam does not support the AMSGrad variant.)lrbias_correctionbetasepsweight_decaymax_grad_normr      r
   Fz0Please initialize the torch distributed backend.r   g?)min_versionzPlease use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend)NcclBackend"pipeline_enable_backward_allreducempi)
MpiBackendhccl)HcclBackend
compressed)CompressedBackend   )$RuntimeErrordictsuperr   __init__eps_mode	comm_time	step_timeave_stepbk_time	deepspeedadam_freeze_key
initializefreeze_step
cuda_awareusing_pipelinecomm_backend_namedistis_initializedcomm_backend_handler   deepspeed.runtime.comm.ncclr   hasattrmpudeepspeed.runtime.comm.mpir   deepspeed.runtime.comm.hcclr   !deepspeed.runtime.comm.compressedr   sizeintnpgcddivider)selfparamsr&   r   r)   r   r   r   eps_inside_sqrtr   r   amsgradr*   r,   defaultsr   r   r   r   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/fp16/onebit/adam.pyr    .   sZ   




"zOnebitAdam.__init__c              
   C   s  d}|dur	| }d}d}d}| j du rd}|du r#dgt| j }nt|tjr-|g}nt|d tkr9|g}n|}t| j|D ]\}	}
|
du rSdgt|	d  }
|	d rYdnd}t|	d |
D ]\}}|j	du rq|du rqqb|du ry|j	j
}|jrtd| j| }t|dkrd|d	< t|j
|d
< t|j
|d< | jr| j rd| vrt|j
|d< |d |d< |d | j| j  dkr|d  | j| j |d | j| j   7  < |d | j |d< t   tj|d |jd|d< tj|d |jd|d< t   d| _ | jst dkrtd |d
 |d }}|	d \}}|d	  d7  < | j du r_||d| | ||d| || d}| jr^|| |	d   }nd|	 v r|	d du rt | |dt!   ||d| | ||d| || d}nK| jdu r||d| | d}| jdkr|"| j#$||d |d | j%j& d|	v r|j|	d jkr|	d j'|jd|	d< ||	d  | jr|| |	d   }| jr!|	d dkr||	d |j
 7 }t(  ||	d  |  W d   n	1 sw   Y  qb| js6tddd |)d |)d qA| jsLd| _ d| _tdt   |S | j du ro|d	 | j*krotd d| _ | j+rkd| j%_,|S d| j%_-|S )a  Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            grads (list of tensors, optional): weight gradient to use for the
                optimizer update. If gradients have type torch.half, parameters
                are expected to be in type torch.float. (default: None)
            output params (list of tensors, optional): A reduced precision copy
                of the updated weights written out in addition to the regular
                updated weights. Have to be of same type as gradients. (default: None)
            scale (float, optional): factor to divide gradient tensor values
                by before applying to weights. (default: 1)
        Nr   Fr
   r<   r   r   z,1-bit Adam does not support sparse gradientsstepexp_avg
exp_avg_sqworker_errortensor_sizecorrected_tensor_sizeserver_chunk_size)deviceserver_errorTz&Cupy Buffers Initialized Successfully.r   r   
non_freezeexp_avg_maskr   r   zPop out errors)flushz)Finished the initialization step at rank z.OnebitAdam - starting compressed communication).r'   lenparam_groups
isinstancetypesGeneratorTypetypelistzipgraddata	is_sparser   statetorch
zeros_liker(   keysnumelr6   r:   r   empty_cachezerosrK   r-   get_rankprintmul_add_addcmul_sqrt
all_reduceget_world_sizeset_r/   compressed_allreducer&   
local_ranktono_gradpopr)   r+   r   enable_backward_allreduce)r;   closuregradslossgather_timeallgather_timeall_timev_diff_buffergrads_groupgroupgrads_this_groupr   prX   r[   rE   rF   beta1beta2updaterB   rB   rC   rD   p   s   










zOnebitAdam.stepc                    s^  t | jD ](\}}d|v r|d |d | d< qd|vr-d|d | v r-|d | d qt | | j| jd d d  d | jk rdt dkrOt	d | j
du rcd| _
| jr_d| j_n#d| j_nt dkrnt	d	 | j
du rd| _
| jr~d| j_nd| j_| jD ]'}|d D ] }d
| j| v r| j| d
 d| j| v r| j| d qqdS )z^
        Overrides load_state_dict() to add special handling when loading checkpoints
        rN   rQ   r   r<   rD   z?Checkpoint loaded and OnebitAdam warmup stage starts/continues.TFzDCheckpoint loaded and OnebitAdam compression stage starts/continues.rG   rL   N)	enumeraterQ   ro   r   load_state_dictr[   r)   r-   rb   rc   r'   r+   r&   r   rp   )r;   
state_dictiry   r{   r@   rB   rC   r     s@   	"




zOnebitAdam.load_state_dict)Nr   r   Tr   r	   Fr
   r
   FFr   )NN)__name__
__module____qualname____doc__r    rD   r   __classcell__rB   rB   r@   rC   r      s$    !
B r   )rS   r\   numpyr8   deepspeed.acceleratorr   deepspeed.utils.torchr   r&   r   r-   optim	Optimizerr   rB   rB   rB   rC   <module>   s   