o
    TibL                     @   sT   d dl Z d dlZd dlZd dlmZ d dlmZ d dlm	Z
 G dd dejjZdS )    N)get_accelerator)required_torch_version)commc                       sV   e Zd ZdZ											
		
			d fdd	ZdddZ fddZ  ZS )ZeroOneAdama
  
    Implements the 0/1 Adam algorithm. Currently GPU-only.
    For usage example please see https://www.deepspeed.ai/tutorials/zero-one-adam/
    For technical details please read https://arxiv.org/abs/2202.06009

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        var_freeze_step (int, optional): The latest step to update the variance,
            using the notation from https://arxiv.org/abs/2202.06009, it denotes the
            max{i|i in T_v}. Note that this is different from the freeze step from the
            1-bit Adam. The var_freeze_step is usually the end of the learning rate warmup
            and thus does not require tuning. (default: 100000)
        var_update_scaler (int, optional): The interval to update the variance. Note that
            the update policy for variance follows an exponential rule, where var_update_scaler
            denotes the kappa in the 0/1 Adam paper. (default: 16)
        local_step_scaler (int, optional): The interval to scale the local steps interval
            according to the learning rate policy. (default: 32678)
        local_step_clipper (int, optional): The largest interval for local steps with
            learning rate policy. This corresponds to the variable H in the 0/1 Adam paper.
            (default: 16)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False) NOT SUPPORTED in 0/1 Adam!
        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
            adds eps to the bias-corrected second moment estimate before
            evaluating square root instead of adding it to the square root of
            second moment estimate as in the original paper. (default: False)
        cuda_aware (boolean, required): Set True if the underlying MPI implementation
            supports CUDA-Aware communication. (default: False)
        comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl')
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    NMbP?Tg?g+?:0yE>F        順      ncclc                    s  |rt dt||||||	d}tt| || |rdnd| _|| _d| _|| _d| _	|
| _
|| _|| _|| _d| _d| _|| _t sIJ dd | _| jdkrotdd	sZJ d
ddlm} t| jd| _	|| jj| _nD| jdkrddlm} ||| _n3| jdkrddlm} t| jd| _	|| jj| _n| jdkrddlm} t| jd| _	|| jj| _| jj| _t | jd t!"| jd | _#d S )Nz.0/1 Adam does not support the AMSGrad variant.)lrbias_correctionbetasepsweight_decaymax_grad_normr      Fz0Please initialize the torch distributed backend.r   g?)min_versionzPlease use torch 1.8 or greater to enable NCCL backend in 0/1 Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend)NcclBackend"pipeline_enable_backward_allreducempi)
MpiBackendhccl)HcclBackend
compressed)CompressedBackend   )$RuntimeErrordictsuperr   __init__eps_mode	deepspeed
initialize
cuda_awareusing_pipelinevar_freeze_stepvar_update_scalerlocal_step_scalerlocal_step_clipper
freeze_keyreinitial_error_buffercomm_backend_namedistis_initializedcomm_backend_handler   deepspeed.runtime.comm.ncclr   hasattrmpudeepspeed.runtime.comm.mpir   deepspeed.runtime.comm.hcclr   !deepspeed.runtime.comm.compressedr   sizeintnpgcddivider)selfparamsr$   r   r   r   r   eps_inside_sqrtr   r   r(   r)   r*   r+   amsgradr&   r.   defaultsr   r   r   r   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/fp16/onebit/zoadam.pyr"   :   sZ   




"zZeroOneAdam.__init__c                 C   s  d}|dur	| }|du rdgt | j }nt|tjr |g}nt|d tkr,|g}n|}t| j|D ]N\}}|du rFdgt |d  }|d rLdnd}t|d |D ]\}}	|jdu rd|	du rdqU|	du rl|jj	}	|	j
rstd| j| }
t |
dkrd|
d< t|j	|
d< t|j	|
d	< | jrd
|
 vr"d|
d< d|
d< d|
d< d|
d< d|
d< t|j	|
d< |
d |
d< |
d | j| j  dkr|
d  | j| j |
d | j| j   7  < |
d | j |
d< t   tj|
d |jd|
d
< tj|
d |jd|
d< t|j	|
d< t   | js"t dkr"td |
d |
d	 }}|
d }|d \}}|
d  d7  < | jr| jdu r|
d |
d  dkrk||d| |	|	 ||j|	d| d nl| jdkrt B | j !|	|
d
 |
d | j"j#}d|v r|j|d jkr|d j$|jd|d< ||d  ||d| | W d   n	1 sw   Y  n||j|	d| d |
d  |d 7  < d}	| js| jdkr|%| j !||
d
 |
d | j"j# d|v r|j|d jkr|d j$|jd|d< ||d  | jrk||& |d   }|d dkr3||d |j	 7 }t # |j	|d  |  | jdu rS||d  |  W d   n	1 s^w   Y  |
d |
d  dkr| jrt v |j	d |  ||& |d   | jdkr|'| j !||
d
 |
d | j"j# d|v r|j|d jkr|d j$|jd|d< ||d  |( j||
d  d d |j	||& |d    |(  d|
d< W d   n	1 sw   Y  | jdu rK|
d |
d  dkr"|
d  d7  < |
d | j)kr"d|
d< |
d  d!9  < |
d d |
d  dkr=| j*r8d| j"_+qUd| j"_,qU| j*rFd| j"_+qUd| j"_,qU|
d  d7  < |
d | j-krkd|
d< t.| j/|
d d! |
d< qU| jstd"dd# d| _|
0d
 |
0d q4| jsd| _td$t   |S | j| jd d d  d | j1krd| _| j*rd| j"_+nd| j"_,| jdu r| j2du r| jD ]}|d D ]}| j| d
 (  | j| d (  q͐qd| _2|S )%a  Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            grads (list of tensors, optional): weight gradient to use for the
                optimizer update. If gradients have type torch.half, parameters
                are expected to be in type torch.float. (default: None)
            output params (list of tensors, optional): A reduced precision copy
                of the updated weights written out in addition to the regular
                updated weights. Have to be of same type as gradients. (default: None)
            scale (float, optional): factor to divide gradient tensor values
                by before applying to weights. (default: 1)
        Nr   r>   r   r   z*0/1 Adam does not support sparse gradientsstepexp_avg
exp_avg_sqworker_errorvar_intervalvar_counterlocal_step_intervallocal_step_counterlrstensor_sizecorrected_tensor_sizeserver_chunk_size)deviceserver_errormomentum_accumulatorz&Cupy Buffers Initialized Successfully.r   F)alphaexp_avg_maskr   r   r   r	   T   zPop out errors)flushz)Finished the initialization step at rank )3lenparam_groups
isinstancetypesGeneratorTypetypelistzipgraddata	is_sparser   statetorch
zeros_liker%   keysnumelr8   r<   r   empty_cachezerosrR   r/   get_rankprintr,   mul_addcmul_add_no_gradr1   compressed_allreducer$   
local_ranktoset_sqrtcopy_zero_r)   r'   r   enable_backward_allreducer*   minr+   popr(   r-   )r=   closuregradslossgrads_groupgroupgrads_this_groupr   prb   re   rG   rH   comm_bufferbeta1beta2grad_onebitupdaterD   rD   rE   rF      s6  






	











$

zZeroOneAdam.stepc                    s  t | jD ](\}}d|v r|d |d | d< qd|vr-d|d | v r-|d | d qt | | j| jd d d  d | jk rd| _| j| jd d d  d d | j| jd d d  d  dkru| jrpd	| j	_
n!d	| j	_n| jr}d| j	_
nd| j	_nd	| _| jrd| j	_
nd| j	_d| _| jD ]6}|d D ]/}d
| j| v r| j| d
 d| j| v r| j| d d| j| v r| j| d qqdS )z^
        Overrides load_state_dict() to add special handling when loading checkpoints
        rV   r[   r   r>   rF   Fr   rJ   TrI   rS   rT   N)	enumerater[   r{   r!   load_state_dictre   r(   var_freeze_keyr'   r$   r   ry   r-   )r=   
state_dictir   r   rB   rD   rE   r   C  sL   	"





zZeroOneAdam.load_state_dict)Nr   Tr   r   Fr	   r	   r
   r   r   r   FFr   )NN)__name__
__module____qualname____doc__r"   rF   r   __classcell__rD   rD   rB   rE   r      s*    -
E Er   )r]   rf   numpyr:   deepspeed.acceleratorr   deepspeed.utils.torchr   r$   r   r/   optim	Optimizerr   rD   rD   rD   rE   <module>   s   