o
    Ti\                     @   sd   d dl Z d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ G dd dejjZdS )    N)comm)required_torch_version)_flatten_dense_tensors_unflatten_dense_tensors)get_acceleratorc                       sd   e Zd ZdZ												
								d fdd	ZdddZ fddZdd Z  ZS )
OnebitLamba  Implements the 1-bit Lamb algorithm. Currently GPU-only.
    For usage example please see https://www.deepspeed.ai/tutorials/onebit-lamb/
    For technical details please see our paper https://arxiv.org/abs/2104.06069.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        freeze_step (int, optional): Number of steps for warmup (uncompressed)
            stage before we start using compressed communication. (default 100000)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0)
        min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False) NOT SUPPORTED in 1-bit Lamb!
        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
            adds eps to the bias-corrected second moment estimate before
            evaluating square root instead of adding it to the square root of
            second moment estimate as in the original paper. (default: False)
        cuda_aware (boolean, required): Set True if the underlying MPI implementation
            supports CUDA-Aware communication. (default: False)
        comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl')
        coeff_beta (float, optional): coefficient used for computing
            running averages of lamb coefficient (default: 0.9) note that you may want to
            increase or decrease this beta depending on the freeze_step you choose, as
            1/(1 - coeff_beta) should be smaller than or equal to freeze_step
        factor_max (float, optional): maximum value of scaling factor to the frozen lamb
            coefficient during compression stage (default: 4.0)
        factor_min (float, optional): minimum value of scaling factor to the frozen lamb
            coefficient during compression stage (default: 0.5)
        factor_threshold (float, optional): threshold of how much the scaling factor can
            fluctuate between steps (default: 0.1)
    .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    NMbP?順 T?g+?:0yE>F              $@{Gz?ncclr         @      ?皙?c              
      s  |rt dt|||||	|
||d}tt| || |rdnd| _|| _d| _d| _|| _	|| _
|| _|| _|| _|| _d| _|| _t sKJ dd | _| jdkrqtdd	s\J d
ddlm} t| jd| _|| jj| _nD| jdkrddlm} ||| _n3| jdkrddlm} t| jd| _|| jj| _n| jdkrddlm} t| jd| _|| jj| _| jj| _t | jd t!"| jd | _#g | _$i | _%g | _&g | _'g | _(g | _)g | _*d S )Nz01-bit Lamb does not support the AMSGrad variant.)lrbias_correctionbetasepsweight_decaymax_grad_norm	max_coeff	min_coeffr      Fz0Please initialize the torch distributed backend.r   g?)min_versionzPlease use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend)NcclBackend"pipeline_enable_backward_allreducempi)
MpiBackendhccl)HcclBackend
compressed)CompressedBackend   )+RuntimeErrordictsuperr   __init__eps_mode	deepspeedlamb_freeze_key
initializefreeze_step
cuda_aware
coeff_beta
factor_max
factor_minfactor_thresholdusing_pipelinecomm_backend_namedistis_initializedcomm_backend_handler   deepspeed.runtime.comm.ncclr   hasattrmpudeepspeed.runtime.comm.mpir!   deepspeed.runtime.comm.hcclr#   !deepspeed.runtime.comm.compressedr%   sizeintnpgcddividerexp_avg_flatdummy_exp_avgcorrected_tensor_sizesserver_chunk_sizesworker_errorsserver_errorslamb_coeffs)selfparamsr,   r   r/   r   r   r   eps_inside_sqrtr   r   r   r   amsgradr0   r6   r1   r2   r3   r4   defaultsr   r!   r#   r%   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/fp16/onebit/lamb.pyr*   =   sl   	





zOnebitLamb.__init__c           '   
      sR
  d}|dur	| }|du rdgt  j }nt|tjr |g}nt|d tkr,|g}n|} jdd=  jrg } jD ]}|	 fdd|d D  q<d j
 jd d d  vrg } jD ]}|	 fdd|d D  q`tdd |D td	d |D  }t jD ]\}	}t|d D ]\}
}|||	 |
   j
| d< qqt j|D ]\}}|du rdgt |d  }|d
 rdnd}t|d |D ]]\}}|jdu r|du rq|du r|jj}|jrtd j
| }t |dkst |dkr(d| v r(d|d< d|d< d|d< t|j|d< t|j|d< t|j|d<  js/d _|d |d |d }}}|d \}}|d }|d }|d  d7  <  jdu r||d| | ||d| || |d  jkr|  |_d} jr|jd  }|| |d   }|d dkr||d |j 7 }|d  }d}|dkr|dkr||  }||kr|}||k r|}|dkr j|d  d j |  |d<  j	| t   ||d  | |  W d   n	1 s	w   Y  q̈ jr(||d| | | j
| d  d}qqt  j!dkrg }d} jD ]}|d D ]}|	 j
| d  |t"|j7 }qAq;|}| j# j$  dkr j# j$ | j# j$   }||7 }tj%||d jj&d j'd< |	 j'd   j(	|  j)	| j#   j!	t*dd |D  t+ j!d |}t||D ]	\}} | j|_q jrt  j,dkrt- .  t/t  j!D ]'}	 j,	tj% j(|	  j!|	 j&d  j0	tj% j)|	  j!|	 j&d qt- .   jr j#dkrt/t  j!D ]}	 jst- .   j,	tj% j(|	  j!|	 j&d  j0	tj% j)|	  j!|	 j&d t- .  t12 dkrVt3d   j45 j!|	  j,d  j0d  j6j7 t12 dkrwt3d!dd"  j,dd=  j0dd= q j45 j!|	  j,|	  j0|	  j6j7 q jr jrt jD ]>\}	}|d
 rdnd}t|d D ](\}
} j
| }|d |d |d }}}|d \}}|8 j
| d  d#|v r|j&|d# j&kr|d# j9|j&d|d#< ||d#  |||	 |
 |  d|  }!||d| |!|! | |d  }"||" }#|d dkr;|#|d |j  }n|#}d}|d  }| |d  }$|"|$ :  }%|d dkrwt;d|#d  |  }&|%|& d|&  }%|% j<kr j<}%|% j=k r j=}%|%|d d j>  kr|d d j>  }%|%|d d j>  k r|d d j>  }%|%|d< |d |% } j	| t   ||d  | |  W d   n	1 sw   Y  qq|dd= d} jsd _d _t3d$t12   |S  jdu r'|d  jkr't3d% d _ j?r#d j6_@|S d j6_A|S )&a  Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            grads (list of tensors, optional): weight gradient to use for the
                optimizer update. If gradients have type torch.half, parameters
                are expected to be in type torch.float. (default: None)
        Nr   c                    s"   g | ]} j | d    qS exp_avg)statedetachclone.0prL   rS   rT   
<listcomp>   s   " z#OnebitLamb.step.<locals>.<listcomp>rM   scaling_coeffc              	      s@   g | ]}t j j| d  tt  j| d    qS rU   )torchlinalgvector_normrW   rB   sqrtnumelitemrZ   r]   rS   rT   r^      s    c                 S      g | ]}t |qS rS   )sumr[   xrS   rS   rT   r^          c                 S   rf   rS   )lenrh   rS   rS   rT   r^      rj   r   r   z,1-bit Lamb does not support sparse gradientsstepr   lamb_coeff_freeze      ?last_factorrV   
exp_avg_sqexp_avg_sq_freshTr   r   r   F   r   r   r   )devicec                 S   s   g | ]}|   qS rS   )rX   rY   rZ   rS   rS   rT   r^     s    z&Cupy Buffers Initialized Successfully.zPop out errors)flushexp_avg_maskz)Finished the initialization step at rank z.OnebitLamb - starting compressed communication)Brk   param_groups
isinstancetypesGeneratorTypetypelistrK   r-   appendrW   rg   	enumeratezipgraddata	is_sparser'   keysr`   
zeros_liker.   mul_add_addcmul_r/   rX   rY   powrc   re   r1   no_gradrE   rd   r@   rD   zerosrs   rF   rG   rH   r   r   rI   r   empty_cacherangerJ   r7   get_rankprintr9   compressed_allreducer,   
local_rankdiv_tomaxminr2   r3   r4   r5   r   enable_backward_allreduce)'rL   closuregradslossgrads_groupexp_avg_last_stepgroupmomentum_scalesunited_scaleijr\   grads_this_groupr   r   rW   rV   rp   rq   beta1beta2r   r   weight_normupdateupdate_norm
lamb_coeffmomentum_groupstensor_sizecorrected_tensor_size
differenceupdated_paramsqgrad_reconstructdenomupdate_prelim
denom_realfactorupdate_ratiorS   r]   rT   rl      s  	


$
*





C

(




	 

1zOnebitLamb.stepc                    s  t | jD ](\}}d|v r|d |d | d< qd|vr-d|d | v r-|d | d qt | | jdd= | j  | jdd= | j	dd= | j
| jd d d  d | jk rt dkrftd | jdu rzd	| _| jrvd| j_nd| j_| jD ]&}|d D ]}d
| j
| d< d| j
| d< d| j
| v r| j
| d qq}nt dkrtd | jd	u rd| _| jrd	| j_nd	| j_| jdd= | jdd= dS )z^
        Overrides load_state_dict() to add special handling when loading checkpoints
        ru   rv   Nr   rM   rl   z?Checkpoint loaded and OnebitLamb warmup stage starts/continues.TFr   rm   rn   ro   r_   zDCheckpoint loaded and OnebitLamb compression stage starts/continues.)r}   rv   popr)   load_state_dictrE   rF   clearrG   rH   rW   r/   r7   r   r   r-   r5   r,   r   r   rI   rJ   )rL   
state_dictr   r   r\   rQ   rS   rT   r     sJ   	
"




zOnebitLamb.load_state_dictc                 C   s   | j S )N)rK   r]   rS   rS   rT   get_lamb_coeffs  s   zOnebitLamb.get_lamb_coeffs)Nr   r	   Tr
   r   Fr   r   r   r   FFr   r   r   r   r   )NN)	__name__
__module____qualname____doc__r*   rl   r   r   __classcell__rS   rS   rQ   rT   r      s2    /
S t;r   )rx   r`   numpyrB   r,   r   r7   deepspeed.utils.torchr   torch._utilsr   r   deepspeed.acceleratorr   optim	Optimizerr   rS   rS   rS   rT   <module>   s   