'''
Copyright NVIDIA/apex
This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LAMB optimizer
'''
import types
import torch
from deepspeed.ops.op_builder import FusedLambBuilder


class FusedLamb(torch.optim.Optimizer):
    """Implements the LAMB algorithm. Currently GPU-only.

    LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`:
    https://arxiv.org/abs/1904.00962

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        bias_correction (bool, optional): bias correction (default: True)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
            adds eps to the bias-corrected second moment estimate before
            evaluating square root instead of adding it to the square root of
            second moment estimate as in the original paper. (default: False)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        max_grad_norm (float, optional): value used to clip global grad norm
            (default: 0.0)
        max_coeff (float, optional): maximum value of the LAMB coefficient (default: 10.0)
        min_coeff (float, optional): minimum value of the LAMB coefficient (default: 0.01)
        amsgrad (boolean, optional): NOT SUPPORTED in FusedLamb!
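
    Example (a minimal usage sketch, not part of the original docstring; it
    assumes a CUDA-capable DeepSpeed install that can build the fused LAMB op,
    and a ``model`` already moved to the GPU)::

        from deepspeed.ops.lamb import FusedLamb

        optimizer = FusedLamb(model.parameters(), lr=1e-3, weight_decay=0.01)
        loss = model(inputs).sum()  # any scalar loss
        loss.backward()
        optimizer.step()
        coeffs = optimizer.get_lamb_coeffs()  # per-parameter LAMB trust ratios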
    MbP?Tg?g+?:0yE>F              $@{Gz?c              
      sZ   t   | _|rtdt|||||||	|
d}tt| || |r%dnd| _g | _	d S )Nz/FusedLamb does not support the AMSGrad variant.)lrbias_correctionbetasepsweight_decaymax_grad_norm	max_coeff	min_coeffr      )
r   loadfused_lamb_cudaRuntimeErrordictsuperr   __init__eps_modelamb_coeffs)selfparamsr
   r   r   r   eps_inside_sqrtr   r   r   r   amsgraddefaults	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/lamb/fused_lamb.pyr   )   s   
zFusedLamb.__init__N      ?c                 C   s  d}|dur	| }|du rdgt | j }nt|tjr |g}nt|d tkr,|g}n|}|du r;dgt | j }nt|tjrE|g}nt|d tkrQ|g}n|}|du r_dgt | j }| jdd= t| j|||D ]\}	}
}}|
du rdgt |	d  }
|du rdgt |	d  }|du rdgt |	d  }nt|ts|g}|	d rdnd}t|	d |
||D ]\}}}}|}|	d dkr|| d |	d  }|dkr|| }|j	du r|du rq|du r|j	j
}|jrtd| j| }t |dkrd|d	< t|j
|d
< t|j
|d< |d
 |d }}|	d \}}|	d }|	d }|d	  d7  < |du r9tjg tjdn|}| j|j
|||||	d |||||	d ||d	 | j||	d }| j| qqm|S )a  Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            grads (list of tensors, optional): weight gradient to use for the
                optimizer update. If gradients have type torch.half, parameters
                are expected to be in type torch.float. (default: None)
            output_params (list of tensors, optional): A reduced-precision copy
                of the updated weights, written out in addition to the regular
                updated weights. Must be of the same type as the gradients. (default: None)
            scale (float, optional): factor to divide gradient tensor values
                by before applying to weights. (default: 1)
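
        Example (an illustrative sketch, not part of the original docstring;
        it assumes a single parameter group of fp32 master weights mirrored by
        ``fp16_params``, and a loss that was multiplied by ``loss_scale``
        before ``backward()``)::

            grads = [p.grad for p in fp16_params]   # half-precision gradients
            optimizer.step(grads=grads,
                           output_params=fp16_params,
                           scale=loss_scale)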
        Nr   r   r   r   r   gư>z+FusedLamb does not support sparse gradientsstepexp_avg
exp_avg_sqr   r   r   )dtyper
   r   r   )lenparam_groups
isinstancetypesGeneratorTypetypelistr   zipgraddata	is_sparser   statetorch
zeros_liketensorfloatr   lambr   append)r   closuregradsoutput_paramsscale
grad_normslossgrads_groupoutput_params_groupgroupgrads_this_groupoutput_params_this_groupgrad_norm_groupr   pr1   output_param	grad_normcombined_scaleclipr4   r&   r'   beta1beta2r   r   out_p
lamb_coeffr"   r"   r#   r%   E   s   


*zFusedLamb.stepc                 C   s   dd | j D }|S )Nc                 S   s   g | ]}|  qS r"   )item).0rO   r"   r"   r#   
<listcomp>   s    z-FusedLamb.get_lamb_coeffs.<locals>.<listcomp>)r   )r   r   r"   r"   r#   get_lamb_coeffs   s   zFusedLamb.get_lamb_coeffs)
r   Tr   r   Fr   r   r   r	   F)NNNr$   N)__name__
__module____qualname____doc__r   r%   rS   __classcell__r"   r"   r    r#   r      s    
gr   )rW   r,   r5   deepspeed.ops.op_builderr   optim	Optimizerr   r"   r"   r"   r#   <module>   s
   