import math

import torch
from torch.optim.optimizer import Optimizer

from .types import Betas2, OptFloat, OptLossClosure, Params

version_higher = torch.__version__ >= '1.5.0'

__all__ = ('AdaBelief',)


class AdaBelief(Optimizer):
    r"""Implements AdaBelief Optimizer Algorithm.
    It has been proposed in `AdaBelief Optimizer, adapting stepsizes by
    the belief in observed gradients`__.

    Arguments:
        params: iterable of parameters to optimize or dicts defining
            parameter groups
        lr: learning rate (default: 1e-3)
        betas: coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps: term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay: weight decay (L2 penalty) (default: 0)
        amsgrad: whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        weight_decouple: If set to True, the optimizer uses decoupled
            weight decay as in AdamW (default: False)
        fixed_decay: Only takes effect when weight_decouple is set to
            True.
            When fixed_decay == True, the weight decay is performed as
            $W_{new} = W_{old} - W_{old} \times decay$.
            When fixed_decay == False, the weight decay is performed as
            $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in
            this case, the weight decay ratio decreases with learning
            rate (lr).  (default: False)
        rectify: If set to True, perform the rectified update similar to
            RAdam (default: False)

    Example:
        >>> import torch_optimizer as optim
        >>> optimizer = optim.AdaBelief(model.parameters(), lr=0.01)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()
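
        One possible configuration combining the options above (all
        keyword arguments shown are defined in Arguments):

        >>> optimizer = optim.AdaBelief(
        ...     model.parameters(),
        ...     lr=0.01,
        ...     weight_decay=1e-2,
        ...     weight_decouple=True,
        ...     rectify=True,
        ... )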

    __ https://arxiv.org/abs/2010.07468

    Note:
        Reference code: https://github.com/juntang-zhuang/Adabelief-Optimizer
    MbP?g?g+?:0yE>r   Fparamslrbetasepsweight_decayamsgradweight_decouplefixed_decayrectifyreturnNc
                    s   |dkrt d||dk rt d|d|d   kr"dk s,n t d|d d|d   kr8dk sBn t d|d |dk rMt d	|t|||||d
}
tt| ||
 || _|	| _|| _d S )Ng        zInvalid learning rate: {}zInvalid epsilon value: {}r         ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {}zInvalid weight_decay value: {})r   r   r   r   r   )	
ValueErrorformatdictsuperr   __init___weight_decouple_rectify_fixed_decay)selfr   r   r   r   r   r   r   r   r   defaults	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/torch_optimizer/adabelief.pyr   9   s6   
zAdaBelief.__init__c                    s,   t t| | | jD ]}|dd qd S )Nr   F)r   r   __setstate__param_groups
setdefault)r   stategroupr!   r#   r$   r%   b   s   
zAdaBelief.__setstate__closurec                 C   sB  d}|dur	| }| j D ]}|d D ]}|jdu rq|jj}|jr'td|d }| j| }|d \}}	t|dkrdd|	  d |d	< d|d
< trUtj	|jtj
dnt	|j|d< trhtj	|jtj
dnt	|j|d< |rtr}tj	|jtj
dnt	|j|d< |d |d }
}|d
  d7  < d||d
   }d|	|d
   }| jr| js|jd|d |d    n|jd|d   n|d dkr|j|j|d d |
|j|d| d ||
 }||	j||d|	 d |r|d }tj|||d ||d  t| |d }n||d  t| |d }| js>|d | }|jj|
|| d q|d	 d|d
  |	|d
   d|	|d
     |d< |d dkr|d	 |d }}|d |d  | |d  |d  | }t|}||d  | }|j| |
| q|j|d  |
 qq|S )zPerforms a single optimization step.

        Arguments:
            closure: A closure that reevaluates the model and returns the loss.
        Nr   zOAdaBelief does not support sparse gradients, please consider SparseAdam insteadr   r   r   g       @r   rho_infstep)memory_formatexp_avgexp_avg_varmax_exp_avg_varr   r   r   )alpha)value)outr      rho_t   g      @)r&   graddata	is_sparseRuntimeErrorr(   lenversion_highertorch
zeros_likepreserve_formatr   r   mul_add_addcmul_maxsqrtmathr   addcdiv_)r   r*   lossr)   pr7   r   r(   beta1beta2r.   r/   bias_correction1bias_correction2grad_residualr0   denom	step_sizer+   r5   rtr#   r#   r$   r,   g   s   




	







yzAdaBelief.step)r	   r
   r   r   FFFF)N)__name__
__module____qualname____doc__r   floatr   boolr   r%   r   r   r,   __classcell__r#   r#   r!   r$   r      sB    -	
)r   )rE   r=   torch.optim.optimizerr   typesr   r   r   r   __version__r<   __all__r   r#   r#   r#   r$   <module>   s    

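

if __name__ == '__main__':
    # A minimal smoke test, not part of the public API: a sketch assuming
    # torch is installed. Because of the relative import above, run it as
    # `python -m torch_optimizer.adabelief` rather than directly.
    torch.manual_seed(0)
    model = torch.nn.Linear(4, 1)
    x, y = torch.randn(32, 4), torch.randn(32, 1)
    optimizer = AdaBelief(
        model.parameters(), lr=1e-2, weight_decouple=True, rectify=True
    )
    for _ in range(200):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()
    print('final loss: {:.6f}'.format(loss.item()))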