import math

import torch
from torch.optim.optimizer import Optimizer

from .types import Betas2, OptFloat, OptLossClosure, Params

__all__ = ('AdaMod',)


class AdaMod(Optimizer):
    r"""Implements AdaMod algorithm.

    It has been proposed in `Adaptive and Momental Bounds for Adaptive
    Learning Rate Methods`__.

    Arguments:
        params: iterable of parameters to optimize or dicts defining
            parameter groups
        lr: learning rate (default: 1e-3)
        betas: coefficients used for computing running averages of gradient
            and its square (default: (0.9, 0.999))
        beta3: smoothing coefficient for adaptive learning rates
            (default: 0.999)
        eps: term added to the denominator to improve numerical stability
            (default: 1e-8)
        weight_decay: weight decay (L2 penalty) (default: 0)

    Example:
        >>> import torch_optimizer as optim
        >>> optimizer = optim.AdaMod(model.parameters(), lr=0.1)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    __ https://arxiv.org/abs/1910.12249

    Note:
        Reference code: https://github.com/lancopku/AdaMod
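
    The per-parameter update performed by ``step()`` is, informally (a
    sketch of the rule from the paper above; ``g_t`` is the gradient and
    ``lr``, ``betas = (beta1, beta2)``, ``beta3``, ``eps`` are the
    arguments documented earlier)::

        m_t   = beta1 * m_{t-1} + (1 - beta1) * g_t
        v_t   = beta2 * v_{t-1} + (1 - beta2) * g_t ** 2
        eta_t = lr * sqrt(1 - beta2**t) / (1 - beta1**t) / (sqrt(v_t) + eps)
        s_t   = beta3 * s_{t-1} + (1 - beta3) * eta_t
        p_t   = p_{t-1} - min(eta_t, s_t) * m_t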
    MbP?g?+?r   :0yE>r   paramslrbetasbeta3epsweight_decayreturnNc                    s   |dkrt d||dk rt d|d|d   kr"dk s,n t d|d d|d   kr8dk sBn t d|d d|  krLdk sTn t d	||dk r_t d
|t|||||d}tt| || d S )Ng        zInvalid learning rate: {}zInvalid epsilon value: {}r   g      ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {}zInvalid beta3 parameter: {}zInvalid weight_decay value: {})r   r   r   r   r   )
ValueErrorformatdictsuperr   __init__)selfr   r   r   r   r   r   defaults	__class__ J/home/ubuntu/.local/lib/python3.10/site-packages/torch_optimizer/adamod.pyr   *   s,   	
zAdaMod.__init__closurec                 C   s  d}|dur	| }| j D ]}|d D ]}|jdu rq|jj}|jr'd}t|| j| }t|dkrKd|d< t||d< t||d< t||d< |d |d |d }}	}
|d	 \}}|d  d
7  < |	|j
|d
| d |		|j||d
| d |	 
|d }d
||d   }d
||d   }|d t| | }|d dkr|jj
|j|d  |d  d t||}|| |
	|d j
|d
|d  d t||
}|	| |j
|  qq|S )zPerforms a single optimization step.

        Arguments:
            closure: A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    msg = 'AdaMod does not support sparse gradients'
                    raise RuntimeError(msg)

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    # Exponential moving average of actual learning rates
                    state['exp_avg_lr'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq, exp_avg_lr = (
                    state['exp_avg'],
                    state['exp_avg_sq'],
                    state['exp_avg_lr'],
                )
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running averages
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = (
                    group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                )

                if group['weight_decay'] != 0:
                    p.data.add_(
                        p.data, alpha=-group['weight_decay'] * group['lr']
                    )

                # Apply momental bounds on the actual learning rates
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom)
                exp_avg_lr.mul_(group['beta3']).add_(
                    step_size, alpha=1 - group['beta3']
                )
                step_size = torch.min(step_size, exp_avg_lr)
                step_size.mul_(exp_avg)

                p.data.add_(-step_size)

        return loss
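

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the upstream module).
# It assumes the package is installed so the relative import above resolves;
# run it as ``python -m torch_optimizer.adamod``.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    torch.manual_seed(0)
    model = torch.nn.Linear(4, 1)
    optimizer = AdaMod(model.parameters(), lr=0.1)
    x, y = torch.randn(32, 4), torch.randn(32, 1)
    for _ in range(10):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()
    print('final loss:', loss.item())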