o
    zi7                     @   s<   d dl Z d dlmZ ddlmZmZmZ G dd deZdS )    N)	Optimizer   )OptFloatOptLossClosureParamsc                       s`   e Zd ZdZ						ddededed	ed
ededef fddZddede	fddZ
  ZS )Apolloa1  Implements Apollo Optimizer Algorithm.

    It has been proposed in `Apollo: An Adaptive Parameter-wise Diagonal
    Quasi-Newton Method for Nonconvex Stochastic Optimization`__.

    Arguments:
        params: iterable of parameters to optimize or dicts defining
            parameter groups
        lr: learning rate (default: 1e-2)
        beta: coefficient used for computing
            running averages of gradient (default: 0.9)
        eps: term added to the denominator to improve
            numerical stability (default: 1e-4)
        warmup: number of warmup steps (default: 0)
        init_lr: initial learning rate for warmup (default: 0.01)
        weight_decay: weight decay (L2 penalty) (default: 0)

    Example:
        >>> import torch_optimizer as optim
        >>> optimizer = optim.Apollo(model.parameters(), lr=0.01)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    __ https://arxiv.org/abs/2009.13586

    Note:
        Reference code: https://github.com/XuezheMax/apollo
    {Gz??-C6?r   paramslrbetaepswarmupinit_lrweight_decayc           	   	      s   |dkrt d||dk rt d|d|  kr dk s(n t d|d|ks3t d|d|ks>t d|d|  krHdksPn t d|t|||||||d	}tt| || d S )
Ng        zInvalid learning rate: {}zInvalid epsilon value: {}g      ?zInvalid beta parameter: {}zInvalid weight_decay value: {}zInvalid warmup updates: {}z!Invalid initial learning rate: {})r   r   r   r   r   base_lrr   )
ValueErrorformatdictsuperr   __init__)	selfr   r   r   r   r   r   r   defaults	__class__ J/home/ubuntu/.local/lib/python3.10/site-packages/torch_optimizer/apollo.pyr   &   s4   
	zApollo.__init__Nclosurereturnc                 C   s   d}|dur	| }| j D ]}|d D ]}|jdu rq| j| }t|dkrGd|d< tj|tjd|d< tj|tjd|d< tj|tjd|d< |d |d	 k rd|d
 |d  |d  |d	  |d  }n|d }|jj}|jrst	d|d dkr|j
||d d}|d }|d }	|d }
|d }|d  d7  < d||d   }d| | }||	 }|	j||d |jdd
|d }|| ||}||| | |
|  }|
|| |
 jdd}||	| |jj|| d qq|S )zPerforms a single optimization step.

        Arguments:
            closure: A closure that reevaluates the model and returns the loss.
        Nr   r   step)memory_formatexp_avg_gradapprox_hessianupdater   r   r   r   z'Atom does not support sparse gradients.r   )alphar   r      )pr   )min)param_groupsgradstatelentorch
zeros_likepreserve_formatdata	is_sparseRuntimeErroraddadd_normdiv_mulmul_sumaddcmul_absclamp_copy_div)r   r   lossgroupr'   r+   curr_lrr*   r   r"   Bd_pbias_correctionr%   
delta_graddenomv_sqdeltar   r   r   r    L   sr   







GzApollo.step)r   r	   r
   r   r   r   )N)__name__
__module____qualname____doc__r   floatintr   r   r   r    __classcell__r   r   r   r   r      s0    !&r   )r-   torch.optim.optimizerr   typesr   r   r   r   r   r   r   r   <module>   s    