import math
from typing import List, Optional

import torch
from torch.optim.optimizer import Optimizer

from .types import Betas2, OptFloat, OptLossClosure, Params

Grads = Params

__all__ = ('Adahessian',)


class Adahessian(Optimizer):
    r"""Implements Adahessian Algorithm.
    It has been proposed in `ADAHESSIAN: An Adaptive Second Order Optimizer
    for Machine Learning`.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 0.15)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-4)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        hessian_power (float, optional): Hessian power (default: 0.5)
        seed (int, optional): Random number generator seed (default: None)

    Example:
        >>> import torch_optimizer as optim
        >>> optimizer = optim.Adahessian(model.parameters(), lr=1.0)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward(create_graph=True)
        >>> optimizer.step()

    __ https://arxiv.org/abs/2006.00719

    Note:
        Reference code: https://github.com/amirgholami/adahessian
    """

    def __init__(
        self,
        params: Params,
        lr: float = 0.15,
        betas: Betas2 = (0.9, 0.999),
        eps: float = 1e-4,
        weight_decay: float = 0,
        hessian_power: float = 0.5,
        seed: Optional[int] = None,
    ) -> None:
        if lr <= 0.0:
            raise ValueError('Invalid learning rate: {}'.format(lr))
        if eps <= 0.0:
            raise ValueError('Invalid epsilon value: {}'.format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError('Invalid beta parameter at index 0: {}'.format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError('Invalid beta parameter at index 1: {}'.format(betas[1]))
        if not 0.0 <= hessian_power <= 1.0:
            raise ValueError('Invalid Hessian power value: {}'.format(hessian_power))
        if seed is not None:
            torch.manual_seed(seed)
        defaults = dict(
            lr=lr, betas=betas, eps=eps,
            weight_decay=weight_decay, hessian_power=hessian_power,
        )
        super(Adahessian, self).__init__(params, defaults)

    def get_trace(self, params: Params, grads: Grads) -> List[torch.Tensor]:
        """Get an estimate of Hessian Trace.
        This is done by computing the Hessian vector product with a random
        vector v at the current gradient point, to estimate Hessian trace by
        computing the gradient of <gradsH, v>.
        :param gradsH: a list of torch variables
        :return: a list of torch tensors
        """
        for i, grad in enumerate(grads):
            if grad.grad_fn is None:
                msg = (
                    'Gradient tensor {:} does not have grad_fn. When calling '
                    'loss.backward(), make sure the option create_graph is set to True.'
                )
                raise RuntimeError(msg.format(i))

        # Rademacher vectors (entries +1/-1), one per parameter, for Hutchinson's estimator.
        v = [
            2 * torch.randint_like(p, high=2, memory_format=torch.preserve_format) - 1
            for p in params
        ]

        # Hessian-vector products: gradient of <grads, v> with respect to the parameters.
        hvs = torch.autograd.grad(
            grads, params, grad_outputs=v, only_inputs=True, retain_graph=True
        )

        hutchinson_trace = []
        for hv in hvs:
            param_size = hv.size()
            if len(param_size) <= 2:
                # 0D/1D/2D tensors (biases, linear weights): element-wise estimate.
                tmp_output = hv.abs()
            elif len(param_size) == 4:
                # Conv kernels: average the estimate over the spatial dimensions.
                tmp_output = torch.mean(torch.abs(hv), dim=[2, 3], keepdim=True)
            hutchinson_trace.append(tmp_output)

        return hutchinson_trace

    def step(self, closure: OptLossClosure = None) -> OptFloat:
        """Perform a single optimization step.

        Arguments:
            closure: A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        params = []
        groups = []
        grads = []

        # Flatten parameter groups so get_trace can be called with plain lists.
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    params.append(p)
                    groups.append(group)
                    grads.append(p.grad)

        # Estimate the Hessian diagonal with Hutchinson's method.
        hut_traces = self.get_trace(params, grads)

        for (p, group, grad, hut_trace) in zip(params, groups, grads, hut_traces):
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p.data)
                # Exponential moving average of squared Hessian diagonal
                state['exp_hessian_diag_sq'] = torch.zeros_like(p.data)

            exp_avg = state['exp_avg']
            exp_hessian_diag_sq = state['exp_hessian_diag_sq']
            beta1, beta2 = group['betas']
            state['step'] += 1

            # Decay the first and second moment running averages.
            exp_avg.mul_(beta1).add_(grad.detach_(), alpha=1 - beta1)
            exp_hessian_diag_sq.mul_(beta2).addcmul_(hut_trace, hut_trace, value=1 - beta2)

            bias_correction1 = 1 - beta1 ** state['step']
            bias_correction2 = 1 - beta2 ** state['step']

            # Square root of the second moment, raised to the Hessian power.
            k = group['hessian_power']
            denom = (
                (exp_hessian_diag_sq.sqrt() ** k) / math.sqrt(bias_correction2) ** k
            ).add_(group['eps'])

            # Parameter update, including the L2 weight-decay term.
            p.data = p.data - group['lr'] * (
                exp_avg / bias_correction1 / denom + group['weight_decay'] * p.data
            )

        return loss