""" AdaHessian Optimizer

Lifted from https://github.com/davda54/ada-hessian/blob/master/ada_hessian.py
Originally licensed MIT, Copyright 2020, David Samuel
"""
import torch


class Adahessian(torch.optim.Optimizer):
    """
    Implements the AdaHessian algorithm from "ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning"

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining parameter groups
        lr (float, optional): learning rate (default: 0.1)
        betas ((float, float), optional): coefficients used for computing running averages of gradient and the
            squared hessian trace (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0.0)
        hessian_power (float, optional): exponent of the hessian trace (default: 1.0)
        update_each (int, optional): compute the hessian trace approximation only after *this* number of steps
            (to save time) (default: 1)
        n_samples (int, optional): how many times to sample `z` for the approximation of the hessian trace (default: 1)
        avg_conv_kernel (bool, optional): average the accumulated hessian trace over the spatial dimensions of
            4D (convolution) parameters (default: False)
    """

    def __init__(
            self,
            params,
            lr=0.1,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=0.0,
            hessian_power=1.0,
            update_each=1,
            n_samples=1,
            avg_conv_kernel=False,
    ):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
        if not 0.0 <= hessian_power <= 1.0:
            raise ValueError(f"Invalid Hessian power value: {hessian_power}")

        self.n_samples = n_samples
        self.update_each = update_each
        self.avg_conv_kernel = avg_conv_kernel

        # use a separate generator that deterministically generates
        # the same `z`s across all GPUs in case of distributed training
        self.seed = 2147483647
        self.generator = torch.Generator().manual_seed(self.seed)

        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, hessian_power=hessian_power)
        super(Adahessian, self).__init__(params, defaults)

        for p in self.get_params():
            p.hess = 0.0
            self.state[p]["hessian step"] = 0

    @property
    def is_second_order(self):
        return True

    def get_params(self):
        """
        Gets all parameters in all param_groups with gradients
        """
        return (p for group in self.param_groups for p in group['params'] if p.requires_grad)

    def zero_hessian(self):
        """
        Zeros out the accumulated hessian traces.
        """
        for p in self.get_params():
            if not isinstance(p.hess, float) and self.state[p]["hessian step"] % self.update_each == 0:
                p.hess.zero_()
    @torch.no_grad()
    def set_hessian(self):
        """
        Computes the Hutchinson approximation of the hessian trace and accumulates it for each trainable parameter.
        """
        params = []
        for p in filter(lambda p: p.grad is not None, self.get_params()):
            if self.state[p]["hessian step"] % self.update_each == 0:  # compute the trace only each `update_each` step
                params.append(p)
            self.state[p]["hessian step"] += 1

        if len(params) == 0:
            return

        if self.generator.device != params[0].device:  # hackish way of casting the generator to the right device
            self.generator = torch.Generator(params[0].device).manual_seed(self.seed)

        grads = [p.grad for p in params]

        for i in range(self.n_samples):
            # Rademacher distribution {-1.0, 1.0}
            zs = [torch.randint(0, 2, p.size(), generator=self.generator, device=p.device) * 2.0 - 1.0
                  for p in params]
            h_zs = torch.autograd.grad(
                grads, params, grad_outputs=zs, only_inputs=True, retain_graph=i < self.n_samples - 1)
            for h_z, z, p in zip(h_zs, zs, params):
                p.hess += h_z * z / self.n_samples  # approximate the expected values of z * (H @ z)
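
    # The parameter update in `step` mirrors AdamW, with the squared gradient
    # moment replaced by the squared moment of the estimated Hessian diagonal:
    #
    #     denom = (exp_hessian_diag_sq / bias_correction2) ** (hessian_power / 2) + eps
    #     p    <- p * (1 - lr * weight_decay) - lr / bias_correction1 * exp_avg / denom
    #
    # With hessian_power = 1.0 this is the full AdaHessian preconditioner; smaller
    # values damp it (at 0.0 the denominator collapses to 1 + eps).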
    @torch.no_grad()
    def step(self, closure=None):
        """
        Performs a single optimization step.
        Arguments:
            closure (callable, optional) -- a closure that reevaluates the model and returns the loss (default: None)
        """
        loss = None
        if closure is not None:
            loss = closure()

        self.zero_hessian()
        self.set_hessian()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None or p.hess is None:
                    continue

                if self.avg_conv_kernel and p.dim() == 4:
                    p.hess = torch.abs(p.hess).mean(dim=[2, 3], keepdim=True).expand_as(p.hess).clone()

                # Perform correct stepweight decay as in AdamW
                p.mul_(1 - group['lr'] * group['weight_decay'])

                state = self.state[p]

                # State initialization (the state dict already holds "hessian step")
                if len(state) == 1:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p)
                    # Exponential moving average of Hessian diagonal square values
                    state['exp_hessian_diag_sq'] = torch.zeros_like(p)

                exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1

                # Decay the first and second moment running average coefficients
                exp_avg.mul_(beta1).add_(p.grad, alpha=1 - beta1)
                exp_hessian_diag_sq.mul_(beta2).addcmul_(p.hess, p.hess, value=1 - beta2)

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                k = group['hessian_power']
                denom = (exp_hessian_diag_sq / bias_correction2).pow_(k / 2).add_(group['eps'])

                # make the update
                step_size = group['lr'] / bias_correction1
                p.addcdiv_(exp_avg, denom, value=-step_size)

        return loss