""" PyTorch impl of LaProp optimizer

Code simplified from https://github.com/Z-T-WANG/LaProp-Optimizer, MIT License

Paper: LaProp: Separating Momentum and Adaptivity in Adam, https://arxiv.org/abs/2002.04839

@article{ziyin2020laprop,
  title={LaProp: a Better Way to Combine Momentum with Adaptive Gradient},
  author={Ziyin, Liu and Wang, Zhikang T and Ueda, Masahito},
  journal={arXiv preprint arXiv:2002.04839},
  year={2020}
}

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
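
Example (a minimal usage sketch, not part of the original module; it assumes this
file is importable as timm.optim.laprop, and the toy Linear model and hyperparameter
values are purely illustrative):

    import torch
    from timm.optim.laprop import LaProp

    model = torch.nn.Linear(10, 2)
    optimizer = LaProp(model.parameters(), lr=4e-4, betas=(0.9, 0.999), weight_decay=0.01)

    loss = model(torch.randn(8, 10)).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()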

"""
from typing import Tuple

from torch.optim import Optimizer
import torch

from ._types import ParamsT


class LaProp(Optimizer):
    """ LaProp Optimizer

    Paper: LaProp: Separating Momentum and Adaptivity in Adam, https://arxiv.org/abs/2002.04839
    """
    def __init__(
            self,
            params: ParamsT,
            lr: float = 4e-4,
            betas: Tuple[float, float] = (0.9, 0.999),
            eps: float = 1e-15,
            weight_decay: float = 0.,
            caution: bool = False,
            corrected_weight_decay: bool = False,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            caution=caution,
            corrected_weight_decay=corrected_weight_decay,
        )
        super(LaProp, self).__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('caution', False)
            group.setdefault('corrected_weight_decay', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('LaProp does not support sparse gradients')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of normalized gradient values (momentum)
                    state['exp_avg'] = torch.zeros_like(p)
                    # Accumulators used for LaProp-style bias correction
                    state['exp_avg_lr_1'] = 0.
                    state['exp_avg_lr_2'] = 0.
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                one_minus_beta2 = 1 - beta2
                one_minus_beta1 = 1 - beta1

                # Update the second-moment (adaptivity) estimate
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=one_minus_beta2)

                state['exp_avg_lr_1'] = state['exp_avg_lr_1'] * beta1 + one_minus_beta1 * group['lr']
                state['exp_avg_lr_2'] = state['exp_avg_lr_2'] * beta2 + one_minus_beta2

                bias_correction1 = state['exp_avg_lr_1'] / group['lr'] if group['lr'] != 0. else 1.
                bias_correction2 = state['exp_avg_lr_2']
                step_size = 1 / bias_correction1

                # LaProp divides the gradient by the adaptivity term *before* it is
                # accumulated into momentum (Adam normalizes after accumulation).
                denom = exp_avg_sq.div(bias_correction2).sqrt_().add_(group['eps'])
                step_of_this_grad = grad.div(denom)
                exp_avg.mul_(beta1).add_(step_of_this_grad, alpha=group['lr'] * one_minus_beta1)

                if group['caution']:
                    # Apply caution as per 'Cautious Optimizers' - https://arxiv.org/abs/2411.16085
                    mask = (exp_avg * grad > 0).to(grad.dtype)
                    mask.div_(mask.mean().clamp_(min=1e-3))
                    exp_avg = exp_avg * mask

                p.add_(exp_avg, alpha=-step_size)

                if group['weight_decay'] != 0:
                    if group['corrected_weight_decay']:
                        # Scale decay by lr^2 / default lr as per https://arxiv.org/abs/2506.02285
                        wd_scale = group['lr'] ** 2 / self.defaults['lr']
                    else:
                        wd_scale = group['lr']
                    p.add_(p, alpha=-wd_scale * group['weight_decay'])

        return loss