""" RMSProp modified to behave like Tensorflow impl

Originally cut & paste from PyTorch RMSProp
https://github.com/pytorch/pytorch/blob/063946d2b3f3f1e953a2a3b54e0b34f1393de295/torch/optim/rmsprop.py
Licensed under BSD-Clause 3 (ish), https://github.com/pytorch/pytorch/blob/master/LICENSE

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Modifications Copyright 2021 Ross Wightman
"""
import torch
from torch.optim import Optimizer

from ._types import ParamsT


class RMSpropTF(Optimizer):
    """Implements RMSprop algorithm (TensorFlow style epsilon)

    NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt
    and a few other modifications to closer match Tensorflow for matching hyper-params.

    Noteworthy changes include:
    1. Epsilon applied inside square-root
    2. square_avg initialized to ones
    3. LR scaling of update accumulated in momentum buffer
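
    As a rough sketch (non-centered update with momentum and ``lr_in_momentum=True``,
    weight decay omitted), each step computes approximately::

        square_avg = alpha * square_avg + (1 - alpha) * grad**2
        buf = momentum * buf + lr * grad / sqrt(square_avg + eps)
        p = p - buf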

    Proposed by G. Hinton in his
    `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.

    The centered version first appears in `Generating Sequences
    With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.
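
    In the centered variant the running gradient mean is tracked as well, and the
    denominator becomes (sketch matching the code below)::

        avg = sqrt(square_avg - grad_avg**2 + eps)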

    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate
        momentum: momentum factor
        alpha: smoothing (decay) constant
        eps: term added to the denominator to improve numerical stability
        centered: if ``True``, compute the centered RMSProp, where the gradient is normalized by an estimate of its variance
        weight_decay: weight decay (L2 penalty) (default: 0)
        decoupled_decay: decoupled weight decay as per https://arxiv.org/abs/1711.05101
        corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr) when decoupled_decay is True
        lr_in_momentum: learning rate scaling is included in the momentum buffer update as per defaults in Tensorflow
        caution: apply the cautious optimizer masking to the update, as per 'Cautious Optimizers' (https://arxiv.org/abs/2411.16085)
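
    Example:
        A minimal usage sketch; ``model``, ``criterion``, ``inputs`` and ``targets``
        are placeholders rather than part of this module::

            optimizer = RMSpropTF(model.parameters(), lr=0.01, alpha=0.9, momentum=0.9)
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()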
    """

    def __init__(
            self,
            params: ParamsT,
            lr: float = 1e-2,
            alpha: float = 0.9,
            eps: float = 1e-10,
            weight_decay: float = 0,
            momentum: float = 0.,
            centered: bool = False,
            decoupled_decay: bool = False,
            corrected_weight_decay: bool = False,
            lr_in_momentum: bool = True,
            caution: bool = False,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= momentum:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= alpha:
            raise ValueError("Invalid alpha value: {}".format(alpha))

        defaults = dict(
            lr=lr,
            momentum=momentum,
            alpha=alpha,
            eps=eps,
            centered=centered,
            weight_decay=weight_decay,
            decoupled_decay=decoupled_decay,
            corrected_weight_decay=corrected_weight_decay,
            lr_in_momentum=lr_in_momentum,
            caution=caution,
        )
        super(RMSpropTF, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RMSpropTF, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('momentum', 0)
            group.setdefault('centered', False)
            group.setdefault('caution', False)
            group.setdefault('corrected_weight_decay', False)
zRMSpropTF.__setstate__Nc                 C   s  d}|durt   | }W d   n1 sw   Y  | jD ]}|d D ]}|jdu r0q'|j}|jr:td| j| }t|dkrhd|d< t ||d< |d dkr]t 	||d< |d	 rht 	||d
< |d }d|d  }|d  d7  < |d dkr|d r|d r|d d | j
d  }	n|d }	|d|	|d    n	|j||d d}|j|d| |d |d	 r|d
 }
|
j||
 |d |j|
|
dd|d  }n	||d  }|d dkr4|d }||d  dd }|d r|j|||d d |d r|||}||  q'||| |d r)|||}|j||d  d q'|j|||d  d q'q |S )zPerforms a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr
   z)RMSprop does not support sparse gradientsr   step
square_avgr   momentum_bufferr   grad_avgg      ?r   r   r   r   r   r      )r   )valuer   c                 S   s2   | | dk |j}|| jdd | | S )Nr   gMbP?)min)todtypediv_meanclamp_)_m_gmaskr   r   r   _apply_caution   s   z&RMSpropTF.step.<locals>._apply_cautionr   r   )torchenable_gradr!   grad	is_sparseRuntimeErrorr#   len	ones_like
zeros_liker   mul_addadd_powaddcmulsqrt_addcdiv_)r   closurelossr$   pr8   r#   r&   one_minus_alphawd_scaler(   avgbufr5   r   r   r   r%   b   sh   


 




EzRMSpropTF.step)
r   r   r   r   r	   FFFTF)N)__name__
__module____qualname____doc__r   floatboolr   r    r6   no_gradr%   __classcell__r   r   r   r   r      sL    !	
'r   )rO   r6   torch.optimr   _typesr   r   r   r   r   r   <module>   s
    
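

if __name__ == '__main__':
    # Hedged smoke test, not part of the original module. Because of the relative
    # import above, run it as a module: `python -m timm.optim.rmsprop_tf`.
    # It fits a single linear layer on synthetic data and checks the loss drops.
    torch.manual_seed(0)
    w_true = torch.randn(8, 1)
    x = torch.randn(128, 8)
    y = x @ w_true
    model = torch.nn.Linear(8, 1, bias=False)
    optimizer = RMSpropTF(model.parameters(), lr=0.01, momentum=0.9, caution=True)
    for _ in range(200):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()
    print(f'final MSE: {loss.item():.6f}')  # should be far below the initial loss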