""" RAdam Optimizer.
Implementation lifted from: https://github.com/LiyuanLucasLiu/RAdam
Paper: `On the Variance of the Adaptive Learning Rate and Beyond` - https://arxiv.org/abs/1908.03265

NOTE: This impl has been deprecated in favour of torch.optim.RAdam and remains as a reference
"""
import math

import torch
from torch.optim.optimizer import Optimizer


class RAdamLegacy(Optimizer):
    """ PyTorch RAdam optimizer

    NOTE: This impl has been deprecated in favour of torch.optim.RAdam and remains as a reference
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            # cache of [step, num_sma, step_size] triplets keyed by step % 10, so the
            # scalar rectification term is only recomputed once every 10 steps
            buffer=[[None, None, None] for _ in range(10)]
        )
        super(RAdamLegacy, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(RAdamLegacy, self).__setstate__(state)

    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.float()
                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # updates are computed in fp32, then copied back to the param dtype
                p_fp32 = p.float()
                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                # standard Adam first and second moment updates
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1
                buffered = group['buffer'][state['step'] % 10]
                if state['step'] == buffered[0]:
                    num_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    # length of the approximated SMA of the adaptive learning rate
                    num_sma_max = 2 / (1 - beta2) - 1
                    num_sma = num_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = num_sma

                    # more conservative since it's an approximated value
                    if num_sma >= 5:
                        # variance of the adaptive lr is tractable -> rectified update
                        step_size = group['lr'] * math.sqrt(
                            (1 - beta2_t) *
                            (num_sma - 4) / (num_sma_max - 4) *
                            (num_sma - 2) / num_sma *
                            num_sma_max / (num_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        # fall back to an un-adapted (SGD-with-momentum style) step
                        step_size = group['lr'] / (1 - beta1 ** state['step'])
                    buffered[2] = step_size

                if group['weight_decay'] != 0:
                    p_fp32.add_(p_fp32, alpha=-group['weight_decay'] * group['lr'])

                if num_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                else:
                    p_fp32.add_(exp_avg, alpha=-step_size)

                p.copy_(p_fp32)

        return loss
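

# --- Usage sketch (illustrative addition, not part of the upstream module) ---
# A minimal smoke test showing how RAdamLegacy drops into a standard training
# loop. The tiny model, data, and hyper-parameter values below are hypothetical
# stand-ins chosen for the example, assuming a recent PyTorch install.
if __name__ == '__main__':
    model = torch.nn.Linear(10, 2)
    optimizer = RAdamLegacy(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=1e-2)
    x, y = torch.randn(8, 10), torch.randn(8, 2)
    for _ in range(5):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        # with beta2=0.999 the first few steps take the un-rectified (SGD-like)
        # branch, since num_sma only crosses the >= 5 threshold around step 5
        optimizer.step()
    print('final loss:', loss.item())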