o
    پi2'                     @   s0   d dl Z d dlZd dlmZ G dd deZdS )    N)	Optimizerc                       sb   e Zd ZdZ									d fdd		Z fd
dZe dd Ze dddZ	  Z
S )	AdaBeliefa  Implements AdaBelief algorithm. Modified from Adam in PyTorch

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-16)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        decoupled_decay (boolean, optional): (default: True) If set as True, then
            the optimizer uses decoupled weight decay as in AdamW
        fixed_decay (boolean, optional): (default: False) This is used when weight_decouple
            is set as True.
            When fixed_decay == True, the weight decay is performed as
            $W_{new} = W_{old} - W_{old} \times decay$.
            When fixed_decay == False, the weight decay is performed as
            $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the
            weight decay ratio decreases with learning rate (lr).
        rectify (boolean, optional): (default: True) If set as True, then perform the rectified
            update similar to RAdam
        degenerated_to_sgd (boolean, optional) (default:True) If set as True, then perform SGD update
            when variance of gradient is high
    reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020

    For a complete table of recommended hyperparameters, see https://github.com/juntang-zhuang/Adabelief-Optimizer'
    For example train/args for EfficientNet see these gists
      - link to train_script: https://gist.github.com/juntang-zhuang/0a501dd51c02278d952cf159bc233037
      - link to args.yaml: https://gist.github.com/juntang-zhuang/517ce3c27022b908bb93f78e4f786dc3
    MbP?g?g+?缉ؗҜ<r   FTc                    s<  d|kst d|d|kst d|d|d   kr"dk s,n t d|d d|d   kr8dk sBn t d|d t|ttfr~t|dkr~t|d tr~|D ]%}d	|v r}|d	 d |d ksr|d	 d |d kr}d
d tdD |d< qXt||||||
||	|dd tdD d
}tt	| 
|| d S )Ng        zInvalid learning rate: {}zInvalid epsilon value: {}r         ?z%Invalid beta parameter at index 0: {}   z%Invalid beta parameter at index 1: {}betasc                 S      g | ]}g d qS )NNN .0_r   r   H/home/ubuntu/.local/lib/python3.10/site-packages/timm/optim/adabelief.py
<listcomp>C       z&AdaBelief.__init__.<locals>.<listcomp>
   bufferc                 S   r
   r   r   r   r   r   r   r   O   r   )
lrr	   epsweight_decayamsgraddegenerated_to_sgddecoupled_decayrectifyfixed_decayr   )
ValueErrorformat
isinstancelisttuplelendictrangesuperr   __init__)selfparamsr   r	   r   r   r   r   r   r   r   paramdefaults	__class__r   r   r&   *   s4   (0zAdaBelief.__init__c                    s,   t t| | | jD ]}|dd qd S )Nr   F)r%   r   __setstate__param_groups
setdefault)r'   stategroupr+   r   r   r-   S   s   
zAdaBelief.__setstate__c                 C   sf   | j D ]-}|d D ]&}| j| }|d }d|d< t||d< t||d< |r/t||d< q	qd S )Nr(   r   r   stepexp_avgexp_avg_varmax_exp_avg_var)r.   r0   torch
zeros_like)r'   r1   pr0   r   r   r   r   resetX   s   

zAdaBelief.resetNc                 C   s  d}|durt   | }W d   n1 sw   Y  | jD ]}|d D ]}|jdu r0q'|j}|jt jt jhv r@| }|jrGt	d|}|jt jt jhv rV| }|d }|d \}}	| j
| }
t|
dkrd|
d< t ||
d< t ||
d	< |rt ||
d
< |d r|d s|d|d |d    n|d|d   n|d dkr|j||d d |
d |
d	 }}|
d  d7  < d||
d   }d|	|
d   }||j|d| d || }||	j||d|	 d |r|
d
 }t j|||d |d | t| |d }n||d  t| |d }|d s>|d | }|j||| d n|d t|
d d  }|
d |d kr]|d |d }}ne|
d |d< |	|
d  }dd|	  d }|d|
d  | d|   }||d< |dkrtd| |d  |d  |d  | | |d  d||
d    }n|d rdd||
d    }nd}||d< |dkr| |d }|j||| |d  d n|dkr|j|| |d  d |jt jt jhv r|| q'q |S )zPerforms a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr(   zOAdaBelief does not support sparse gradients, please consider SparseAdam insteadr   r	   r   r2   r3   r4   r5   r   r   r   r   r   )alphar   )valuer   )outr   r   r            r   )r6   enable_gradr.   graddtypefloat16bfloat16float	is_sparseRuntimeErrorr0   r"   r7   mul_add_addcmul_maxsqrtmathaddcdiv_intcopy_)r'   closurelossr1   r8   rB   p_fp32r   beta1beta2r0   r3   r4   bias_correction1bias_correction2grad_residualr5   denom	step_sizebufferednum_smabeta2_tnum_sma_maxr   r   r   r2   j   s   


&





czAdaBelief.step)	r   r   r   r   FTFTT)N)__name__
__module____qualname____doc__r&   r-   r6   no_gradr9   r2   __classcell__r   r   r+   r   r      s"    &)
r   )rN   r6   torch.optim.optimizerr   r   r   r   r   r   <module>   s    