""" Nvidia NovoGrad Optimizer.
Original impl by Nvidia from Jasper example:
    - https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper
Paper: `Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks`
    - https://arxiv.org/abs/1905.11286
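
Update rule sketch (summary added for readability, not part of the original header; see the
paper above for the exact formulation):
    v_t = beta2 * v_{t-1} + (1 - beta2) * ||g_t||^2      # one scalar per parameter tensor
    m_t = beta1 * m_{t-1} + (g_t / (sqrt(v_t) + eps) + weight_decay * p_t)
    p_{t+1} = p_t - lr * m_t
The parenthesised first-moment contribution is scaled by (1 - beta1) when grad_averaging is enabled.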
"""
import torch
from torch.optim.optimizer import Optimizer


class NvNovoGrad(Optimizer):
    """
    Implements Novograd algorithm.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.95, 0.98))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        grad_averaging: gradient averaging
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
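
    Example (illustrative usage sketch added for clarity, not part of the original docstring;
    ``model``, ``loss_fn``, ``inputs`` and ``targets`` are placeholders)::

        >>> optimizer = NvNovoGrad(model.parameters(), lr=1e-3, betas=(0.95, 0.98),
        ...                        weight_decay=1e-5, grad_averaging=True)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(inputs), targets).backward()
        >>> optimizer.step()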
    """

    def __init__(self, params, lr=1e-3, betas=(0.95, 0.98), eps=1e-8,
                 weight_decay=0, grad_averaging=False, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        grad_averaging=grad_averaging, amsgrad=amsgrad)
        super(NvNovoGrad, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(NvNovoGrad, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
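
        Example with a closure (illustrative sketch, not part of the original docstring;
        ``model``, ``criterion``, ``inputs`` and ``targets`` are placeholders)::

            >>> def closure():
            ...     optimizer.zero_grad()
            ...     loss = criterion(model(inputs), targets)
            ...     loss.backward()
            ...     return loss
            >>> optimizer.step(closure)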
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('Sparse gradients are not supported.')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p)
                    # Exponential moving average of squared gradient values,
                    # kept as a single scalar per parameter tensor (layer-wise)
                    state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Squared L2 norm of this tensor's gradient
                norm = torch.sum(torch.pow(grad, 2))

                if exp_avg_sq == 0:
                    exp_avg_sq.copy_(norm)
                else:
                    exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2)

                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                # Normalize the gradient, apply weight decay, then update the first moment
                grad.div_(denom)
                if group['weight_decay'] != 0:
                    grad.add_(p, alpha=group['weight_decay'])
                if group['grad_averaging']:
                    grad.mul_(1 - beta1)
                exp_avg.mul_(beta1).add_(grad)

                p.add_(exp_avg, alpha=-group['lr'])

        return loss