import torch
from torch.optim.optimizer import Optimizer

__all__ = ['Novograd']


def _check_valid_opt_params(lr, eps, betas):
    if lr < 0:
        raise ValueError(f"Invalid learning rate: {lr}")
    if eps < 0:
        raise ValueError(f"Invalid epsilon value: {eps}")
    if not (0.0 <= betas[0] < 1.0 and 0.0 <= betas[1] < 1.0):
        raise ValueError(f"Betas have to be between 0 and 1: {betas}")


class Novograd(Optimizer):
    """Implements Novograd algorithm.
    It has been proposed in "Stochastic Gradient Methods with Layer-wise
    Adaptive Moments for Training of Deep Networks"
    (https://arxiv.org/abs/1905.11286)
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.95, 0.98))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond"
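        grad_averaging (boolean, optional): multiply the normalized gradient by
            (1 - beta1) before adding it to the first moment (default: False)
        luc (boolean, optional): use layer-wise update clipping (LUC), bounding
            each update by luc_trust times the parameter norm (default: False)
        luc_trust (float, optional): trust coefficient for LUC (default: 1e-3)
        luc_eps (float, optional): term added to the update norm in LUC for
            numerical stability (default: 1e-8)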
    MbP?gffffff?g\(\?:0yE>r   Fc                    sH   t ||| t||||||d}|| _|	| _|
| _tt| || d S )N)r   r	   r   weight_decaygrad_averagingamsgrad)r   dictluc	luc_trustluc_epssuperr   __init__)selfparamsr   r	   r   r   r   r   r   r   r   defaults	__class__r
   r   r   0   s   zNovograd.__init__c                    s,   t t| | | jD ]}|dd qd S )Nr   F)r   r   __setstate__param_groups
setdefault)r   stategroupr   r
   r   r   F   s   
zNovograd.__setstate__Nc                 C   s$  d}|dur	| }| j D ]}|d D ]}|jdu rq|jj}|jr&td|d }| j| }|sYd|d< t|j|d< tg 	|d j
|d< |rYtg 	|d j
|d	< |d |d }}	|rh|d	 }
|d
 \}}|d  d7  < | d}|	dkr|	| n|	|j|d| d |rtj|
|	|
d |
 |d }n	|	 |d }|| |d dkr|j|j|d d |d r|d|  ||| | jrt|j}t|j}| j| || j  }t||d }|jj|| d q|jj||d  d qq|S )zPerforms a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
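        # Per-tensor update, following the paper linked above (wd = weight decay):
        #   v <- beta2 * v + (1 - beta2) * ||g||^2      (v is a scalar per tensor)
        #   m <- beta1 * m + (g / (sqrt(v) + eps) + wd * p)
        #   p <- p - lr * m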
        Nr   z#Sparse gradients are not supported.r   r   stepexp_avg
exp_avg_sqmax_exp_avg_sqr	   r      r   )alpha)outr   r   r   r   )r   graddata	is_sparseRuntimeErrorr!   torch
zeros_likezerostodevicenormpowcopy_mul_add_maxsqrtdiv_r   r   r   min)r   closurelossr"   pr*   r   r!   r$   r%   r&   beta1beta2r3   denom	data_norm	grad_norm
luc_factorr
   r
   r   r#   K   s\   


;zNovograd.step)	r   r   r   r   FFFr   r   )N)__name__