import math
from typing import Any, Dict, Tuple

import torch
from torch.optim.optimizer import Optimizer

from .types import OptFloat, OptLossClosure, Params, State

Eps2 = Tuple[float, float]
ParamGroup = Dict[str, Any]


class Adafactor(Optimizer):
    r"""Implements Adafactor algorithm.

    It has been proposed in: `Adafactor: Adaptive Learning Rates with
    Sublinear Memory Cost`__.

    Arguments:
        params: iterable of parameters to optimize or dicts defining
            parameter groups
        lr: external learning rate (default: None)
        eps2: regularization constants for the squared gradient
            and parameter scale respectively (default: (1e-30, 1e-3))
        clip_threshold: threshold of root mean square of
            final gradient update (default: 1.0)
        decay_rate: coefficient used to compute running averages of square
            gradient (default: -0.8)
        beta1: coefficient used for computing running averages of gradient
            (default: None)
        weight_decay: weight decay (L2 penalty) (default: 0)
        scale_parameter: if true, learning rate is scaled by root mean square
            of parameter (default: True)
        relative_step: if true, time-dependent learning rate is computed
            instead of external learning rate (default: True)
        warmup_init: time-dependent learning rate computation depends on
            whether warm-up initialization is being used (default: False)

    Example:
        >>> import torch_optimizer as optim
        >>> optimizer = optim.Adafactor(model.parameters())
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    __ https://arxiv.org/abs/1804.04235

    Note:
        Reference code: https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py  # noqa
    NgKH9gMbP?      ?皙        TFparamslreps2clip_threshold
decay_ratebeta1weight_decayscale_parameterrelative_stepwarmup_initc                    sd   |d ur|dkrt d||dk rt d|t||||||||	|
d	}tt| || d S )Nr   zInvalid learning rate: {}zInvalid weight_decay value: {})	r   r   r   r   r   r   r   r   r   )
ValueErrorformatdictsuperr   __init__)selfr   r   r   r   r   r   r   r   r   r   defaults	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/torch_optimizer/adafactor.pyr   4   s$   zAdafactor.__init__param_groupparam_statereturnc                 C   sj   |d }|d r |d rd|d  nd}t |dt|d  }d}|d r1t|d	 d
 |d }|| S )Nr   r   r   gư>stepg{Gz?r   r   r   r   RMS)minmathsqrtmax)r   r%   r&   rel_step_szmin_stepparam_scaler#   r#   r$   _get_lrU   s   zAdafactor._get_lrparam_shape.c                 C   s    t |dk}|d d u}||fS )N   r   )len)r   r%   r2   factoreduse_first_momentr#   r#   r$   _get_optionsc   s   zAdafactor._get_optionstensorc                 C   s   | d| d  S )Nr3   g      ?)normnumel)r   r8   r#   r#   r$   _rmsj   s   zAdafactor._rmsexp_avg_sq_rowexp_avg_sq_coloutputc                 C   s<   ||j dd  d}|d }tj|||d d S )Ndimout)meanrsqrt_	unsqueezersqrttorchmul)r   r<   r=   r>   r_factorc_factorr#   r#   r$   _approx_sq_gradm   s   zAdafactor._approx_sq_gradclosurec              	   C   s  d}|dur	| }| j D ]4}|d D ],}|jdu rq|jj}|jr'td| j| }|j}| ||\}}	t|dkrzd|d< |	rJt	
||d< |rot	|dd ||d< t	|dd	 |dd  ||d
< nt	
||d< d|d< |d  d7  < | |j|d< | ||}
dt|d |d  }|d |d d  }|r|d }|d
 }||j|jddd| d ||j|jd	dd| d | ||| || n|d }||j|d| d t	j||d| |td| ||d   ||
 |	r$|d }||d j|d|d  d |}|d dkr9|jj|j|d  |
 d |j|  qq|S )zPerforms a single optimization step.

        Arguments:
            closure: A closure that reevaluates the model and returns the loss.
        Nr   z,Adafactor does not support sparse gradients.r   r(   exp_avgr?   r<   rB   r=   
exp_avg_sqr)   r   r   r   r3   r   r@   )alpharC   r   r   r   )param_groupsgraddata	is_sparseRuntimeErrorstateshaper7   r4   rI   
zeros_likezerostype_asr;   r1   r+   powmul_add_rE   rM   rH   div_r-   )r   rN   lossgroupprS   rW   
grad_shaper5   r6   r   beta2tupdater<   r=   rP   rO   r#   r#   r$   r(   {   s   





PzAdafactor.step)	Nr   r   r   Nr   TTF)N)__name__
__module____qualname____doc__r	   r   Eps2floatboolr   
ParamGroupr
   r1   r   intr7   rI   Tensorr;   rM   r   r(   __classcell__r#   r#   r!   r$   r      sf    )	
!



r   )r+   typingr   r   r   rI   torch.optim.optimizerr   typesr   r   r	   r
   rk   rj   strrm   r   r#   r#   r#   r$   <module>   s    
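

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the upstream module).
# It fits a toy linear regression for a few steps with the default
# relative-step learning rate; the model, data, and weight_decay value are
# arbitrary assumptions. Run as ``python -m torch_optimizer.adafactor`` so
# the relative import of ``.types`` resolves.
if __name__ == '__main__':  # pragma: no cover
    model = torch.nn.Linear(10, 1)
    optimizer = Adafactor(model.parameters(), weight_decay=1e-4)
    x, y = torch.randn(32, 10), torch.randn(32, 1)
    for _ in range(5):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()
    print('loss after 5 Adafactor steps:', loss.item())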