""" Adafactor Optimizer

Lifted from https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py

Modified by Ross Wightman to fix some issues with factorization dims for non nn.Linear layers

Original header/copyright below.
    N)OptionalTuple   )ParamsTc                       s   e Zd ZdZ											
		d%dedee dededededeeeef  dededede	def fddZ
 fddZedd Zed&ddZedd  Zd!d" Ze d'd#d$Z  ZS )(	Adafactora  Implements Adafactor algorithm.

    This implementation is based on: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost`
    (see https://arxiv.org/abs/1804.04235)

    Note that this optimizer internally adjusts the learning rate depending on the
    *scale_parameter*, *relative_step* and *warmup_init* options.

    To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
    `relative_step=False`.
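
    Example (a minimal usage sketch; `model`, `inputs`, `targets` are placeholders)::

        optimizer = Adafactor(model.parameters())  # relative-step mode, lr computed from step count

        # manual/external schedule: passing an explicit lr disables relative_step
        optimizer = Adafactor(model.parameters(), lr=1e-3, scale_parameter=False)

        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()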

    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: external learning rate
        eps: regularization constant for the squared gradient
        eps_scale: regularization constant for the parameter scale
        clip_threshold: threshold of root-mean-square of final gradient update
        decay_rate: coefficient used to compute running averages of square gradient
        betas: coefficients used for computing running averages of gradient; only betas[0] is used
        weight_decay: weight decay
        scale_parameter: if True, learning rate is scaled by root-mean-square of parameter
        warmup_init: time-dependent learning rate computation depends on whether warm-up initialization is being used
        min_dim_size_to_factor: minimum dimension size for the squared-gradient state of a tensor to be factored
        caution: apply caution (update masking) as in 'Cautious Optimizers' (https://arxiv.org/abs/2411.16085)
    """

    def __init__(
            self,
            params: ParamsT,
            lr: Optional[float] = None,
            eps: float = 1e-30,
            eps_scale: float = 1e-3,
            clip_threshold: float = 1.0,
            decay_rate: float = -0.8,
            betas: Optional[Tuple[float, float]] = None,
            weight_decay: float = 0.0,
            scale_parameter: bool = True,
            warmup_init: bool = False,
            min_dim_size_to_factor: int = 32,
            caution: bool = False,
    ):
        relative_step = not lr
        if warmup_init and not relative_step:
            raise ValueError('warmup_init requires relative_step=True')

        beta1 = None if betas is None else betas[0]  # make it compatible with the standard betas arg
        defaults = dict(
            lr=lr,
            eps=eps,
            eps_scale=eps_scale,
            clip_threshold=clip_threshold,
            decay_rate=decay_rate,
            beta1=beta1,
            weight_decay=weight_decay,
            scale_parameter=scale_parameter,
            relative_step=relative_step,
            warmup_init=warmup_init,
            min_dim_size_to_factor=min_dim_size_to_factor,
            caution=caution,
        )
        super(Adafactor, self).__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('caution', False)
            group.setdefault('min_dim_size_to_factor', 32)

    @staticmethod
    def _get_lr(param_group, param_state):
        if param_group['relative_step']:
            min_step = 1e-6 * param_state['step'] if param_group['warmup_init'] else 1e-2
            lr_t = min(min_step, 1.0 / math.sqrt(param_state['step']))
            param_scale = 1.0
            if param_group['scale_parameter']:
                param_scale = max(param_group['eps_scale'], param_state['RMS'])
            param_group['lr'] = lr_t * param_scale
        return param_group['lr']

    @staticmethod
    def _get_options(param_group, param_shape, min_size_to_factor=32):
        use_first_moment = param_group['beta1'] is not None
        factored = None
        ndim = len(param_shape)
        # pick factorization dims: leading dims for > 2D weights (e.g. convs), else the trailing two
        if ndim > 2 and param_shape[0] > min_size_to_factor and param_shape[1] > min_size_to_factor:
            factored = 0, 1
        elif ndim >= 2 and param_shape[-2] > min_size_to_factor and param_shape[-1] > min_size_to_factor:
            factored = ndim - 2, ndim - 1
        return factored, use_first_moment

    @staticmethod
    def _rms(tensor):
        return tensor.norm(2) / (tensor.numel() ** 0.5)

    def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col, dim_col, dim_row):
        # outer product of normalized row and col statistics approximates the full second moment
        r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=dim_row, keepdim=True)).rsqrt_().unsqueeze(dim_row)
        c_factor = exp_avg_sq_col.unsqueeze(dim_col).rsqrt()
        return torch.mul(r_factor, c_factor)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.dtype in (torch.float16, torch.bfloat16):
                    grad = grad.float()
                if grad.is_sparse:
                    raise RuntimeError('Adafactor does not support sparse gradients.')

                state = self.state[p]

                factored_dims, use_first_moment = self._get_options(
                    group,
                    grad.shape,
                    min_size_to_factor=group['min_dim_size_to_factor'],
                )
                # State Initialization
                if len(state) == 0:
                    state['step'] = 0
                    if use_first_moment:
                        # Exponential moving average of gradient values
                        state['exp_avg'] = torch.zeros_like(grad)
                    if factored_dims is not None:
                        dim_col, dim_row = factored_dims

                        def _remove_dim(shape, dim):
                            return shape[:dim] + shape[dim + 1:]

                        state['exp_avg_sq_row'] = torch.zeros(_remove_dim(grad.shape, dim_row)).to(grad)
                        state['exp_avg_sq_col'] = torch.zeros(_remove_dim(grad.shape, dim_col)).to(grad)
                    else:
                        state['exp_avg_sq'] = torch.zeros_like(grad)

                    state['RMS'] = 0
                else:
                    if use_first_moment:
                        state['exp_avg'] = state['exp_avg'].to(grad)
                    if factored_dims is not None:
                        state['exp_avg_sq_row'] = state['exp_avg_sq_row'].to(grad)
                        state['exp_avg_sq_col'] = state['exp_avg_sq_col'].to(grad)
                    else:
                        state['exp_avg_sq'] = state['exp_avg_sq'].to(grad)

                p_fp32 = p
                if p.dtype in (torch.float16, torch.bfloat16):
                    p_fp32 = p_fp32.float()

                state['step'] += 1
                state['RMS'] = self._rms(p_fp32)
                lr_t = self._get_lr(group, state)

                beta2t = 1.0 - math.pow(state['step'], group['decay_rate'])
                update = grad ** 2 + group['eps']
                if factored_dims is not None:
                    dim_col, dim_row = factored_dims
                    exp_avg_sq_row = state['exp_avg_sq_row']
                    exp_avg_sq_col = state['exp_avg_sq_col']

                    exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=dim_row), alpha=1.0 - beta2t)
                    exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=dim_col), alpha=1.0 - beta2t)

                    # Approximation of exponential moving average of square of gradient
                    update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col, dim_col, dim_row)
                    update.mul_(grad)
                else:
                    exp_avg_sq = state['exp_avg_sq']

                    exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t)
                    update = exp_avg_sq.rsqrt().mul_(grad)

                update.div_((self._rms(update) / group['clip_threshold']).clamp_(min=1.0))
                update.mul_(lr_t)

                if use_first_moment:
                    exp_avg = state['exp_avg']
                    exp_avg.mul_(group['beta1']).add_(update, alpha=1 - group['beta1'])
                    if group['caution']:
                        # apply caution as per 'Cautious Optimizers': mask update elements whose
                        # sign disagrees with the current gradient
                        mask = (exp_avg * grad > 0).to(grad.dtype)
                        mask.div_(mask.mean().clamp_(min=1e-3))
                        update = exp_avg * mask
                    else:
                        update = exp_avg

                if group['weight_decay'] != 0:
                    p_fp32.add_(p_fp32, alpha=-group['weight_decay'] * lr_t)

                p_fp32.add_(-update)
                if p.dtype in (torch.float16, torch.bfloat16):
                    p.copy_(p_fp32)

        return loss