o
    پi-                  "   @   s0  d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	 G dd de
ZG dd	 d	e	Zd
ee dee dee dee dee dee dededededededededededef"ddZd
ee dee dee dee dee dee dededededededededededef"ddZdS )z Adan Optimizer

Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022.
    https://arxiv.org/abs/2208.06677

Implementation adapted from https://github.com/sail-sg/Adan
    N)ListOptionalTuple)Tensor)	Optimizerc                   @   s$   e Zd ZdZdZdd Zdd ZdS )MultiTensorApplyFc              
   C   sF   z	dt _|| _W d S  ty" } zdt _|t _W Y d }~d S d }~ww )NTF)r   	available
chunk_sizeImportError
import_err)selfr	   err r   C/home/ubuntu/.local/lib/python3.10/site-packages/timm/optim/adan.py__init__"   s   zMultiTensorApply.__init__c                 G   s   || j ||g|R  S N)r	   )r   opnoop_flag_buffertensor_listsargsr   r   r   __call__*   s   zMultiTensorApply.__call__N)__name__
__module____qualname__r   warnedr   r   r   r   r   r   r      s
    r   c                       s   e Zd ZdZ							dded	eeeef d
ededededee f fddZ fddZ	e
 dd Ze
 dddZ  ZS )Adana   Implements a pytorch variant of Adan.

    Adan was proposed in Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models
    https://arxiv.org/abs/2208.06677

    Arguments:
        params: Iterable of parameters to optimize or dicts defining parameter groups.
        lr: Learning rate.
        betas: Coefficients used for first- and second-order moments.
        eps: Term added to the denominator to improve numerical stability.
        weight_decay: Decoupled weight decay (L2 penalty)
        no_prox: How to perform the weight decay
        caution: Enable caution from 'Cautious Optimizers'
        foreach: If True would use torch._foreach implementation. Faster but uses slightly more memory.
    MbP?g\(\?gq=
ףp?gGz?:0yE>        FNlrbetasepsweight_decayno_proxcautionforeachc	           
   	      s   d|kst d|d|kst d|d|d   kr"dk s,n t d|d d|d   kr8dk sBn t d|d d|d	   krNdk sXn t d
|d	 t|||||||d}	t ||	 d S )Nr   zInvalid learning rate: {}zInvalid epsilon value: {}r         ?z%Invalid beta parameter at index 0: {}   z%Invalid beta parameter at index 1: {}   z%Invalid beta parameter at index 2: {})r    r!   r"   r#   r$   r%   r&   )
ValueErrorformatdictsuperr   )
r   paramsr    r!   r"   r#   r$   r%   r&   defaults	__class__r   r   r   ?   s(   
	zAdan.__init__c                    s8   t t| | | jD ]}|dd |dd qd S )Nr$   Fr%   )r-   r   __setstate__param_groups
setdefault)r   stategroupr0   r   r   r2   _   s
   
zAdan.__setstate__c                 C   s`   | j D ]*}d|d< |d D ]}|jr,| j| }t||d< t||d< t||d< qqd S )Nr   stepr.   exp_avg
exp_avg_sqexp_avg_diff)r3   requires_gradr5   torch
zeros_like)r   r6   pr5   r   r   r   restart_opte   s   

zAdan.restart_optc                 C   s2  d}|durt   | }W d   n1 sw   Y  zdt jjj v }W n   d}Y | jD ]}g }g }g }g }g }	g }
|d \}}}d|v rT|d  d7  < nd|d< d||d   }d||d   }d||d   }|d D ]a}|jdu r|qt|| ||j | j	| }t
|d	krt ||d
< t ||d< t ||d< d|vs|d dkr|j  |d< ||d
  ||d  |	|d  |
|d  qt|sq2|d du r|d  p|}n|d }|rt}nt}|||f|||	|
|||||t||d |d |d |d |d d q2|S )z$Performs a single optimization step.NScalarFr!   r7   r(   r'   r.   r   r8   r9   r:   neg_pre_gradr&   r%   r    r#   r"   r$   )exp_avgsexp_avg_sqsexp_avg_diffsneg_pre_gradsbeta1beta2beta3bias_correction1bias_correction2bias_correction3_sqrtr    r#   r"   r$   r%   )r<   enable_gradopsaten_foreach_maximum_	overloadsr3   gradappendr5   lenr=   clone_multi_tensor_adan_single_tensor_adanmathsqrt)r   closurelosshas_scalar_maximumr6   params_with_gradgradsrB   rC   rD   rE   rF   rG   rH   rI   rJ   bias_correction3r>   r5   use_foreachfuncr   r   r   r7   u   s   





z	Adan.step)r   r   r   r   FFNr   )r   r   r   __doc__floatr   boolr   r   r2   r<   no_gradr?   r7   __classcell__r   r   r0   r   r   .   s:     
r   r.   r]   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   r    r#   r"   r$   r%   c                C   s  t | D ]\}}|| }|| }|| }|| }|| }|| ||j|d| d ||j|d| d ||| ||j||d| d | | |}|| |
 }||	 }|r||| dk|j}|| j	dd || }|r|d||   |j
||| d |j
||| d n|j
||| d |j
||| d |d||   | j|dd qd S )Nr(   alphavaluer   r   )min      )	enumerateadd_mul_addcmul_rX   todtypediv_meanclamp_addcdiv_zero_)r.   r]   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   r    r#   r"   r$   r%   iparamrQ   r8   r9   r:   neg_grad_or_diffdenomstep_size_diff	step_sizemaskr   r   r   rV      s6   
rV   c                C   s  t | dkrd S t|| t|| tj||d| d t|| tj||d| d t|| t|| t|| tj|||d| d t|}t|| t|| || |
 }||	 }|rt||}dd t||D }dd |D }t	|d t|| t||}|rt| d||   tj
| ||| d tj
| ||| d ntj
| ||| d tj
| ||| d t| d||   t| tj||d	d d S )
Nr   r(   rf   rh   c                 S   s    g | ]\}}|d k |jqS )r   )rp   rq   ).0mgr   r   r   
<listcomp>7  s     z&_multi_tensor_adan.<locals>.<listcomp>c                 S   s   g | ]}|  qS r   )rs   )r~   r   r   r   r   r   8  s    r   rk   )rS   r<   _foreach_add__foreach_mul__foreach_addcmul__foreach_sqrt_foreach_div__foreach_mulziprO   _foreach_addcdiv__foreach_zero_)r.   r]   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   r    r#   r"   r$   r%   rz   r{   r|   masks
mask_scaler   r   r   rU     s@   

rU   )ra   rW   typingr   r   r   r<   r   torch.optim.optimizerr   objectr   r   rb   rc   rV   rU   r   r   r   r   <module>   s     !	

:	
