""" AdamW Optimizer
Impl copied from PyTorch master

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

NOTE: This impl has been deprecated in favour of torch.optim.AdamW and remains as a reference
    N)ListOptionalTuple)Tensor)	Optimizer   )ParamsTc                       s   e Zd ZdZ										dded	ed
eeef dededededededee def fddZ	 fddZ
e dddZ  ZS )AdamWLegacya  Implements AdamW algorithm.

    NOTE: This impl has been deprecated in favour of torch.optim.AdamW and remains as a reference

    References:
        - Adam: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980
        - Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
        - On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ

    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate
        betas: coefficients used for computing running averages of gradient and its square
        eps: term added to the denominator to improve numerical stability
        weight_decay: weight decay coefficient
        amsgrad: whether to use the AMSGrad variant of this algorithm
            from the paper `On the Convergence of Adam and Beyond`
        caution: apply caution when using AdamW
        corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr)
        maximize: maximize the params based on the objective, instead of minimizing
        foreach: whether foreach implementation of optimizer is used.
            If unspecified by the user (so foreach is None), we will try to use
            foreach over for-loop implementation on CUDA, since it is faster in general.
        capturable: whether this instance is safe to capture in a CUDA graph.
            Passing True can impair ungraphed performance, so if you don't intend to
            graph capture this instance, leave it False
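
    Example:
        A minimal usage sketch (the linear model below is an illustrative
        stand-in, not part of this module)::

            model = torch.nn.Linear(10, 2)
            optimizer = AdamWLegacy(model.parameters(), lr=1e-3, weight_decay=1e-2)
            loss = model(torch.randn(4, 10)).sum()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()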
    """

    def __init__(
            self,
            params: ParamsT,
            lr: float = 1e-3,
            betas: Tuple[float, float] = (0.9, 0.999),
            eps: float = 1e-8,
            weight_decay: float = 1e-2,
            amsgrad: bool = False,
            caution: bool = False,
            corrected_weight_decay: bool = False,
            maximize: bool = False,
            foreach: Optional[bool] = None,
            capturable: bool = False,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
            caution=caution,
            corrected_weight_decay=corrected_weight_decay,
            maximize=maximize,
            foreach=foreach,
            capturable=capturable,
        )
        super(AdamWLegacy, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamWLegacy, self).__setstate__(state)
        state_values = list(self.state.values())
        step_is_tensor = len(state_values) != 0 and torch.is_tensor(state_values[0]['step'])
        if not step_is_tensor:
            for s in state_values:
                s['step'] = torch.tensor(float(s['step']))
        for group in self.param_groups:
            group.setdefault('amsgrad', False)
            group.setdefault('caution', False)
            group.setdefault('corrected_weight_decay', False)
            group.setdefault('maximize', False)
            group.setdefault('foreach', None)
            group.setdefault('capturable', False)
zAdamWLegacy.__setstate__c                 C   s  |    d}|dur!t  | }W d   n1 sw   Y  | jD ]}g }g }g }g }g }g }	|d \}
}|d }|d D ]m}|jdu rHq@|| |jjrUtd||j | j| }t	|dkrt
d|d< tj|tjd	|d
< tj|tjd	|d< |rtj|tjd	|d< ||d
  ||d  |r||dd |	|d  q@t||||||	|d ||
||d |d |d |d |d |d |d r| jd ndd q$|S )zPerforms a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
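
        Example:
            A sketch of typical closure usage; ``model``, ``criterion``,
            ``inputs``, and ``targets`` are placeholders rather than part
            of this API::

                def closure():
                    optimizer.zero_grad()
                    loss = criterion(model(inputs), targets)
                    loss.backward()
                    return loss

                loss = optimizer.step(closure)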
        """
        self._cuda_graph_capture_health_check()

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_avg_sqs = []
            max_exp_avg_sqs = []
            state_steps = []
            beta1, beta2 = group['betas']
            amsgrad = group['amsgrad']

            for p in group['params']:
                if p.grad is None:
                    continue
                params_with_grad.append(p)
                if p.grad.is_sparse:
                    raise RuntimeError('AdamW does not support sparse gradients')
                grads.append(p.grad)

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = torch.tensor(0.)
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)

                exp_avgs.append(state['exp_avg'])
                exp_avg_sqs.append(state['exp_avg_sq'])
                if amsgrad:
                    max_exp_avg_sqs.append(state.get('max_exp_avg_sq'))
                state_steps.append(state['step'])

            adamw(
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                max_exp_avg_sqs,
                state_steps,
                amsgrad=amsgrad,
                beta1=beta1,
                beta2=beta2,
                lr=group['lr'],
                weight_decay=group['weight_decay'],
                eps=group['eps'],
                maximize=group['maximize'],
                foreach=group['foreach'],
                capturable=group['capturable'],
                caution=group['caution'],
                max_lr=self.defaults['lr'] if group['corrected_weight_decay'] else None,
            )

        return loss


def adamw(
        params: List[Tensor],
        grads: List[Tensor],
        exp_avgs: List[Tensor],
        exp_avg_sqs: List[Tensor],
        max_exp_avg_sqs: List[Tensor],
        state_steps: List[Tensor],
        foreach: Optional[bool] = None,
        capturable: bool = False,
        *,
        amsgrad: bool,
        beta1: float,
        beta2: float,
        lr: float,
        weight_decay: float,
        eps: float,
        caution: bool,
        maximize: bool,
        max_lr: Optional[float] = None,
) -> None:
    """Functional API that performs AdamW algorithm computation.
      See AdamWLegacy class for details.
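
    A sketch of direct use of this functional form (the argument values are
    illustrative; the state tensors would normally come from an AdamWLegacy
    instance's own state)::

        adamw(params, grads, exp_avgs, exp_avg_sqs, [], state_steps,
              amsgrad=False, beta1=0.9, beta2=0.999, lr=1e-3,
              weight_decay=1e-2, eps=1e-8, caution=False, maximize=False)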
    """
    if not all(isinstance(t, torch.Tensor) for t in state_steps):
        raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")

    if foreach is None:
        try:
            # cannot do foreach if this overload doesn't exist when caution enabled
            foreach = not caution or 'Scalar' in torch.ops.aten._foreach_maximum_.overloads()
        except:
            foreach = False

    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adamw
    else:
        func = _single_tensor_adamw

    func(
        params,
        grads,
        exp_avgs,
        exp_avg_sqs,
        max_exp_avg_sqs,
        state_steps,
        amsgrad=amsgrad,
        beta1=beta1,
        beta2=beta2,
        lr=lr,
        weight_decay=weight_decay,
        eps=eps,
        caution=caution,
        maximize=maximize,
        capturable=capturable,
        max_lr=max_lr,
    )


def _single_tensor_adamw(
        params: List[Tensor],
        grads: List[Tensor],
        exp_avgs: List[Tensor],
        exp_avg_sqs: List[Tensor],
        max_exp_avg_sqs: List[Tensor],
        state_steps: List[Tensor],
        *,
        amsgrad: bool,
        beta1: float,
        beta2: float,
        lr: float,
        weight_decay: float,
        eps: float,
        caution: bool,
        maximize: bool,
        capturable: bool,
        max_lr: Optional[float],
) -> None:
    for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]
        exp_avg = exp_avgs[i]
        exp_avg_sq = exp_avg_sqs[i]
        step_t = state_steps[i]

        # update step
        step_t += 1

        # Perform stepweight decay, scaled by lr**2 / max_lr when corrected weight decay is enabled
        wd_scale = lr if max_lr is None else lr ** 2 / max_lr
        param.mul_(1. - wd_scale * weight_decay)

        # Decay the first and second moment running average coefficient
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
            denom_base = max_exp_avg_sqs[i]
        else:
            denom_base = exp_avg_sq

        if capturable:
            step = step_t
            bias_correction1 = 1 - torch.pow(beta1, step)
            bias_correction2 = 1 - torch.pow(beta2, step)
            step_size = lr / bias_correction1
            step_size_neg = step_size.neg()
            bias_correction2_sqrt = bias_correction2.sqrt()
            # Fold the (negated) step size into the denominator so a single addcdiv_ applies the update
            denom = (denom_base.sqrt() / (bias_correction2_sqrt * step_size_neg)).add_(eps / step_size_neg)

            if caution:
                # Apply caution as per 'Cautious Optimizers': https://arxiv.org/abs/2411.16085
                mask = (exp_avg * grad > 0).to(grad.dtype)
                mask.div_(mask.mean().clamp_(min=1e-3))
                exp_avg = exp_avg * mask

            param.addcdiv_(exp_avg, denom)
        else:
            step = step_t.item()
            bias_correction1 = 1 - beta1 ** step
            bias_correction2 = 1 - beta2 ** step
            step_size = lr / bias_correction1
            bias_correction2_sqrt = math.sqrt(bias_correction2)
            denom = (denom_base.sqrt() / bias_correction2_sqrt).add_(eps)

            if caution:
                # Apply caution as per 'Cautious Optimizers': https://arxiv.org/abs/2411.16085
                mask = (exp_avg * grad > 0).to(grad.dtype)
                mask.div_(mask.mean().clamp_(min=1e-3))
                exp_avg = exp_avg * mask

            param.addcdiv_(exp_avg, denom, value=-step_size)


def _multi_tensor_adamw(
        params: List[Tensor],
        grads: List[Tensor],
        exp_avgs: List[Tensor],
        exp_avg_sqs: List[Tensor],
        max_exp_avg_sqs: List[Tensor],
        state_steps: List[Tensor],
        *,
        amsgrad: bool,
        beta1: float,
        beta2: float,
        lr: float,
        weight_decay: float,
        eps: float,
        caution: bool,
        maximize: bool,
        capturable: bool,
        max_lr: Optional[float],
) -> None:
    if len(params) == 0:
        return

    if capturable:
        assert all(
            p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)
        ), "If capturable=True, params and state_steps must be CUDA tensors."

    if maximize:
        grads = torch._foreach_neg(tuple(grads))

    grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
    exp_avgs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avgs]
    exp_avg_sqs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avg_sqs]
    params = [torch.view_as_real(x) if torch.is_complex(x) else x for x in params]

    # update steps
    torch._foreach_add_(state_steps, 1)

    # Perform stepweight decay, scaled by lr**2 / max_lr when corrected weight decay is enabled
    wd_scale = lr if max_lr is None else lr ** 2 / max_lr
    torch._foreach_mul_(params, 1 - wd_scale * weight_decay)

    # Decay the first and second moment running average coefficient
    torch._foreach_mul_(exp_avgs, beta1)
    torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1)

    torch._foreach_mul_(exp_avg_sqs, beta2)
    torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)

    if capturable:
        bias_correction1 = [torch.pow(beta1, step) for step in state_steps]
        bias_correction2 = [torch.pow(beta2, step) for step in state_steps]
        # foreach_sub doesn't allow a scalar as the first arg
        torch._foreach_sub_(bias_correction1, 1)
        torch._foreach_sub_(bias_correction2, 1)
        torch._foreach_neg_(bias_correction1)
        torch._foreach_neg_(bias_correction2)

        # foreach_div doesn't allow a scalar as the first arg
        step_size = torch._foreach_div(bias_correction1, lr)
        torch._foreach_reciprocal_(step_size)
        torch._foreach_neg_(step_size)

        bias_correction2_sqrt = torch._foreach_sqrt(bias_correction2)

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)
            denom_base = torch._foreach_sqrt(max_exp_avg_sqs)
        else:
            denom_base = torch._foreach_sqrt(exp_avg_sqs)

        # Fold the (negated) step size into the denominator so a single addcdiv_ applies the update
        torch._foreach_div_(denom_base, torch._foreach_mul(bias_correction2_sqrt, step_size))
        eps_over_step_size = torch._foreach_div(step_size, eps)
        torch._foreach_reciprocal_(eps_over_step_size)
        denom = torch._foreach_add(denom_base, eps_over_step_size)

        if caution:
            # Apply caution as per 'Cautious Optimizers': https://arxiv.org/abs/2411.16085
            masks = torch._foreach_mul(exp_avgs, grads)
            masks = [(m > 0).to(g.dtype) for m, g in zip(masks, grads)]
            mask_scale = [m.mean() for m in masks]
            torch._foreach_maximum_(mask_scale, 1e-3)
            torch._foreach_div_(masks, mask_scale)
            exp_avgs = torch._foreach_mul(exp_avgs, masks)

        torch._foreach_addcdiv_(params, exp_avgs, denom)
    else:
        bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
        bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]

        step_size = [(lr / bc) * -1 for bc in bias_correction1]
        bias_correction2_sqrt = [math.sqrt(bc) for bc in bias_correction2]

        if amsgrad:
            # Maintains the maximum of all 2nd moment running avg. till now
            torch._foreach_maximum_(max_exp_avg_sqs, exp_avg_sqs)
            denom_base = torch._foreach_sqrt(max_exp_avg_sqs)
        else:
            denom_base = torch._foreach_sqrt(exp_avg_sqs)

        torch._foreach_div_(denom_base, bias_correction2_sqrt)
        denom = torch._foreach_add(denom_base, eps)

        if caution:
            # Apply caution as per 'Cautious Optimizers': https://arxiv.org/abs/2411.16085
            masks = torch._foreach_mul(exp_avgs, grads)
            masks = [(m > 0).to(g.dtype) for m, g in zip(masks, grads)]
            mask_scale = [m.mean() for m in masks]
            torch._foreach_maximum_(mask_scale, 1e-3)
            torch._foreach_div_(masks, mask_scale)
            exp_avgs = torch._foreach_mul(exp_avgs, masks)

        torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
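

# Worked example of the caution mask (made-up values): with exp_avg = [0.5, -0.2]
# and grad = [0.3, 0.4], the signs agree only in the first coordinate, so the
# mask is [1., 0.]; mask.mean() == 0.5, and after mask.div_(0.5) the surviving
# coordinate is upweighted, giving exp_avg * mask == [1.0, 0.0], so the overall
# update magnitude stays roughly unchanged.
#
# The capturable branches fold the negated step size into the denominator:
#     param -= step_size * exp_avg / (sqrt(v) / sqrt(bias_correction2) + eps)
# becomes
#     param += exp_avg / (sqrt(v) / (sqrt(bias_correction2) * -step_size) + eps / -step_size)
# which is algebraically identical but lets a single addcdiv_ apply the step
# without materializing an extra parameter-sized temporary.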