o
    پi,;                  !   @   st  d Z ddlZddlmZmZmZ ddlZddlmZ ddlm	Z	 G dd dej
jZ			d d
ee dee dee dee dee dee dedededededededededdf ddZd
ee dee dee dee dee dededededededededee fddZd
ee dee dee dee dee dededededededededee fddZdS )!ad   NAdamW Optimizer

Based on simplified algorithm in https://github.com/mlcommons/algorithmic-efficiency/tree/main/baselines/nadamw

Added multi-tensor (foreach) path.

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
    N)ListOptionalTuple)Tensor   )ParamsTc                       s   e Zd ZdZ									dded	ed
eeef dedededededee def fddZ	 fddZ
e dddZ  ZS )NAdamWa   Implements NAdamW algorithm.

    See Table 1 in https://arxiv.org/abs/1910.05446 for the implementation of
    the NAdam algorithm (there is also a comment in the code which highlights
    the only difference of NAdamW and AdamW).

    For further details regarding the algorithm we refer to
        - Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
        - On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ

    Args:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate
        betas: coefficients used for computing running averages of gradient and its square
        eps: term added to the denominator to improve numerical stability
        weight_decay: weight decay coefficient
        caution: enable caution
        corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr)
    MbP?g?g+?:0yE>{Gz?FNparamslrbetasepsweight_decaycautioncorrected_weight_decaymaximizeforeach
capturablec                    s   d|kst d| d|kst d| d|d   kr"dk s,n t d|d  d|d   kr8dk sBn t d|d  d|ksMt d	| t|||||||	||
d
	}t || d S )N        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: r   z#Invalid beta parameter at index 1: zInvalid weight_decay value: )	r   r   r   r   r   r   r   r   r   )
ValueErrordictsuper__init__)selfr   r   r   r   r   r   r   r   r   r   defaults	__class__ E/home/ubuntu/.local/lib/python3.10/site-packages/timm/optim/nadamw.pyr   *   s,   zNAdamW.__init__c                    s   t  | t| j }t|dkot|d d }|s.|D ]}tt	|d |d< q | j
D ]}|dd |dd q1d S )Nr   stepr   Fr   )r   __setstate__liststatevalueslentorch	is_tensortensorfloatparam_groups
setdefault)r   r&   state_valuesstep_is_tensorsgroupr   r!   r"   r$   N   s   
zNAdamW.__setstate__c                 C   sv  |    d}|dur!t  | }W d   n1 sw   Y  | jD ]}g }g }g }g }g }|d \}	}
|d D ]V}|jdu rBq:|| |jjrOtd||j | j| }t	|dkr{t
d|d< tj|tjd|d	< tj|tjd|d
< ||d	  ||d
  ||d  q:t||||||	|
|d |d |d |d |d |d |d r| jd ndd q$|S )zPerforms a single optimization step.

            Args:
              closure (callable, optional): A closure that reevaluates the model
                  and returns the loss.
        Nr   r   z(NAdamW does not support sparse gradientsr   r   r#   )memory_formatexp_avg
exp_avg_sqr   r   r   r   r   r   r   	beta1beta2r   r   r   r   r   r   max_lr) _cuda_graph_capture_health_checkr)   enable_gradr-   gradappend	is_sparseRuntimeErrorr&   r(   r+   
zeros_likepreserve_formatnadamwr   )r   closurelossr2   params_with_gradgradsexp_avgsexp_avg_sqsstate_stepsr7   r8   pr&   r!   r!   r"   r#   Y   sZ   




zNAdamW.step)	r	   r
   r   r   FFFNFN)__name__
__module____qualname____doc__r   r,   r   boolr   r   r$   r)   no_gradr#   __classcell__r!   r!   r   r"   r      sF    
	
$r   Fr   rF   rG   rH   rI   r   r   r7   r8   r   r   r   r   r   r9   returnc                C   s   t dd |D std|du r&z| pdtjjj v }W n   d}Y |r0tj s0t	}nt
}|| |||||||	|
|||||d dS )zcFunctional API that performs NAdamW algorithm computation.
      See NAdamW class for details.
    c                 s   s    | ]	}t |tjV  qd S rK   )
isinstancer)   r   ).0tr!   r!   r"   	<genexpr>   s    znadamw.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNScalarFr6   )allr?   r)   opsaten_foreach_maximum_	overloadsjitis_scripting_multi_tensor_nadamw_single_tensor_nadamw)r   rF   rG   rH   rI   r   r   r7   r8   r   r   r   r   r   r9   funcr!   r!   r"   rB      s8   
rB   c       	         C   s  t | D ]\}}|s|| n||  }|| }|| }|| }|d7 }|d u r)|n|d | }|d||   ||j|d| d ||j||d| d |r|}dt|| }dt|| }|| }| }| }||j|d| d}| ||  |	| }|
r|| dk	|j
}|| jdd || ||| q| }d||  }d||  }|| }t|}||j|d| d}| | |	}|
r|| dk	|j
}|| jdd || |j||| d qd S )	Nr      r   alpha)valuer   r	   )min)	enumeratemul_add_addcmul_r)   pownegsqrtmultodtypediv_meanclamp_addcdiv_itemmath)r   rF   rG   rH   rI   r7   r8   r   r   r   r   r   r   r9   iparamr<   r4   r5   step_twd_scaler#   bias_correction1bias_correction2	step_sizestep_size_negbias_correction2_sqrtdenommaskr!   r!   r"   ra      sJ   


ra   c       	            s2  t | dkrd S |rtdd t| |D sJ d|r#tt|}dd |D }dd |D }dd |D }d	d | D } t|d
 |d u rKnd | }t| d
||   t|  tj||d
  d t| t|||d
  |r  fdd|D }fdd|D }t	|d
 t	|d
 t
| t
| t|}t| t
| t|}t| }tj||d
  d t|}t|t|| t||	}t| t||}|
rt||}dd t||D }dd |D }t|d t|| t|| t| || d S  fdd|D }fdd|D }fdd|D }dd |D }t| }tj||d
  d t|}t|| t||	}|
rt||}dd t||D }dd |D }t|d t|| t|| t| ||| d S )Nr   c                 s   s     | ]\}}|j o|j V  qd S rK   )is_cuda)rU   rJ   r#   r!   r!   r"   rW   ;  s    
z'_multi_tensor_nadamw.<locals>.<genexpr>z@If capturable=True, params and state_steps must be CUDA tensors.c                 S   $   g | ]}t |rt |n|qS r!   r)   
is_complexview_as_realrU   xr!   r!   r"   
<listcomp>B     $ z(_multi_tensor_nadamw.<locals>.<listcomp>c                 S   r   r!   r   r   r!   r!   r"   r   C  r   c                 S   r   r!   r   r   r!   r!   r"   r   D  r   c                 S   r   r!   r   r   r!   r!   r"   r   E  r   r   rc   rd   c                       g | ]}t  |qS r!   r)   rl   rU   r#   r7   r!   r"   r   W      c                    r   r!   r   r   r8   r!   r"   r   X  r   c                 S       g | ]\}}|d k |jqS r   rp   rq   rU   mgr!   r!   r"   r   w       c                 S      g | ]}|  qS r!   rs   rU   r   r!   r!   r"   r   x      r	   c                       g | ]
}d  |    qS r   rv   r   r   r!   r"   r         c                    r   r   r   r   r   r!   r"   r     r   c                    s   g | ]} | d  qS )r!   rU   bc)r   r!   r"   r     r   c                 S   s   g | ]}t |qS r!   )rw   rn   r   r!   r!   r"   r     s    c                 S   r   r   r   r   r!   r!   r"   r     r   c                 S   r   r!   r   r   r!   r!   r"   r     r   )r(   rY   zipr)   _foreach_negtuple_foreach_add__foreach_mul__foreach_addcmul__foreach_sub__foreach_neg__foreach_div_foreach_reciprocal__foreach_sqrt_foreach_mul_foreach_div__foreach_addr\   _foreach_addcdiv_)r   rF   rG   rH   rI   r7   r8   r   r   r   r   r   r   r9   r{   r|   r}   r~   r   exp_avg_sq_sqrteps_over_step_sizer   masks
mask_scaler!   )r7   r8   r   r"   r`   &  s   









r`   )NF)rO   rw   typingr   r   r   r)   r   _typesr   optim	Optimizerr   rP   r,   rB   ra   r`   r!   r!   r!   r"   <module>   s    
 	

9	

S	
