o
    پiZ)                     @   sP   d Z ddlZddlmZmZ ddlZddlmZ ddlm	Z	 G dd deZ
dS )	a   PyTorch Lamb optimizer w/ behaviour similar to NVIDIA FusedLamb

This optimizer code was adapted from the following (starting with latest)
* https://github.com/HabanaAI/Model-References/blob/2b435114fe8e31f159b1d3063b8280ae37af7423/PyTorch/nlp/bert/pretraining/lamb.py
* https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py
* https://github.com/cybertronai/pytorch-lamb

Use FusedLamb if you can (GPU). The reason for including this variant of Lamb is to have a version that is
similar in behaviour to APEX FusedLamb if you aren't using NVIDIA GPUs or cannot install/use APEX.

In addition to some cleanup, this Lamb impl has been modified to support PyTorch XLA and has been tested on TPU.

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Original copyrights for above sources are below.

Modifications Copyright 2021 Ross Wightman
    N)OptionalTuple)	Optimizer   )ParamsTc                       s   e Zd ZdZ												dd	ed
ededeeef dedededee dededededef fddZ	 fddZ
dd Ze d ddZ  ZS )!Lamba  Implements a pure pytorch variant of FuseLAMB (NvLamb variant) optimizer from apex.optimizers.FusedLAMB
    reference: https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py

    LAMB was proposed in:
    - Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:  https://arxiv.org/abs/1904.00962
    - On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ

    Args:
        params: Iterable of parameters to optimize or dicts defining parameter groups.
        lr: Learning rate
        betas: Coefficients used for computing running averages of gradient and its norm.
        eps: Term added to the denominator to improve numerical stability.
        weight_decay: Weight decay
        grad_averaging: Whether apply (1-beta2) to grad when calculating running averages of gradient.
        max_grad_norm: Value used to clip global grad norm.
        trust_clip: Enable LAMBC trust ratio clipping.
        always_adapt: Apply adaptive learning rate to 0.0 weight decay parameter.
        caution: Apply caution.
        decoupled: apply decoupled weight decay
        corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr) when using decoupled_decay
    MbP?Tg?g+?ư>{Gz?      ?Fparamslrbias_correctionbetasepsweight_decaygrad_averagingmax_grad_norm
trust_clipalways_adaptcautiondecoupled_decaycorrected_weight_decayc                    s2   t ||||||||	|
|||d}t || d S )N)r   r   r   r   r   r   r   r   r   r   r   r   )dictsuper__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   defaults	__class__ C/home/ubuntu/.local/lib/python3.10/site-packages/timm/optim/lamb.pyr   Z   s   zLamb.__init__c                    s@   t  | | jD ]}|dd |dd |dd q	d S )Nr   Fr   r   )r   __setstate__param_groups
setdefault)r   stategroupr   r!   r"   r#   z   s   
zLamb.__setstate__c                 C   s   | j d }|d u rd S g }| jD ]"}|d D ]}|jd u rq|j}|jr(td|tj| qqtjt	|}|| j
dd}|S )Nr   r   zDLamb does not support sparse gradients, consider SparseAdam instead.r   min)r   r$   grad	is_sparseRuntimeErrorappendtorchlinalgvector_normstackclamp_)r   r   normsr'   pr*   global_normclip_global_normr!   r!   r"   _get_clip_grad_norm   s    


zLamb._get_clip_grad_normNc              
   C   s  d}|durt   | }W d   n1 sw   Y  |  }| jD ]G}|d r-dnd}|d \}}|d r;dnd}|rCd| nd}	d|v rR|d  d7  < nd|d< |rid||d   }
d||d   }nd	\}
}|d
 D ]}|jdu ryqq|j}|dur|| | j| }t|dkrt ||d< t ||d< |d |d }}|	|j
||	d |	|j||d| d | t| 
|d }||
 |}|d r|| dk|j}|| jdd |	| |d }|dkr(|ddr!|d r|d d | jd  }n|d }|j
|| | d n|j
||d |dks2|d ra|d}|d}|| }t |dkt |dk|dd}|d r\t j|dd}|	| |j
||d  d qqq$|S )zPerforms a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r   r   r   step)r   r   r   exp_avg
exp_avg_sq)alpha)valuer   r   r   r(   r   r   Fr   r      r   g       @r   )max)r.   enable_gradr7   r$   r*   div_r&   len
zeros_likemul_add_addcmul_sqrtmathtodtypemeanr2   getr   normwhereclamp)r   closurelossclip_grad_normr'   r   beta1beta2r   beta3bias_correction1bias_correction2r4   r*   r&   r9   r:   denomupdatemaskr   wd_scalew_normg_normtrust_ratior!   r!   r"   r8      sx   










?z	Lamb.step)r   Tr	   r
   r   Tr   FFFFF)N)__name__
__module____qualname____doc__r   floatboolr   r   r   r#   r7   r.   no_gradr8   __classcell__r!   r!   r   r"   r   C   sZ    
	
 r   )ra   rG   typingr   r   r.   torch.optimr   _typesr   r   r!   r!   r!   r"   <module>   s    9