""" PyTorch LARS / LARC Optimizer

An implementation of LARS (SGD) + LARC in PyTorch

Based on:
  * PyTorch SGD: https://github.com/pytorch/pytorch/blob/1.7/torch/optim/sgd.py#L100
  * NVIDIA APEX LARC: https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py

Additional cleanup and modifications to properly support PyTorch XLA.

Copyright 2021 Ross Wightman
"""
import torch
from torch.optim.optimizer import Optimizer


class Lars(Optimizer):
    """ LARS for PyTorch
    
    Paper: `Large batch training of Convolutional Networks` - https://arxiv.org/pdf/1708.03888.pdf

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
        lr (float, optional): learning rate (default: 1.0).
        momentum (float, optional): momentum factor (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)
        trust_coeff (float): trust coefficient for computing adaptive lr / trust_ratio (default: 0.001)
        eps (float): eps for division denominator (default: 1e-8)
        trust_clip (bool): enable LARC trust ratio clipping (default: False)
        always_adapt (bool): always apply LARS LR adapt, otherwise only when group weight_decay != 0 (default: False)
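
    Example (illustrative sketch; assumes ``model``, ``loss_fn``, ``input`` and
    ``target`` come from a typical training loop, and the hyperparameter values
    shown are not prescriptive):
        >>> optimizer = Lars(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()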
    """

    def __init__(
            self,
            params,
            lr=1.0,
            momentum=0,
            dampening=0,
            weight_decay=0,
            nesterov=False,
            trust_coeff=0.001,
            eps=1e-8,
            trust_clip=False,
            always_adapt=False,
    ):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if momentum < 0.0:
            raise ValueError(f"Invalid momentum value: {momentum}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")

        defaults = dict(
            lr=lr,
            momentum=momentum,
            dampening=dampening,
            weight_decay=weight_decay,
            nesterov=nesterov,
            trust_coeff=trust_coeff,
            eps=eps,
            trust_clip=trust_clip,
            always_adapt=always_adapt,
        )
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('nesterov', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            trust_coeff = group['trust_coeff']
            eps = group['eps']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad

                # apply LARS LR adaptation, LARC clipping, weight decay
                # ref: https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py
                if weight_decay != 0 or group['always_adapt']:
                    w_norm = p.norm(2.0)
                    g_norm = grad.norm(2.0)
                    trust_ratio = trust_coeff * w_norm / (g_norm + w_norm * weight_decay + eps)
                    # nested where() required since logical and/or do not work in PT XLA;
                    # ratio falls back to 1.0 when either norm is zero
                    trust_ratio = torch.where(
                        w_norm > 0,
                        torch.where(g_norm > 0, trust_ratio, 1.0),
                        1.0,
                    )
                    if group['trust_clip']:
                        trust_ratio = torch.clamp(trust_ratio / group['lr'], max=1.0)
                    grad.add_(p, alpha=weight_decay)
                    grad.mul_(trust_ratio)

                # apply SGD update https://github.com/pytorch/pytorch/blob/1.7/torch/optim/sgd.py#L100
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(grad).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(grad, alpha=1. - dampening)
                    if nesterov:
                        grad = grad.add(buf, alpha=momentum)
                    else:
                        grad = buf

                p.add_(grad, alpha=-group['lr'])

        return loss
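

# ---------------------------------------------------------------------------
# Illustrative self-check, not part of the original timm module: runs one
# Lars step on a toy quadratic problem and confirms the parameter moves.
# The toy parameter shape, seed, and hyperparameter values are arbitrary
# assumptions chosen only for the demo.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    torch.manual_seed(0)
    w = torch.randn(4, 4, requires_grad=True)  # hypothetical toy parameter
    optimizer = Lars([w], lr=0.1, momentum=0.9, weight_decay=1e-4)

    before = w.detach().clone()
    (w ** 2).sum().backward()  # simple quadratic loss, grad = 2 * w
    optimizer.step()
    optimizer.zero_grad()

    # With weight_decay != 0, the update above was scaled by the LARS ratio:
    #   trust_ratio = trust_coeff * ||w|| / (||g|| + weight_decay * ||w|| + eps)
    assert not torch.equal(before, w.detach()), 'parameter should change after one step'
    print('Lars step OK; max |delta| =', (w.detach() - before).abs().max().item())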