o
    پi"                     @   s4  d Z ddlmZmZmZ ddlZddlmZ ddlm	Z	 G dd deZ
			ddd
deej deej deej dedededededededee fddZdeej deej deej dededededededee fddZdeej deej deej dededededededee fddZdS )ah   Lion Optimizer
Paper: `Symbolic Discovery of Optimization Algorithms` - https://arxiv.org/abs/2302.06675
Original Impl: https://github.com/google/automl/tree/master/lion

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
    )ListOptionalTupleN)	Optimizer   )ParamsTc                       s|   e Zd ZdZ							ddeded	eeef d
ededededee f fddZ	 fddZ
e dddZ  ZS )LionzImplements Lion algorithm.-C6?g?gGz?        FNparamslrbetasweight_decaycautioncorrected_weight_decaymaximizeforeachc	           
   	      s   d|kst d|d|d   krdk s!n t d|d d|d   kr-dk s7n t d|d t|||||||d}	t ||	 d	S )
a  Initialize the hyperparameters.

        Args:
            params: iterable of parameters to optimize or dicts defining parameter groups
            lr: learning rate
            betas: coefficients used for computing running averages of gradient and its square
            weight_decay: weight decay coefficient
            caution: apply caution
            corrected_weight_decay: apply corrected weight decay (lr**2 / max_lr)
        r   zInvalid learning rate: {}r   g      ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {})r   r   r   r   r   r   r   N)
ValueErrorformatdictsuper__init__)
selfr   r   r   r   r   r   r   r   defaults	__class__ C/home/ubuntu/.local/lib/python3.10/site-packages/timm/optim/lion.pyr   "   s    	zLion.__init__c                    sL   t  | | jD ]}|dd |dd |dd |dd  q	d S )Nr   Fr   r   r   )r   __setstate__param_groups
setdefault)r   stategroupr   r   r   r   I   s   
zLion.__setstate__c                 C   s  d}|durt   | }W d   n1 sw   Y  | jD ]l}g }g }g }|d \}}|d D ]7}	|	jdu r:q2||	 |	jjrGtd||	j | j|	 }
t|
dkrbt j	|	t j
d|
d< ||
d  q2t||||||d |d	 |d
 |d |d |d r| jd ndd q |S )zPerforms a single optimization step.

        Args:
            closure: A closure that reevaluates the model and returns the loss.

        Returns:
            the loss.
        Nr   r   z&Lion does not support sparse gradientsr   )memory_formatexp_avgr   r   r   r   r   r   )beta1beta2r   r   r   r   r   max_lr)torchenable_gradr    gradappend	is_sparseRuntimeErrorr"   len
zeros_likepreserve_formatlionr   )r   closurelossr#   params_with_gradgradsexp_avgsr&   r'   pr"   r   r   r   stepQ   sF   





z	Lion.step)r	   r
   r   FFFN)N)__name__
__module____qualname____doc__r   floatr   boolr   r   r   r)   no_gradr9   __classcell__r   r   r   r   r      s:    
	'r   F)r(   r   r6   r7   r   r   r&   r'   r   r   r   r(   c                C   s   |du rz|	 pdt jjj v }W n   d}Y |r$t j r$td|r.t j s.t}nt	}|| |||||||	||
d
 dS )z=Functional API that performs Lion algorithm computation.
    NScalarFz6torch.jit.script not supported with foreach optimizers)r&   r'   r   r   r   r   r(   )
r)   opsaten_foreach_maximum_	overloadsjitis_scriptingr.   _multi_tensor_lion_single_tensor_lion)r   r6   r7   r   r   r&   r'   r   r   r   r(   funcr   r   r   r2      s,   
r2   c                C   s   t | D ]v\}
}|s||
 n||
  }||
 }t|r+t|}t|}t|}|	d u r1|n|d |	 }|d||   ||j|d| d }|rj|| dk|j	}|
| jdd || |j|| d ||d|  qd S )N   r   alphar   MbP?)min)	enumerater)   
is_complexview_as_realmul_muladd_sign_todtypediv_meanclamp_lerp_)r   r6   r7   r&   r'   r   r   r   r   r(   iparamr+   r%   wd_scaleupdatemaskr   r   r   rJ      s"   




rJ   c                C   s2  t | dkrd S |rtt|}dd |D }dd |D }dd | D } |	d u r,|n|d |	 }
t| d|
|   t||}tj||d| d d	d |D }|r~t||}d
d t||D }dd |D }t|d t	|| t|| tj| || d t|| tj||d| d d S )Nr   c                 S   $   g | ]}t |rt |n|qS r   r)   rR   rS   .0xr   r   r   
<listcomp>      $ z&_multi_tensor_lion.<locals>.<listcomp>c                 S   rc   r   rd   re   r   r   r   rh      ri   c                 S   rc   r   rd   re   r   r   r   rh      ri   rL   r   rM   c                 S      g | ]}|  qS r   )rW   )rf   ur   r   r   rh          c                 S   s    g | ]\}}|d k |jqS )r   )rX   rY   )rf   mgr   r   r   rh     s     c                 S   rj   r   )r[   )rf   rm   r   r   r   rh     rl   rO   )
r/   r)   _foreach_negtuple_foreach_mul__foreach_mul_foreach_add_ziprE   _foreach_div_)r   r6   r7   r&   r'   r   r   r   r   r(   r`   updatesmasks
mask_scaler   r   r   rI      s,   rI   )FN)r=   typingr   r   r   r)   torch.optim.optimizerr   _typesr   r   Tensorr?   r>   r2   rJ   rI   r   r   r   r   <module>   s    n	

/	

)	
