"""
AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py

Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217
Code: https://github.com/clovaai/AdamP

Copyright (c) 2020-present NAVER Corp.
MIT license
    N)	Optimizerreturnc                 C   s   |  | ddS )Nr   )reshapesizex r	   D/home/ubuntu/.local/lib/python3.10/site-packages/timm/optim/adamp.py_channel_view   s   r   c                 C   s   |  ddS )N   r   )r   r   r	   r	   r
   _layer_view   s   r   deltawd_ratioepsc                 C   s   d}ddt | jd   }ttfD ]K}|| }	||}
tj|
|	d|d }| |t	|	
d k r\| |	jddd|| }||||| jdd| 8 }|}||f  S q||fS )	N      ?)r   )r   r   )dimr      )pr   )r   )lenshaper   r   Fcosine_similarityabs_maxmathsqrtr   normadd_r   sum)r   gradperturbr   r   r   wdexpand_size	view_func
param_view	grad_view
cosine_simp_nr	   r	   r
   
projection   s   "r)   c                       s>   e Zd Z							d fdd	Ze dd
dZ  ZS )AdamPMbP?g?g+?:0yE>r   皙?Fc	           
   	      s,   t |||||||d}	tt| ||	 d S )N)lrbetasr   weight_decayr   r   nesterov)dictsuperr*   __init__)
selfparamsr/   r0   r   r1   r   r   r2   defaults	__class__r	   r
   r5   ,   s   	zAdamP.__init__Nc              
   C   s  d }|d urt   | }W d    n1 sw   Y  | jD ]}|d D ]}|jd u r.q&|j}|d \}}|d }| j| }	t|	dkrXd|	d< t ||	d< t ||	d< |	d |	d }
}|	d  d7  < d||	d   }d||	d   }|
|j|d| d	 ||j	||d| d
 |
 t
| |d }|d | }|r||
 d| |  | }n|
| }d}t|jdkrt||||d |d |d \}}|d dkr|d|d |d  |   |j|| d	 q&q |S )Nr7   r0   r2   r   stepexp_avg
exp_avg_sqr   )alpha)valuer   r/   r   r   r   r1   )torchenable_gradparam_groupsr    stater   
zeros_likemul_r   addcmul_r   r   r   r)   )r6   closurelossgroupr   r    beta1beta2r2   rC   r<   r=   bias_correction1bias_correction2denom	step_sizer!   r   r	   r	   r
   r;   B   sH   



".z
AdamP.step)r+   r,   r-   r   r.   r.   F)N)__name__
__module____qualname__r5   r@   no_gradr;   __classcell__r	   r	   r9   r
   r*   +   s    r*   )__doc__r@   torch.nn.functionalnn
functionalr   torch.optim.optimizerr   r   Tensorr   r   floatr)   r*   r	   r	   r	   r
   <module>   s    

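
# Minimal usage sketch (not part of the original module): AdamP is intended as a
# drop-in replacement for torch.optim.Adam. The toy model, shapes, and
# hyperparameters below are illustrative assumptions only.
if __name__ == '__main__':
    import torch.nn as nn

    model = nn.Linear(10, 2)
    optimizer = AdamP(model.parameters(), lr=1e-3, weight_decay=1e-2, nesterov=True)
    x, y = torch.randn(8, 10), torch.randn(8, 2)
    loss = F.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()        # projection is applied to the 2x10 weight, not the bias
    optimizer.zero_grad()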