import math

import torch
from torch.optim.optimizer import Optimizer

from .types import Betas2, OptFloat, OptLossClosure, Params

__all__ = ('AdamP',)


class AdamP(Optimizer):
    r"""Implements AdamP algorithm.

    It has been proposed in `Slowing Down the Weight Norm Increase in
    Momentum-based Optimizers`__

    Arguments:
        params: iterable of parameters to optimize or dicts defining
            parameter groups
        lr: learning rate (default: 1e-3)
        betas: coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps: term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay: weight decay (L2 penalty) (default: 0)
        delta: threshold that determines whether a set of parameters is scale
            invariant or not (default: 0.1)
        wd_ratio: relative weight decay applied on scale-invariant parameters
            compared to that applied on scale-variant parameters (default: 0.1)
        nesterov: enables Nesterov momentum (default: False)


    Example:
        >>> import torch_optimizer as optim
        >>> optimizer = optim.AdamP(model.parameters(), lr=0.1)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

     __ https://arxiv.org/abs/2006.08217

    Note:
        Reference code: https://github.com/clovaai/AdamP
    """

    def __init__(
        self,
        params: Params,
        lr: float = 1e-3,
        betas: Betas2 = (0.9, 0.999),
        eps: float = 1e-8,
        weight_decay: float = 0,
        delta: float = 0.1,
        wd_ratio: float = 0.1,
        nesterov: bool = False,
    ) -> None:
        if lr <= 0.0:
            raise ValueError('Invalid learning rate: {}'.format(lr))
        if eps < 0.0:
            raise ValueError('Invalid epsilon value: {}'.format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(
                'Invalid beta parameter at index 0: {}'.format(betas[0])
            )
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(
                'Invalid beta parameter at index 1: {}'.format(betas[1])
            )
        if weight_decay < 0:
            raise ValueError(
                'Invalid weight_decay value: {}'.format(weight_decay)
            )
        if delta < 0:
            raise ValueError('Invalid delta value: {}'.format(delta))
        if wd_ratio < 0:
            raise ValueError('Invalid wd_ratio value: {}'.format(wd_ratio))

        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            delta=delta,
            wd_ratio=wd_ratio,
            nesterov=nesterov,
        )
        super(AdamP, self).__init__(params, defaults)

    @staticmethod
    def _channel_view(x):
        # One row per output channel.
        return x.view(x.size(0), -1)

    @staticmethod
    def _layer_view(x):
        # The whole tensor as a single row.
        return x.view(1, -1)

    @staticmethod
    def _cosine_similarity(x, y, eps, view_func):
        x = view_func(x)
        y = view_func(y)

        x_norm = x.norm(dim=1).add_(eps)
        y_norm = y.norm(dim=1).add_(eps)
        dot = (x * y).sum(dim=1)

        return dot.abs() / x_norm / y_norm
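
    # _projection carries the core AdamP idea: when a parameter behaves as
    # scale-invariant (its gradient is nearly perpendicular to the weight,
    # i.e. the channel-wise or layer-wise cosine similarity falls below
    # delta / sqrt(dim)), the radial component of the update is removed and
    # weight decay is scaled down by wd_ratio, slowing the weight-norm
    # growth that the paper linked above addresses.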
    def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
        wd = 1
        expand_size = [-1] + [1] * (len(p.shape) - 1)
        for view_func in [self._channel_view, self._layer_view]:
            cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func)

            if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)):
                p_n = p.data / view_func(p.data).norm(dim=1).view(
                    expand_size
                ).add_(eps)
                perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(
                    expand_size
                )
                wd = wd_ratio

                return perturb, wd

        return perturb, wd
    def step(self, closure: OptLossClosure = None) -> OptFloat:
        r"""Performs a single optimization step.

        Arguments:
            closure: A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                beta1, beta2 = group['betas']
                nesterov = group['nesterov']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p.data)
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = (
                    exp_avg_sq.sqrt() / math.sqrt(bias_correction2)
                ).add_(group['eps'])
                step_size = group['lr'] / bias_correction1

                if nesterov:
                    perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom
                else:
                    perturb = exp_avg / denom

                # Projection
                wd_ratio = 1
                if len(p.shape) > 1:
                    perturb, wd_ratio = self._projection(
                        p,
                        grad,
                        perturb,
                        group['delta'],
                        group['wd_ratio'],
                        group['eps'],
                    )

                # Weight decay
                if group['weight_decay'] > 0:
                    p.data.mul_(
                        1 - group['lr'] * group['weight_decay'] * wd_ratio
                    )

                p.data.add_(perturb, alpha=-step_size)

        return loss
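

# An illustrative usage sketch, added in this edit and not part of the
# upstream torch_optimizer module: it fits a toy linear model and exercises
# the optional closure form of step(). The relative import above means this
# file belongs to the installed package, so run it as
# `python -m torch_optimizer.adamp` rather than as a standalone script.
if __name__ == '__main__':
    torch.manual_seed(0)
    model = torch.nn.Linear(4, 1)
    optimizer = AdamP(model.parameters(), lr=0.1, weight_decay=1e-2)
    x = torch.randn(32, 4)
    target = torch.randn(32, 1)

    def closure():
        # Re-evaluate the loss so that step() can return it.
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), target)
        loss.backward()
        return loss

    for _ in range(20):
        loss = optimizer.step(closure)
    print('final loss: {:.4f}'.format(loss.item()))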