import torch
from torch.optim import Optimizer


class SM3(Optimizer):
    r"""Implements SM3 algorithm.

    It has been proposed in `Memory-Efficient Adaptive Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): coefficient that scales delta before it is
            applied to the parameters (default: 0.1)
        momentum (float, optional): coefficient used to scale prior updates
            before adding them to the current update. This drastically
            increases memory usage if `momentum > 0.0`. This is ignored if
            the parameter's gradient is sparse. (default: 0.0)
        beta (float, optional): coefficient used for exponential moving
            averages (default: 0.0)
        eps (float, optional): term added under the square root in the
            denominator to improve numerical stability (default: 1e-30)
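
    Example (illustrative sketch; ``model``, ``loss_fn``, ``input`` and
    ``target`` are assumed to be defined by the caller):

        >>> optimizer = SM3(model.parameters(), lr=0.1)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()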

    .. _Memory-Efficient Adaptive Optimization:
        https://arxiv.org/abs/1901.11150
        https://github.com/Enealor/PyTorch-SM3
    """

    def __init__(self, params, lr=0.1, momentum=0.0, beta=0.0, eps=1e-30):
        if not 0.0 <= lr:
            raise ValueError('Invalid learning rate: {0}'.format(lr))
        if not 0.0 <= momentum < 1.0:
            raise ValueError('Invalid momentum: {0}'.format(momentum))
        if not 0.0 <= beta < 1.0:
            raise ValueError('Invalid beta: {0}'.format(beta))
        if not 0.0 <= eps:
            raise ValueError('Invalid eps: {0}'.format(eps))

        defaults = dict(lr=lr, momentum=momentum, beta=beta, eps=eps)
        super(SM3, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            momentum = group['momentum']
            beta = group['beta']
            eps = group['eps']
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad
                state = self.state[p]
                shape = grad.shape
                rank = len(shape)

                if len(state) == 0:
                    state['step'] = 0
                    state['momentum_buffer'] = 0.0
                    _add_initial_accumulators(state, grad)

                if grad.is_sparse:
                    # The update is non-linear, so indices must be unique.
                    grad = grad.coalesce()
                    grad_indices = grad._indices()
                    grad_values = grad._values()

                    # Transforms update values into a sparse tensor with the
                    # same sparsity pattern as the gradient.
                    def make_sparse(values):
                        constructor = grad.new
                        if grad_indices.dim() == 0 or values.dim() == 0:
                            return constructor().resize_as_(grad)
                        return constructor(grad_indices, values, grad.size())

                    acc = state[_key(0)]
                    update_values = _compute_sparse_update(
                        beta, acc, grad_values, grad_indices
                    )

                    self._update_sparse_accumulator(
                        beta, acc, make_sparse(update_values)
                    )

                    # Add a small amount for numerical stability.
                    update_values.add_(eps).rsqrt_().mul_(grad_values)

                    update = make_sparse(update_values)
                else:
                    # Get previous accumulators mu_{t-1}.
                    if rank > 1:
                        acc_list = [state[_key(i)] for i in range(rank)]
                    else:
                        acc_list = [state[_key(0)]]

                    # Get the update from the accumulators and the gradient.
                    update = _compute_update(beta, acc_list, grad)

                    # Update the accumulators.
                    self._update_accumulator(beta, acc_list, update)

                    # Add a small amount for numerical stability.
                    update.add_(eps).rsqrt_().mul_(grad)

                    if momentum > 0.0:
                        m = state['momentum_buffer']
                        update.mul_(1.0 - momentum).add_(m, alpha=momentum)
                        state['momentum_buffer'] = update.detach()

                p.sub_(update, alpha=group['lr'])
                state['step'] += 1
        return loss

    def _update_accumulator(self, beta, acc_list, update):
        for i, acc in enumerate(acc_list):
            nu_max = _max_reduce_except_dim(update, i)
            if beta > 0.0:
                torch.max(acc, nu_max, out=acc)
            else:
                # No need to compare - nu_max is bigger because of grad ** 2.
                acc.copy_(nu_max)

    def _update_sparse_accumulator(self, beta, acc, update):
        nu_max = _max_reduce_except_dim(update.to_dense(), 0).squeeze()
        if beta > 0.0:
            torch.max(acc, nu_max, out=acc)
        else:
            acc.copy_(nu_max)


def _compute_sparse_update(beta, acc, grad_values, grad_indices):
    # In the sparse case, a single accumulator is used.
    update_values = torch.gather(acc, 0, grad_indices[0])
    if beta > 0.0:
        update_values.mul_(beta)
    update_values.addcmul_(grad_values, grad_values, value=1.0 - beta)
    return update_values


def _compute_update(beta, acc_list, grad):
    rank = len(acc_list)
    update = acc_list[0].clone()
    for i in range(1, rank):
        # We rely on broadcasting to get the proper end shape.
        update = torch.min(update, acc_list[i])
    if beta > 0.0:
        update.mul_(beta)
    update.addcmul_(grad, grad, value=1.0 - beta)
    return update


def _key(i):
    # Returns the key used for accessing accumulators.
    return 'accumulator_' + str(i)


def _add_initial_accumulators(state, grad):
    # Creates initial accumulators. For a dense tensor of shape (n1, n2, n3),
    # the accumulators have shape (n1, 1, 1), (1, n2, 1) and (1, 1, n3). For
    # a sparse tensor of shape (n, *), a single accumulator of shape (n,) is
    # used.
    shape = grad.shape
    rank = len(shape)
    defaults = {'device': grad.device, 'dtype': grad.dtype}
    acc = {}

    if grad.is_sparse:
        acc[_key(0)] = torch.zeros(shape[0], **defaults)
    elif rank == 0:
        # The scalar case is handled separately.
        acc[_key(0)] = torch.zeros(shape, **defaults)
    else:
        for i in range(rank):
            acc_shape = [1] * i + [shape[i]] + [1] * (rank - 1 - i)
            acc[_key(i)] = torch.zeros(acc_shape, **defaults)

    state.update(acc)


def _max_reduce_except_dim(tensor, dim):
    # Computes the max over all dimensions except the given dim.
    # If tensor is a scalar, it is returned unchanged.
    rank = len(tensor.shape)
    result = tensor
    if rank > 0:
        for d in range(rank):
            if d != dim:
                result = result.max(dim=d, keepdim=True).values
    return result
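

# Minimal usage sketch (illustrative, not part of the module above): runs a
# short SM3 loop on a toy least-squares problem to exercise the dense code
# path. The shapes, seed and hyperparameters are arbitrary choices.
if __name__ == '__main__':
    torch.manual_seed(0)
    x = torch.randn(128, 10)
    y = x @ torch.randn(10, 1)

    w = torch.zeros(10, 1, requires_grad=True)
    optimizer = SM3([w], lr=0.1, beta=0.9)

    for _ in range(200):
        optimizer.zero_grad()
        loss = ((x @ w - y) ** 2).mean()
        loss.backward()
        optimizer.step()

    # The loss should end up well below its starting value.
    print('final loss: {0:.6f}'.format(loss.item()))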