o
    پi,                     @   s  d Z ddlmZmZ ddlZddlmZ ddlmZ zddlmZm	Z	 dZ
W n ey1   dZ
Y nw d	d
lmZ ddgZG dd deZ		d!dddee dee deee  dedee dedededededededee fddZdee dee deee  dededededededededee fddZdee dee deee  dededededededededee fdd ZdS )"a   SGD with decoupled weight-decay.

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Hacked together by Ross Wightman
    )ListOptionalN)Tensor)	Optimizer)_use_grad_for_differentiable_default_to_fused_or_foreachTF   )ParamsTSGDWsgdwc                       s   e Zd Z					ddddddddededed	ed
edededededee def fddZ fddZdd Z	e
 dddZ  ZS )r
   MbP?        FN)cautioncorrected_weight_decaymaximizeforeachdifferentiableparamslrmomentum	dampeningweight_decaynesterovr   r   r   r   r   c                   s   |dk rt d| |dk rt d| |dk r!t d| t||||||||	|
|d
}|r=|dks9|dkr=t dt || d S )Nr   zInvalid learning rate: zInvalid momentum value: zInvalid weight_decay value: )
r   r   r   r   r   r   r   r   r   r   r   z8Nesterov momentum requires a momentum and zero dampening)
ValueErrordictsuper__init__)selfr   r   r   r   r   r   r   r   r   r   r   defaults	__class__ C/home/ubuntu/.local/lib/python3.10/site-packages/timm/optim/sgdw.pyr      s*   zSGDW.__init__c                    sd   t  | | jD ]&}|dd |dd |dd |dd |dd  |dd q	d S )Nr   Fr   r   r   r   r   )r   __setstate__param_groups
setdefault)r   stategroupr   r!   r"   r#   @   s   
zSGDW.__setstate__c                 C   sn   d}|d D ].}|j d ur4|| ||j  |j jrd}| j| }d|vr-|d  q||d  q|S )NFr   Tmomentum_buffer)gradappend	is_sparser&   )r   r'   params_with_gradgradsmomentum_buffer_listhas_sparse_gradpr&   r!   r!   r"   _init_groupJ   s   


zSGDW._init_groupc                 C   s   d}|durt   | }W d   n1 sw   Y  | jD ]M}g }g }g }| ||||}t||||d |d |d |d |d |d |d ||d	 |d
 rV| jd ndd t||D ]\}}	| j| }
|	|
d< q_q |S )zPerforms a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r/   r   max_lrr(   )torchenable_gradr$   r1   r   r   zipr&   )r   closurelossr'   r,   r-   r.   r/   r0   r(   r&   r!   r!   r"   step^   s<   



z	SGDW.step)r   r   r   r   FN)__name__
__module____qualname__r	   floatboolr   r   r#   r1   r3   no_gradr8   __classcell__r!   r!   r   r"   r
      sN    		
&
)r2   r   r-   r.   r/   r   r   r   r   r   r   r   r   r2   c                C   s   t r(ttdr(|du rtj st| ddd\}}nd}|r'tj r'tdnd}|r4tj s4t}nt	}|| |||||||	|
|||d dS )zlFunctional API that performs SGD algorithm computation.

    See :class:`~torch.optim.SGD` for details.
    "_group_tensors_by_device_and_dtypeNF)r   	use_fusedz6torch.jit.script not supported with foreach optimizers)	r   r   r   r   r   r   r/   r   r2   )
has_recent_pthasattrr   r3   jitis_scriptingr   RuntimeError_multi_tensor_sgdw_single_tensor_sgdw)r   r-   r.   r/   r   r   r   r   r   r   r   r   r2   _funcr!   r!   r"   r      s4   

c       	         C   s  t | D ]\}}|	s|| n||  }|d u r|n|d | }|d||   |dkr}|| }|d u r@t| }|||< n||j|d| d |rq|rW|j||d}|| dk|j}|	|
 jdd || }n|r{|j||d}n|}|j|| d qd S )N         ?r   r   alphar   )min)	enumeratemul_r3   clonedetachadd_addtodtypediv_meanclamp_)r   r-   r.   r   r   r   r   r   r   r   r/   r2   iparamr)   wd_scalebufmaskr!   r!   r"   rI      s*   

rI   c       	         C   s@  t | dkrd S tj| ||gdd}| D ]\\}}}}|
o(tdd |D }|	r0t|}|d u r6|n|d | }t| d||   |dkrg }d}tt |D ]}|| d u r`d} n|	||  qT|r{t|| tj
||d	| d
 n<g }tt |D ]3}|| d u rt||   } ||< ||| < n|| }||j|| d	| d
 |	| q|r|rtj|||d
}t||}dd t||D }dd |D }t|d t|| t||}n|rtj
|||d
 n|}|stj
||| d
 qtt |D ]}|| j|| | d
 qqd S )Nr   T)with_indicesc                 s   s    | ]}|j V  qd S r9   )r+   ).0r)   r!   r!   r"   	<genexpr>  s    z%_multi_tensor_sgdw.<locals>.<genexpr>rL   rM   Fr   rN   c                 S   s    g | ]\}}|d k |jqS )r   )rW   rX   )rb   mgr!   r!   r"   
<listcomp>.  s     z&_multi_tensor_sgdw.<locals>.<listcomp>c                 S   s   g | ]}|  qS r!   )rZ   )rb   rd   r!   r!   r"   rf   /  s    r   )lenr   rA   valuesanyr3   _foreach_neg_foreach_mul_ranger*   _foreach_add_rS   rT   rR   rU   _foreach_add_foreach_mulr5   _foreach_maximum__foreach_div_)r   r-   r.   r   r   r   r   r   r   r   r/   r2   grouped_tensorsdevice_paramsdevice_gradsdevice_momentum_buffer_listindicesdevice_has_sparse_gradr^   bufsall_states_with_momentum_bufferr\   r_   masks
mask_scaler!   r!   r"   rH      s`   

rH   )NN)__doc__typingr   r   r3   r   torch.optim.optimizerr   r   r   rC   ImportError_typesr	   __all__r
   r>   r=   r   rI   rH   r!   r!   r!   r"   <module>   s    w
	

9
	

.
	
