""" ADOPT PyTorch Optimizer

ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate: https://arxiv.org/abs/2411.02853

Modified for reduced dependencies on PyTorch internals from original at: https://github.com/iShohei220/adopt

@inproceedings{taniguchi2024adopt,
 author={Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka},
 booktitle = {Advances in Neural Information Processing Systems},
 title = {ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate},
 year = {2024}
}

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285
"""
from typing import cast, List, Optional, Tuple, Union

import torch
from torch import Tensor
from torch.optim.optimizer import Optimizer

from ._types import ParamsT

__all__ = ['Adopt', 'adopt']


def _view_as_real(params, *state_and_grads):
    # View complex params (and their matching grad/state tensors) as real so the
    # elementwise update math below applies uniformly.
    for i, p in enumerate(params):
        if torch.is_complex(p):
            params[i] = torch.view_as_real(p)
            for s in state_and_grads:
                s[i] = torch.view_as_real(s[i])


def _get_scalar_dtype(is_fused=False):
    if is_fused:
        return torch.float32
    return torch.float64 if torch.get_default_dtype() == torch.float64 else torch.float32


def _is_compiling():
    if hasattr(torch, 'compiler') and hasattr(torch.compiler, 'is_compiling'):
        return torch.compiler.is_compiling()
    return False


def _get_value(x):
    # item() is significantly faster than a cpu tensor in eager mode
    if not torch.jit.is_scripting() and _is_compiling():
        return x
    return x.item() if isinstance(x, torch.Tensor) else x


class Adopt(Optimizer):
    """ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate: https://arxiv.org/abs/2411.02853
    """

    def __init__(
            self,
            params: ParamsT,
            lr: Union[float, Tensor] = 1e-3,
            betas: Tuple[float, float] = (0.9, 0.9999),
            eps: float = 1e-6,
            clip_exp: Optional[float] = 0.333,
            weight_decay: float = 0.0,
            decoupled: bool = False,
            corrected_weight_decay: bool = False,
            *,
            caution: bool = False,
            foreach: Optional[bool] = None,
            maximize: bool = False,
            capturable: bool = False,
            differentiable: bool = False,
    ):
        if isinstance(lr, Tensor):
            if foreach and not capturable:
                raise ValueError('lr as a Tensor is not supported for capturable=False and foreach=True')
            if lr.numel() != 1:
                raise ValueError('Tensor lr must be 1-element')
        if not 0.0 <= lr:
            raise ValueError(f'Invalid learning rate: {lr}')
        if not 0.0 <= eps:
            raise ValueError(f'Invalid epsilon value: {eps}')
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f'Invalid beta parameter at index 0: {betas[0]}')
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f'Invalid beta parameter at index 1: {betas[1]}')
        if not 0.0 <= weight_decay:
            raise ValueError(f'Invalid weight_decay value: {weight_decay}')

        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            clip_exp=clip_exp,
            weight_decay=weight_decay,
            decoupled=decoupled,
            corrected_weight_decay=corrected_weight_decay,
            caution=caution,
            maximize=maximize,
            foreach=foreach,
            capturable=capturable,
            differentiable=differentiable,
        )
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('maximize', False)
            group.setdefault('foreach', None)
            group.setdefault('capturable', False)
            group.setdefault('differentiable', False)
            group.setdefault('clip_exp', None)
            group.setdefault('caution', False)
            group.setdefault('corrected_weight_decay', False)
            for p in group['params']:
                p_state = self.state.get(p, [])
                if len(p_state) != 0 and not torch.is_tensor(p_state['step']):
                    step_val = float(p_state['step'])
                    p_state['step'] = (
                        torch.tensor(step_val, dtype=_get_scalar_dtype(), device=p.device)
                        if group['capturable'] else torch.tensor(step_val, dtype=_get_scalar_dtype())
                    )

    def _init_group(
            self,
            group,
            params_with_grad,
            grads,
            exp_avgs,
            exp_avg_sqs,
            state_steps,
    ):
        has_complex = False
        for p in group['params']:
            if p.grad is None:
                continue
            has_complex |= torch.is_complex(p)
            params_with_grad.append(p)
            if p.grad.is_sparse:
                raise RuntimeError('ADOPT does not support sparse gradients')
            grads.append(p.grad)

            state = self.state[p]
            # Lazy state initialization. `step` is hosted on CPU unless capturable mode
            # requires it to live on the param's device.
            if len(state) == 0:
                state['step'] = (
                    torch.zeros((), dtype=_get_scalar_dtype(), device=p.grad.device)
                    if group['capturable'] else torch.tensor(0.0, dtype=_get_scalar_dtype())
                )
                state['exp_avg'] = torch.zeros_like(p.grad, memory_format=torch.preserve_format)
                state['exp_avg_sq'] = torch.zeros_like(p.grad, memory_format=torch.preserve_format)

            exp_avgs.append(state['exp_avg'])
            exp_avg_sqs.append(state['exp_avg_sq'])

            if group['differentiable'] and state['step'].requires_grad:
                raise RuntimeError('`requires_grad` is not supported for `step` in differentiable mode')

            # Foreach without capturable does not support a tensor lr
            if group['foreach'] and isinstance(group['lr'], Tensor) and not group['capturable']:
                raise ValueError('lr as a Tensor is not supported for capturable=False and foreach=True')

            state_steps.append(state['step'])
        return has_complex

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        self._cuda_graph_capture_health_check()

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_avg_sqs = []
            state_steps = []
            beta1, beta2 = group['betas']

            has_complex = self._init_group(
                group,
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                state_steps,
            )

            adopt(
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                state_steps,
                has_complex=has_complex,
                beta1=beta1,
                beta2=beta2,
                lr=group['lr'],
                weight_decay=group['weight_decay'],
                clip_exp=group['clip_exp'],
                max_lr=self.defaults['lr'] if group['corrected_weight_decay'] else None,
                decoupled=group['decoupled'],
                eps=group['eps'],
                caution=group['caution'],
                maximize=group['maximize'],
                foreach=group['foreach'],
                capturable=group['capturable'],
                differentiable=group['differentiable'],
                grad_scale=getattr(self, 'grad_scale', None),
                found_inf=getattr(self, 'found_inf', None),
            )

        return loss
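

# Summary of the update implemented by both kernels below (single-tensor and foreach). Unlike Adam,
# ADOPT decorrelates the second-moment estimate from the current gradient: the first step only
# initializes v_1 = g_1^2 (no parameter update), and every later step t normalizes by the
# *previous* second moment:
#     normed_grad = g_t / max(sqrt(v_{t-1}), eps)     # optionally clipped to +/-(t-1)**clip_exp
#     m_t = m_{t-1} + (1 - beta1) * (normed_grad - m_{t-1})
#     p_t = p_{t-1} - lr * m_t
#     v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2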
def _single_tensor_adopt(
        params: List[Tensor],
        grads: List[Tensor],
        exp_avgs: List[Tensor],
        exp_avg_sqs: List[Tensor],
        state_steps: List[Tensor],
        grad_scale: Optional[Tensor],
        found_inf: Optional[Tensor],
        *,
        has_complex: bool,
        beta1: float,
        beta2: float,
        lr: Union[float, Tensor],
        weight_decay: float,
        clip_exp: Optional[float],
        max_lr: Optional[float],
        decoupled: bool,
        eps: float,
        caution: bool,
        maximize: bool,
        capturable: bool,
        differentiable: bool,
):
    assert grad_scale is None and found_inf is None

    if torch.jit.is_scripting():
        # the ops below have overloads for both float and Tensor lrs, but JIT cannot
        # resolve them, so assert the common float case
        assert isinstance(lr, float)

    for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]
        exp_avg = exp_avgs[i]
        exp_avg_sq = exp_avg_sqs[i]
        step_t = state_steps[i]

        # If compiling, the compiler will handle cudagraph checks
        if capturable and not _is_compiling():
            from torch.optim.optimizer import _get_capturable_supported_devices
            capturable_supported_devices = _get_capturable_supported_devices()
            assert param.device.type == step_t.device.type and param.device.type in capturable_supported_devices, \
                f'If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}.'

        # update step
        step_t += 1

        if torch.is_complex(param):
            grad = torch.view_as_real(grad)
            if exp_avg is not None:
                exp_avg = torch.view_as_real(exp_avg)
            if exp_avg_sq is not None:
                exp_avg_sq = torch.view_as_real(exp_avg_sq)
            param = torch.view_as_real(param)

        if weight_decay != 0 and not decoupled:
            grad = grad.add(param, alpha=weight_decay)

        step = step_t if capturable or differentiable else _get_value(step_t)
        if step == 1:
            # first step only initializes the second moment, no parameter update
            exp_avg_sq.addcmul_(grad, grad.conj())
            continue

        if weight_decay != 0 and decoupled:
            # lr**2 / max_lr scaling per 'Why Gradients Rapidly Increase Near the End of Training'
            wd_scale = lr ** 2 / max_lr if max_lr is not None else lr
            param.add_(param, alpha=-wd_scale * weight_decay)

        denom = torch.clamp(exp_avg_sq.sqrt(), eps)
        normed_grad = grad.div(denom)

        if clip_exp is not None:
            clip_val = (step - 1) ** clip_exp
            normed_grad.clamp_(-clip_val, clip_val)

        exp_avg.lerp_(normed_grad, 1 - beta1)

        if caution:
            # Apply caution as per 'Cautious Optimizers' - https://arxiv.org/abs/2411.16085
            mask = (exp_avg * grad > 0).to(grad.dtype)
            mask.div_(mask.mean().clamp_(min=1e-3))
            exp_avg = exp_avg * mask

        param.add_(exp_avg, alpha=-lr)

        exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)
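

# The foreach path below mirrors _single_tensor_adopt step for step, but applies each operation
# with torch._foreach_* ops across every tensor of a device/dtype group at once, trading some
# intermediate memory for far fewer kernel launches.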
tr|std|r9t s9ddlm} |dd t fddt| |D s9J d  d	|d u rA|d u sCJ |rIJ d
t	
| ||||g}| D ]\\}}}}}}ttt |}ttt |}ttt |}ttt |}ttt |} |rt|||| |rt|}t s| d jrtj| tjddddd nt| d |dkr|s|rtj|||d ntj|||d}| d dkrt||| qW|dkr|r|d ur|
d | n|
}!tj|||! | d t|}"t|"| t||"}#|d ur| d d | }$t|#|$  t|#|$ t||#d|  |rTt||}%dd t|%|D }%dd |%D }&t|&d t|%|& t||%}tj|||
 d t||	 tj|||d|	 d qWd S )Nr   r:   ry   F)supports_xlac                 3   s0    | ]\}}|j j|j jko|j j v V  qd S r   )rH   r   ).0r   rE   r   r   r   	<genexpr>s  s
    
z&_multi_tensor_adopt.<locals>.<genexpr>r{   r|   z#_foreach ops don't support autogradr;   cpu)rH   r}   r	   r   c                 S   s    g | ]\}}|d k |jqS )r   )r   rG   )r   mgr   r   r   
<listcomp>  s     z'_multi_tensor_adopt.<locals>.<listcomp>c                 S   s   g | ]}|  qS r   )r   )r   r   r   r   r   r     s    r)   r   )rO   r%   r   r\   r"   r   rz   allzipr   "_group_tensors_by_device_and_dtypevaluesr   r   r   r   _foreach_negis_cpu_foreach_add_rR   _foreach_add_foreach_addcmul__foreach_sqrt_foreach_maximum__foreach_div_foreach_minimum__foreach_lerp__foreach_mul_foreach_div__foreach_mul_)'r   rb   rc   rd   re   rk   rl   rf   rh   ri   r3   r7   r6   rj   r8   r5   r.   r0   r1   r2   rz   grouped_tensorsdevice_params_device_grads_device_exp_avgs_device_exp_avg_sqs_device_state_steps__device_paramsdevice_gradsdevice_exp_avgsdevice_exp_avg_sqsdevice_state_stepsr   exp_avg_sq_sqrtr   r   masks
mask_scaler   r   r   _multi_tensor_adoptN  s   


	



r   Fr/   c       
         C   s   |du rd}t  stdd |D std|r!tj r!td|r+tj s+t}nt}|| ||||f|
||||||||||||||	d dS )z?Functional API that performs ADOPT algorithm computation.

    NFc                 s   s    | ]	}t |tjV  qd S r   )r%   r   r   )r   tr   r   r   r     s    zadopt.<locals>.<genexpr>zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsz6torch.jit.script not supported with foreach optimizers)rf   rh   ri   r3   r7   r6   rj   r8   r5   r.   r0   r1   r2   rk   rl   )r"   r   r\   r   r#   r$   r   r   )r   rb   rc   rd   re   r/   r1   r2   rk   rl   rf   rh   ri   r3   r7   r6   rj   r8   r5   r.   r0   funcr   r   r   r     sD   r   )NFFNNF)ru   typingr   r   r   r   r   r   r   r   r   _typesr
   __all__r   r   r"   r(   r   rv   rQ   r   r   r   r   r   r   r   <module>   s"   
 =	


W	


 	


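

# Minimal smoke-test sketch, not part of the original module. Run as `python -m timm.optim.adopt`
# so the relative import resolves. The parameter shape and hyper-parameter values here are
# illustrative assumptions only, not recommendations.
if __name__ == '__main__':
    w = torch.nn.Parameter(torch.randn(8, 8))
    opt = Adopt([w], lr=1e-3, betas=(0.9, 0.9999), caution=True)
    for _ in range(5):
        loss = (w ** 2).sum()  # toy quadratic objective
        loss.backward()
        opt.step()  # first step only initializes exp_avg_sq, later steps update w
        opt.zero_grad()
    print('final loss:', (w ** 2).sum().item())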