o
    پih2                  '   @   s  d Z ddlmZmZmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	 Zd
eedf dededeeeef  fddZG dd de	Zdee dee deee  deee  deee  deee  dee dededededededee d eeejf d!ee d"ed#ed$ee f&d%d&Zdee dee deee  deee  deee  deee  dee dededededededee d eeejf d!ee d"ed#ed$ee f&d'd(ZdS ))a   Adafactor (Big Vision variant) for PyTorch

Adapted from the implementation in big vision: https://github.com/google-research/big_vision

Described in 'Scaling Vision Transformers': https://arxiv.org/abs/2106.04560

References for added functionality:
    Cautious Optimizers: https://arxiv.org/abs/2411.16085
    Why Gradients Rapidly Increase Near the End of Training: https://arxiv.org/abs/2506.02285

Adaptation and PyTorch modifications by Ross Wightman
    )ListOptionalTupleUnionN)Tensor)	Optimizer   )ParamsTc                   C   s   t jS )z6Get the scalar dtype that the optimizer uses for state)torchfloat64 r   r   K/home/ubuntu/.local/lib/python3.10/site-packages/timm/optim/adafactor_bv.py_get_scalar_dtype   s   r   shape.factoredmin_dim_size_to_factorreturnc                 C   sb   |rt | dk r
dS tdd t| D }| |d d  |k r!dS t|d d t|d d fS )a  Whether to use a factored second moment estimator.

    This function returns a tuple with the two largest axes to reduce over.
    If no two dimensions have size >= min_dim_size_to_factor, return None.

    Args:
      shape: an input shape
      factored: whether to use factored second-moment estimator for > 2d vars.
      min_dim_size_to_factor: only factor accumulator if two array dimensions have at least this size.

    Returns:
      None or a tuple of ints
       Nc                 s   s    | ]	\}}||fV  qd S Nr   ).0ixr   r   r   	<genexpr>/   s    z!_factored_dims.<locals>.<genexpr>r   )lensorted	enumerateint)r   r   r   sorted_dimsr   r   r   _factored_dims   s    r    c                !       s   e Zd ZdZddddddejdd	dd
d
d
fd
ddedededededede	e de
eejf de	e dede	e dededede	e f fddZ fddZe d!dd Z  ZS )"AdafactorBigVisionz
    PyTorch implementation of BigVision's Adafactor variant with both single and multi tensor implementations.

    Adapted from https://github.com/google-research/big_vision by Ross Wightman
          ?   g?r   g+?g?N        F)foreachparamslrr   
decay_ratedecay_offset	beta2_capmomentummomentum_dtypeepsweight_decayclipping_thresholdunscaled_wdcautioncorrected_weight_decayr%   c                   s|   t |tr#|dkrtj}n|dkrtj}n|dks J | dtj}t||||||||	|
|||||d}t || d S )Nfloat16bfloat16float32z dtype not supported)r'   r   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r%   )	
isinstancestrr
   r3   r4   r5   dictsuper__init__)selfr&   r'   r   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r%   defaults	__class__r   r   r:   <   s0   
zAdafactorBigVision.__init__c                    s   t  | | jD ]U}|dd |dd |dd  |d D ]<}| j|i }t|dkrEt|d sEtj	t
|d t d|d< d	|v r]t|d	 r]|d	 j| jd
 d|d	< q!q	d S )Nr1   Fr2   r%   r&   r   stepdtypeexp_avgr,   )r9   __setstate__param_groups
setdefaultstategetr   r
   	is_tensortensorfloatr   tor<   )r;   rF   grouppp_stater=   r   r   rC   k   s   
zAdafactorBigVision.__setstate__c                 C   s  d }|d urt   | }W d    n1 sw   Y  | jD ]1}g }g }g }g }g }g }	g }
|d D ]}|jd u r=q5|jjrEtd|| ||j | j| }t|dkrt j	dt
 d|d< |jj}t|d| jd d	}|d ur|\}}t|jj}d
||< t|jj}d
||< |j||d< |j||d< nt j|jt jd|d< | jd d urt j|j| jd d|d< |	|d  ||dd  ||dd  ||dd  |
|dd  q5|d rt}nt}|d$i d|d|d|d|d|d|
d|	d|d d|d d|d d|d d|d d|d d|d d|d d|d d |d  d!|d! d"|d# rN| jd nd  q |S )%Nr&   zSparse gradients not supportedr   r$   r@   r?   Tr   )r   r   r   exp_avg_sq_rexp_avg_sq_c)memory_format
exp_avg_sqr+   r,   rB   r%   gradsexp_avg_sq_rsexp_avg_sq_csexp_avg_sqsexp_avgsstate_stepsbeta2_decayr(   r*   r-   r'   r.   r/   r0   r1   max_lrr2   r   )r
   enable_gradrD   grad	is_sparseRuntimeErrorappendrF   r   rI   r   r   r    r<   list	new_zeros
zeros_likepreserve_formatrG   _multi_tensor_adafactor_single_tensor_adafactor)r;   closurelossrL   params_with_gradrS   rT   rU   rV   rX   rW   rM   rF   r   factored_dimsdcdr	row_shape	col_shapefuncr   r   r   r?   |   s   



	
zAdafactorBigVision.stepr   )__name__
__module____qualname____doc__r
   r4   r	   rJ   r   r   r   r7   rA   boolr:   rC   no_gradr?   __classcell__r   r   r=   r   r!   5   sf    		
/r!   r&   rS   rT   rU   rV   rW   rX   rY   r*   r-   r'   r.   r+   r,   r/   r0   r1   rZ   c          '      C   s  t | D ]N\}}|| }|| }|| }|| }|| }|| }|
d u r/|jtjkr-dnd}
|d7 }t|dt||   }d| }t||
 }|d u rt|jd|	d\}}|	|j
|dd| |	|j
|dd| ||krv|d n|} |j
| dd}!||!  }"| }#||" |# }$n|d u r|d u sJ |	|| ||  }$|d ur|$d|$ d	 |  jdd
}%|$|% |d ur
|d ur
||jkr|	|$|d|  ||j}$n|	|$d|  | }$|r
|$| dk|j}&|&|&
 jdd |$|& |$| |dkrL|r0|d u r$|d|  n(|d|| |   n|d u r?|d||   n|d|d | |   |j|$dd qd S )NgHz>gKH9r   r"   T)r   )dimkeepdimr   g      ?)maxr   gMbP?)ming      )alpha)r   rA   r
   r3   ry   rJ   squarer    r   lerp_meanrsqrtnormnumelclamp_div_rK   clonemul_add_)'r&   rS   rT   rU   rV   rW   rX   rY   r*   r   r-   r'   r.   r+   r,   r/   r0   r1   rZ   r   paramr\   rO   rP   rR   rB   step_tbeta2_tone_minus_beta2_tgrad_sqrrj   rk   	reduce_dcrow_col_mean
row_factor
col_factorupdatedenommaskr   r   r   re      sb   "






re   c                C   s   J d)NFz2multi-tensor fn (foreach=True) not implemented yetr   )r&   rS   rT   rU   rV   rW   rX   rY   r*   r   r-   r'   r.   r+   r,   r/   r0   r1   rZ   r   r   r   rd   ;  s   rd   )rr   typingr   r   r   r   r
   r   torch.optimr   _typesr	   r   r   rs   tupler    r!   rJ   r7   rA   re   rd   r   r   r   r   <module>   s    

 !



	

f



	
