import math
from copy import deepcopy

import torch
import torch.optim


class AdaHessian(torch.optim.Optimizer):
    """Implements the AdaHessian algorithm.

    AdaHessian is an Adam-style optimizer, proposed in "ADAHESSIAN: An
    Adaptive Second Order Optimizer for Machine Learning", that divides the
    update by a moving average of a Hutchinson estimate of the Hessian
    diagonal instead of the squared gradient.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts
            defining parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of the gradient and of the squared Hessian
            diagonal (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator for numerical
            stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        block_length (int, optional): block size over which the Hessian
            diagonal estimate is averaged for 2D weights (default: 1)
        hessian_power (float, optional): exponent applied to the Hessian
            diagonal term in the denominator (default: 1)
        single_gpu (bool, optional): whether training runs on a single GPU
            (default: False); stored but unused in this version
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        block_length=1,
        hessian_power=1,
        single_gpu=False,
    ):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super(AdaHessian, self).__init__(params, defaults)
        self.block_length = block_length
        self.single_gpu = single_gpu  # kept for API compatibility; unused here
        self.hessian_power = hessian_power
zAdaHessian.__init__c                 C   s  | j d d }ttdd |}dd |D }|D ]}d||dk < d	||dk< qtjj|||d
d
d}g }t||D ]R\}}| }	t|	d	krWt	|| d }
|
|
 q:t|	dkrt	|| d d| j}t	tj|d	gddt| j }|| j|	}|
| q:|S )a  
        compute the Hessian vector product with v, at the current gradient
        point or compute the gradient of <gradsH,v>.
        :param v: a list of torch tensors
        :param gradsH: a list of torch variables
        :return: a list of torch tensors
        r   r   c                 S   s   | j S )N)requires_grad)xr   r   r   <lambda>'   s    z&AdaHessian.get_trace.<locals>.<lambda>c                 S   s   g | ]	}t j|d dqS )   )high)torchrandint_like).0pr   r   r   
<listcomp>)   s    z(AdaHessian.get_trace.<locals>.<listcomp>g      ?r   T)grad_outputsonly_inputsretain_graphg        r   )dim)param_groupslistfilterr   autogradgradzipsizelenabsappendviewr   sumfloatrepeat_interleave)r   gradsHr   vv_ihvshutchinson_tracehvvi
param_size
tmp_outputtmp_output1tmp_output2tmp_output3r   r   r   	get_trace   s@   	

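
    # Rationale for get_trace: if v has i.i.d. Rademacher entries (-1/+1),
    # then E[v * (H @ v)] = diag(H), so |v * (H @ v)| from a single draw is a
    # one-sample estimate of the absolute Hessian diagonal; averaging it over
    # blocks of `block_length` entries (for 2D weights) reduces its variance.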
zAdaHessian.get_traceNc                 C   s  d}|dur	| }|  |}| jD ]}t|d D ]\}}|jdu r#qt|| j }|jr3td|j }	| j	| }
t
|
dkrVd|
d< t|	|
d< t|	|
d< n|
d |	|
d< |
d |	|
d< |
d |
d }}|d \}}|
d  d	7  < ||d	| | ||d	| || ||  d	||
d   }d	||
d   }| jd	k r| t| | j |d
 }n| t| |d
 }|d | }|d dkr|	|d  |d  |	 |	| || |j|	 qq|S )zPerforms a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   zPAdaHessian does not support sparse gradients, please consider SparseAdam insteadr   stepexp_avgexp_hessian_diag_sqr	   r   r
   r   r   )rB   r(   	enumerater,   r   datar4   	is_sparseRuntimeErrorstater/   r   
zeros_liketype_asmul_add_addcmul_r   sqrtmathaddcdiv_copy_)r   r6   closureloss	hut_tracegroupir!   r,   p_data_fp32rJ   rD   rE   beta1beta2bias_correction1bias_correction2denom	step_sizer   r   r   rC   G   sx   








	
KzAdaHessian.step)r   r   r   r   r   r   F)NN)__name__
__module____qualname____doc__r   rB   rC   __classcell__r   r   r   r   r      s    *r   )rQ   copyr   r   torch.optimoptim	Optimizerr   r   r   r   r   <module>   s
    
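
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; the model, data, and hyper-
# parameters below are assumed, not part of this module). Unlike first-order
# optimizers, AdaHessian needs the backward pass to keep the graph
# (create_graph=True) so that get_trace can differentiate the gradients a
# second time, and the gradients are passed to step() explicitly.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    model = torch.nn.Linear(10, 1)
    optimizer = AdaHessian(model.parameters(), lr=1e-3)

    x = torch.randn(32, 10)
    y = torch.randn(32, 1)

    loss = torch.nn.functional.mse_loss(model(x), y)
    # Keep the graph alive so Hessian-vector products can be taken later.
    loss.backward(create_graph=True)

    gradsH = [p.grad for p in model.parameters() if p.requires_grad]
    optimizer.step(gradsH=gradsH)
    optimizer.zero_grad()
    print("loss:", loss.item())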