# funasr/optimizers/fairseq_adam.py

import math

import torch
import torch.optim


class FairseqAdam(torch.optim.Optimizer):
    """Implements Adam algorithm.

    This implementation is modified from torch.optim.Adam based on:
    `Fixed Weight Decay Regularization in Adam`
    (see https://arxiv.org/abs/1711.05101)

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        adam_betas=(0.9, 0.999),
        adam_eps=1e-8,
        weight_decay=0,
        amsgrad=False,
    ):
        defaults = dict(
            lr=lr,
            betas=adam_betas,
            eps=adam_eps,
            weight_decay=weight_decay,
            amsgrad=amsgrad,
        )
        super(FairseqAdam, self).__init__(params, defaults)
        self.optimizer_lr = lr

    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return True

    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.dtype in {torch.float16, torch.bfloat16}:
                    grad = grad.float()
                if grad.is_sparse:
                    raise RuntimeError(
                        "Adam does not support sparse gradients, "
                        "please consider SparseAdam instead"
                    )
                amsgrad = group.get("amsgrad", False)

                p_data_fp32 = p.data
                if p.data.dtype in {torch.float16, torch.bfloat16}:
                    p_data_fp32 = p_data_fp32.float()

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p_data_fp32)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state["max_exp_avg_sq"] = torch.zeros_like(p_data_fp32)
                else:
                    state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
                    state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32)
                    if amsgrad:
                        state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to(p_data_fp32)

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                if amsgrad:
                    max_exp_avg_sq = state["max_exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1

                # Decay the first and second moment running average coefficients
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running averages so far
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing the running avg. of the gradient
                    denom = max_exp_avg_sq.sqrt().add_(group["eps"])
                else:
                    denom = exp_avg_sq.sqrt().add_(group["eps"])

                bias_correction1 = 1 - beta1 ** state["step"]
                bias_correction2 = 1 - beta2 ** state["step"]
                step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1

                if group["weight_decay"] != 0:
                    # Decoupled weight decay (arXiv:1711.05101): decay the weights
                    # directly rather than adding an L2 term to the gradient.
                    p_data_fp32.add_(
                        p_data_fp32, alpha=-group["weight_decay"] * group["lr"]
                    )

                p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)

                if p.data.dtype in {torch.float16, torch.bfloat16}:
                    p.data.copy_(p_data_fp32)

        return loss

    def set_lr(self, lr):
        """Set the learning rate."""
        for param_group in self.param_groups:
            param_group["lr"] = lr
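
# Minimal usage sketch, assuming a toy linear model; the model, tensor
# shapes, and hyperparameter values below are illustrative and not part of
# this module. Note that the constructor takes `adam_betas` / `adam_eps`
# where torch.optim.Adam uses `betas` / `eps`.
if __name__ == "__main__":
    model = torch.nn.Linear(4, 2)
    optimizer = FairseqAdam(
        model.parameters(),
        lr=1e-3,
        adam_betas=(0.9, 0.999),
        adam_eps=1e-8,
        weight_decay=0.01,
    )

    # One optimization step on a dummy batch.
    x = torch.randn(8, 4)
    loss = model(x).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # An external LR scheduler can update all parameter groups at once.
    optimizer.set_lr(5e-4)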