import math
from typing import List

import torch
from torch import Tensor
from torch.optim.optimizer import Optimizer


class MultiTensorApply(object):
    available = False
    warned = False

    def __init__(self, chunk_size):
        try:
            MultiTensorApply.available = True
            self.chunk_size = chunk_size
        except ImportError as err:
            MultiTensorApply.available = False
            MultiTensorApply.import_err = err

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
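
# The helper kernels at the bottom of this file implement the Adan update. Written informally in
# terms of the state-buffer names used below (a summary of the code, not a quotation from the
# paper), with g_t the (optionally clipped) gradient at step t:
#
#     exp_avg      <- beta1 * exp_avg      + (1 - beta1) * g_t
#     exp_avg_diff <- beta2 * exp_avg_diff + (1 - beta2) * (g_t - g_{t-1})
#     exp_avg_sq   <- beta3 * exp_avg_sq   + (1 - beta3) * (g_t + beta2 * (g_t - g_{t-1})) ** 2
#     denom         = sqrt(exp_avg_sq / bias_correction3) + eps
#     param        <- param - lr * (exp_avg / bias_correction1
#                                   + beta2 * exp_avg_diff / bias_correction2) / denom
#
# Decoupled weight decay is applied multiplicatively before this update when ``no_prox`` is True,
# and as a proximal-style division by ``1 + lr * weight_decay`` after it otherwise.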


class Adan(Optimizer):
    """
    Implements a PyTorch variant of Adan.
    Adan was proposed in
    "Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models",
    arXiv preprint arXiv:2208.06677, 2022. https://arxiv.org/abs/2208.06677
    Arguments:
        params (iterable): iterable of parameters to optimize or
            dicts defining parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float, float], optional): coefficients used for the
            first-order, gradient-difference, and second-order moment estimates.
            (default: (0.98, 0.92, 0.99))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay
            (L2 penalty) (default: 0)
        max_grad_norm (float, optional): value used to clip the
            global grad norm (default: 0.0, i.e. no clipping)
        no_prox (bool): if True, apply the decoupled weight decay as a plain
            multiplicative decay before the update; if False, apply it as a
            proximal-style division after the update. (default: False)
        foreach (bool): if True, use the torch._foreach implementation.
            It is faster but uses slightly more memory. (default: True)
        fused (bool, optional): whether fused implementation is used.
            (default: False)
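
    Example (illustrative usage sketch; ``model``, ``loss_fn``, ``data`` and ``target`` are
    placeholders rather than names defined in this module):

        >>> optimizer = Adan(model.parameters(), lr=1e-3, weight_decay=0.02)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(data), target).backward()
        >>> optimizer.step()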
    MbP?g\(\?gq=
ףp?gGz?:0yE>        FTforeachfusedc
              
      s   d|kst d|d|kst d|d|ks!t d|d|d   kr-dk s7n t d|d d|d   krCdk sMn t d	|d d|d
   krYdk scn t d|d
 t||||||||	d}
t ||
 d S )Nr   zInvalid Max grad norm: {}zInvalid learning rate: {}zInvalid epsilon value: {}r         ?z%Invalid beta parameter at index 0: {}   z%Invalid beta parameter at index 1: {}   z%Invalid beta parameter at index 2: {})lrbetasepsweight_decaymax_grad_normno_proxr   r   )
ValueErrorformatdictsuperr   )r
   paramsr#   r$   r%   r&   r'   r(   r   r   defaults	__class__r   r   r   P   s.   
zAdan.__init__c                    s,   t t| | | jD ]}|dd qd S )Nr(   F)r,   r   __setstate__param_groups
setdefault)r
   stategroupr/   r   r   r1   t   s   
zAdan.__setstate__c                 C   s`   | j D ]*}d|d< |d D ]}|jr,| j| }t||d< t||d< t||d< qqd S )Nr   stepr-   exp_avg
exp_avg_sqexp_avg_diff)r2   requires_gradr4   torch
zeros_like)r
   r5   pr4   r   r   r   restart_opty   s   

zAdan.restart_optNc                 C   sR  d}|durt   | }W d   n1 sw   Y  | jd dkrs| jd d d j}t jd|d}t j| jd |d}| jD ]}|d D ]}|jdur\|j}||	d
  qHqBt |}t j|||d   d	d
 }	nd	}	| jD ]-}g }
g }g }g }g }g }|d \}}}d|v r|d  d7  < nd|d< d	||d   }d	||d   }d	||d   }|d D ]e}|jdu rq|
| ||j | j| }t|dkrt ||d< t ||d< t ||d< d|vs|d dkr|j |	 |d< ||d  ||d  ||d  ||d  qtd#i d|
d|d|d|d|d|d|d|d|d|d|dt|d|d d|d d|d d|d d|	}|d  r|d! rt j r|td#i | qxtd"td#i | qx|d! rt j rtd#i | qxtd"td#i | qx|S )$z$Performs a single optimization step.Nr'   r   r-   r!   )devicer"   r%   r    )maxr$   r6   r7   r8   r9   neg_pre_gradgradsexp_avgsexp_avg_sqsexp_avg_diffsneg_pre_gradsbeta1beta2beta3bias_correction1bias_correction2bias_correction3_sqrtr#   r&   r(   clip_global_grad_normr   r   zFused Adan does not support CPUr   )r;   enable_gradr.   r2   r?   zerostensorgradadd_powsumsqrtclampitemappendr4   lenr<   clonemul_r+   mathcudais_available_fused_adan_multi_tensorr)   _multi_tensor_adan_fused_adan_single_tensor_single_tensor_adan)r
   closurelossr?   global_grad_normr'   r5   r=   rQ   rM   params_with_gradrB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   bias_correction3r4   kwargsr   r   r   r6      s   



 


	




z	Adan.step)r   r   r   r   r   FTFr   )r   r   r   __doc__boolr   r1   r;   no_gradr>   r6   __classcell__r   r   r/   r   r   5   s(    	
$
r   r-   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   r#   r&   r%   r(   rM   c                C   sV  t | D ]\}}|| }|| }|| }|| }|| }|| || ||j|d| d ||j|d| d ||| ||j||d| d | | |}|| |
 }||	 }|r|d||   |j||| d |j||| d n|j||| d |j||| d |d||   | j|dd qd S )Nr!   alphavalue      )	enumerater[   rR   addcmul_rU   addcdiv_div_zero_)r-   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   r#   r&   r%   r(   rM   iparamrQ   r7   r8   r9   neg_grad_or_diffdenomstep_size_diff	step_sizer   r   r   rb      s0   

rb   c                C   st  t | dkrd S t|| t|| t|| tj||d| d t|| tj||d| d t|| t|| t|| tj|||d| d t|}t|| t|| || |
 }||	 }|rt| d||   tj| ||| d tj| ||| d ntj| ||| d tj| ||| d t| d||   t| tj||dd d S )Nr   r!   rm   ro   rq   )	rY   r;   _foreach_mul__foreach_add__foreach_addcmul__foreach_sqrt_foreach_div__foreach_addcdiv__foreach_zero_)r-   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   r#   r&   r%   r(   rM   rz   r{   r|   r   r   r   r`   +  s4   

r`   c                C   sl   dd l }td}tjdg}||j|| |||||g||||	|
|||||| t| tj||dd d S )Nr   i   rq   rm   )
fused_adanr   r;   r]   	IntTensoradan_multi_tensorr   r~   )r-   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   r#   r&   r%   r(   rM   r   multi_tensor_applier_dummy_overflow_bufr   r   r   r_   f  s*   
r_   c                C   s   t | D ]Z\}}|j }|j}|| }|| }|| }|| }|| }tj|j" dd l}||||||||||||	|
|||||| W d    n1 sPw   Y  | j	|dd qd S )Nr   rq   rm   )
rr   datafloatr;   r]   r?   r   adan_single_tensorrv   rR   )r-   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   r#   r&   r%   r(   rM   rw   rx   p_data_fp32out_prQ   r7   r8   r9   neg_gradr   r   r   r   ra     sB   
ra   )r\   typingr   r;   r   torch.optim.optimizerr   objectr   r   r   rj   rb   r`   r_   ra   r   r   r   r   <module>   s(   @	

7	

;	

,	

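

if __name__ == "__main__":
    # Minimal CPU smoke test / usage sketch (illustrative only; not part of the optimizer API).
    # It fits a single scalar weight so that 3 * w is approximately 6, using the plain
    # single-tensor path, and is merely meant to show the optimizer's call pattern.
    w = torch.nn.Parameter(torch.zeros(1))
    optimizer = Adan([w], lr=0.1, foreach=False)
    for _ in range(300):
        loss = ((3.0 * w - 6.0) ** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"w after training: {w.item():.3f} (should approach 2.0)")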