o
    Ti                     @   sP   d Z ddlZddlmZ edZddlmZ ddlmZ G dd	 d	ej	j
ZdS )
z*
This file is modified from fused_adam.py
    N   )MultiTensorApplyi   )get_accelerator)FusedLionBuilderc                       s8   e Zd ZdZd fdd	Z fdd	ZdddZ  ZS )	FusedLiona  Implements Lion algorithm.

    Currently GPU-only.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        set_grad_none (bool, optional): whether set grad to None when zero_grad()
            method is called. (default: True)

    .. _Symbolic Discovery of Optimization Algorithms:
        https://doi.org/10.48550/arXiv.2302.06675
    MbP?g?g+?        Tc                    sL   t |||d}tt| || || _t  }t dg| _	|j
| _
d S )N)lrbetasweight_decayr   )dictsuperr   __init__set_grad_noner   loadr   	IntTensor_dummy_overflow_bufmulti_tensor_lion)selfparamsr
   r   r   r   defaultsfused_lion_cuda	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/lion/fused_lion.pyr   $   s   
zFusedLion.__init__c                    s<   | j r| jD ]}|d D ]}d |_qqd S tt|   d S )Nr   )r   param_groupsgradr   r   	zero_grad)r   grouppr   r   r   r   /   s   
zFusedLion.zero_gradNc                 C   s  t dd ||||fD rtdd}|dur| }| jD ]"}t|d dkr)q|d \}	}
d|vr7d|d< g g g }}}g g g }}}g g g }}}|d D ]~}|jdu r[qS|jjjrdtd	| j| }t|dkr|	dd|d< t
|j|d
< |jt
jkr||jj ||j ||d
  qS|jt
jkr||j || ||d
  qS|jt
jkr||jj ||j ||d
  qStdt|dkr|d  d7  < t| j| j|||g|d |	|
|d |d  t|dkr|d  d7  < t| j| j|||g|d |	|
|d |d  t|dkr@|d  d7  < t| j| j|||g|d |	|
|d |d  q|S )a+  Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        The remaining arguments are deprecated, and are only retained (for the moment) for error-checking purposes.
        c                 s   s    | ]}|d uV  qd S )Nr   ).0r!   r   r   r   	<genexpr>@   s    z!FusedLion.step.<locals>.<genexpr>zFusedLion has been updated.Nr   r   r   stepz+FusedLion does not support sparse gradientsexp_avgz+FusedLion only support fp16, bf16 and fp32.r   r
   r   )anyRuntimeErrorr   lenr   data	is_sparseNotImplementedErrorstategettorch
zeros_likedtypefloat16appendbfloat16float32multi_tensor_applierr   r   )r   closuregradsoutput_paramsscale
grad_normsgrad_scalerlossr    beta1beta2g_16p_16m_16g_bfp_bfm_bfg_32p_32m_32r!   r,   r   r   r   r$   7   sj   	



zFusedLion.step)r   r   r	   T)NNNNNN)__name__
__module____qualname____doc__r   r   r$   __classcell__r   r   r   r   r      s
    r   )rK   r.   multi_tensor_applyr   r5   deepspeed.acceleratorr   deepspeed.ops.op_builderr   optim	Optimizerr   r   r   r   r   <module>   s   