o
    Ti`                     @   sP   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ G dd de j	j
ZdS )    N)get_cpu_info)logger)should_log_le)CPUAdamBuilderc                       sX   e Zd ZdZ								d fdd	Zd	d
 Z fddZe dddZ	  Z
S )DeepSpeedCPUAdamr   MbP?Tg?g+?:0yE>Fc
                    s   t ||||||d}
tt| ||
 t }d|v r |d  nd| _d| jv rKt| jD ]\}}t|d D ]\}}|j	t
jkrHtd  nq7q- tj| _tjd t_|| _|	| _t  | _| j| j||d |d |||td	 d
S )af
  Fast vectorized implementation of two variations of Adam optimizer on CPU:

        * Adam: A Method for Stochastic Optimization: (https://arxiv.org/abs/1412.6980);
        * AdamW: Fixing Weight Decay Regularization in Adam (https://arxiv.org/abs/1711.05101)

        DeepSpeed CPU Adam(W) provides between 5x to 7x speedup over torch.optim.adam(W).
        In order to apply this optimizer, the model requires to have its master parameter (in FP32)
        reside on the CPU memory.

        To train on a heterogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
        the ZeRO-Offload technology which efficiently offloads the optimizer states into CPU memory,
        with minimal impact on training throughput. DeepSpeedCPUAdam plays an important role to minimize
        the overhead of the optimizer's latency on CPU. Please refer to ZeRO-Offload tutorial
        (https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to enable this technology.

        For calling step function, there are two options available: (1) update optimizer's states and (2) update
        optimizer's states and copy the parameters back to GPU at the same time. We have seen that the second
        option can bring 30% higher throughput than the doing the copy separately using option one.


        .. note::
                We recommend using our `config
                <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`_
                to allow :meth:`deepspeed.initialize` to build this optimizer
                for you.


        Arguments:
            model_params (iterable): iterable of parameters to optimize or dicts defining
                parameter groups.
            lr (float, optional): learning rate. (default: 1e-3)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square. (default: (0.9, 0.999))
            eps (float, optional): term added to the denominator to improve
                numerical stability. (default: 1e-8)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
            amsgrad (boolean, optional): whether to use the AMSGrad variant of this
                algorithm from the paper `On the Convergence of Adam and Beyond`_
                (default: False) NOT SUPPORTED in DeepSpeed CPUAdam!
            adamw_mode: select between Adam and AdamW implementations (default: AdamW)
            fp32_optimizer_states: creates momentum and variance in full precision regardless of
                        the precision of the parameters (default: True)
        )lrbetasepsweight_decaybias_correctionamsgradvendor_id_rawunknownamdparamsz0FP16 params for CPUAdam may not work on AMD CPUs   r   infoN)dictsuperr   __init__r   lower
cpu_vendor	enumerateparam_groupsdtypetorchhalfr   warningoptimizer_idopt_idadam_w_modefp32_optimizer_statesr   loadds_opt_adamcreate_adamr   )selfmodel_paramsr
   r   r   r   r   r   
adamw_moder$   default_argscpu_infogroup_idgroupparam_idp	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/ops/adam/cpu_adam.pyr      s6   6

zDeepSpeedCPUAdam.__init__c                 C   s   | j | j d S N)r&   destroy_adamr"   )r(   r3   r3   r4   __del__c   s   zDeepSpeedCPUAdam.__del__c                    s,   t t| | | jD ]}|dd qd S )Nr   F)r   r   __setstate__r   
setdefault)r(   stater.   r1   r3   r4   r8   h   s   
zDeepSpeedCPUAdam.__setstate__Nc                 C   sX  d}|durt   | }W d   n1 sw   Y  t d}t| jD ]\}}t|d D ]w\}}|jdu r;q1|j|ksIJ d|j d| j| }t|dkrwd|d< | jr^t j	n|j
}	t j|j|	|d|d	< t j|j|	|d|d
< |d  d7  < |d \}
}| j| j|d |d |
||d |d |d |j|jj|d	 |d
  q1q'|S )a  Update the model parameters.

        .. note::
            This method will be called internally by ZeRO-Offload. DeepSpeed
            users should still use ``engine.step()`` as shown in the
            `Getting Started
            <https://www.deepspeed.ai/getting-started/#training>`_ guide.

        Args:
            closure (callable, optional): closure to compute the loss.
                Defaults to ``None``.

        Returns:
            loss: if ``closure`` is provided. Otherwise ``None``.
        Ncpur   zCPUAdam param is on zY and must be 'cpu', make sure you enabled 'offload_optimizer': 'cpu' in your ZeRO config.r   step)r   deviceexp_avg
exp_avg_sqr   r   r
   r   r   r   )r   enable_gradr=   r   r   gradr:   lenr$   floatr   
zeros_likedatar&   adam_updater"   )r(   closurelossr=   r-   r.   r/   r0   r:   state_dtypebeta1beta2r3   r3   r4   r<   m   s2   



 zDeepSpeedCPUAdam.step)r   Tr   r	   r   FTTr5   )__name__
__module____qualname__r!   r   r7   r8   r   no_gradr<   __classcell__r3   r3   r1   r4   r      s    Sr   )r   cpuinfor   deepspeed.utilsr   deepspeed.utils.loggingr   deepspeed.ops.op_builderr   optim	Optimizerr   r3   r3   r3   r4   <module>   s   