"""
Copyright NVIDIA/apex
This file is adapted from FP16_Optimizer in NVIDIA/apex
"""

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer
from deepspeed.runtime.utils import (get_global_norm, get_flattened_grad_norm, CheckOverflow, get_weight_norm,
                                     get_norm_with_moe_layers, is_model_parallel_parameter)
from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE
from deepspeed.utils import logger, log_dist
from deepspeed.utils.torch import required_torch_version
from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD
from deepspeed.accelerator import get_accelerator
from deepspeed.moe.utils import is_moe_param_group
from deepspeed.runtime.constants import PIPE_REPLICATED
from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank

OVERFLOW_CHECK_TIMER = 'overflow_check'
COMPUTE_NORM_TIMER = 'compute_norm'
UNSCALE_AND_CLIP_TIMER = 'unscale_and_clip'
BASIC_STEP_TIMER = 'basic_step'
UPDATE_FP16_TIMER = 'update_fp16'

OVERFLOW_TIMERS = [COMPUTE_NORM_TIMER, OVERFLOW_CHECK_TIMER]
STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP_TIMER, BASIC_STEP_TIMER, UPDATE_FP16_TIMER]
basic_stepupdate_fp16c                   @   s
  e Zd ZdZ											d6dd	Zd
d Zd7ddZd8ddZdd Zdd Z	dd Z
dd Zdd Zd8ddZd7ddZd9ddZd d! Zd"d# Zd$d% ZeeeZd&d' Zd(d) ZeeeZd*d+ Zd,d- Zd7d.d/Zd0d1 Zd2d3 Zd4d5 ZeeeZdS ):FP16_Optimizerz
   FP16 Optimizer for training fp16 models. Handles loss scaling.

   For usage example please see, TODO:  DeepSpeed V2 Tutorial
    N      ?F        T        c                 C   s  |
| _ || _|| _|| _| jj| _t  std|| _	g | _
g | _g | _g | _d| _d| _t| j	jD ]U\}}| j
|d  | jtdd | j
| D  t| j| | j
| }t| j
| |D ]\}}|j|_qd| j| j|     d| j| _| j| g|d< q5|rd| _d| _d	| _d
| _|d u r|| _d| _ d| _!n|t" | _|t# | _ |t$ | _!n	d| _d| _|| _|| _%d| _&d | _'|	| _(d
| _)t*ddrt+j,j-j.| _.nt+j,j-j/| _.|| _0d| _1t2| j
| j0|d| _3| 4  d S )Nz$Cannot use fp16 without accelerator.Fr   paramsc                 S   s   g | ]}|   qS  )clonedetach.0pr!   r!   Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/fp16/fused_optimizer.py
<listcomp>M   s    z+FP16_Optimizer.__init__.<locals>.<listcomp>Tr      i     g?)max_version)mpu	deepspeed)5fused_adam_legacytimersr.   has_moe_layerspipeline_parallelismusing_pipeliner   is_availableSystemError	optimizerfp16_groupsfp16_groups_flatfp32_groups_flatflatten_grad_norm_mask_listhas_executed_step_global_grad_norm	enumerateparam_groupsappendr   r   zipdatar"   floatr#   requires_graddynamic_loss_scalecur_iterlast_overflow_iterscale_factor	cur_scalescale_windowmin_loss_scaler   r   r   verbosecustom_loss_scalerexternal_loss_scale	clip_grad	norm_typer   torchnnutilsclip_grad_normclip_grad_norm_r-   overflowr   overflow_checkerinitialize_optimizer_states)selfinit_optimizerr.   static_loss_scalerD   initial_dynamic_scaledynamic_loss_argsrK   r-   rN   r/   r1   r0   iparam_groupupdated_paramsr&   qr!   r!   r'   __init__(   sd   

 



zFP16_Optimizer.__init__c                 C   sj   t | jD ]\}}tj| j|  | j| jd| j| _q| j	  t | jD ]

    def initialize_optimizer_states(self):
        # Give the flat fp32 master weights zero gradients and run one dummy
        # step so optimizers with lazily created state (e.g. Adam's moment
        # buffers) allocate it up front.
        for i, group in enumerate(self.fp16_groups):
            self.fp32_groups_flat[i].grad = torch.zeros(self.fp32_groups_flat[i].size(),
                                                        device=self.fp32_groups_flat[i].device)

        self.optimizer.step()

        for i, group in enumerate(self.fp16_groups):
            self.fp32_groups_flat[i].grad = None

    def zero_grad(self, set_to_none=True):
        """
        Zero FP16 parameter grads.
        """
        # For speed, set model fp16 grads to None by default
        for group in self.fp16_groups:
            for p in group:
                if set_to_none:
                    p.grad = None
                else:
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()
zFP16_Optimizer.zero_gradc                 C   s2  g }g }t | jD ]\}}|tdd |D  |t|| | jd q	| j|| _| j	}| 
| j | jrI| jrFtd|| j	 | jS t|d}| j||dd}|| j	 | _| jjdd |D d	d | jD ||d
 tt|D ]}t| j| | j| }	t| j| |	D ]\}
}|j|
_qqw| jS ))
        Not supporting closure.
        c                 S   s4   g | ]}|j d u rtj| |j|jdn|j qS N)dtyperc   )rf   rP   rd   re   ro   rc   r$   r!   r!   r'   r(      s    (z2FP16_Optimizer.step_fused_adam.<locals>.<listcomp>)r-   ze[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss scale: {}, reducing to {}	norm_listF)apply_scalec                 S      g | ]}|gqS r!   r!   )r%   gr!   r!   r'   r(          c                 S   rs   r!   r!   r$   r!   r!   r'   r(      ru   )gradsoutput_paramsscale
grad_norms)r=   r7   r?   r   r   r-   rV   check_using_normrU   rH   _update_scalerK   r   infoformatr   unscale_and_clip_gradsr<   r6   rg   r8   rangelenr   r@   rA   )rX   closuregrads_groups_flatnorm_groupsr]   rh   
prev_scalescaled_grad_normcombined_scaler_   r&   r`   r!   r!   r'   step_fused_adam   s@   


zFP16_Optimizer.step_fused_adamc                 C   s   | j jD ]}||d< qdS )zSet the learning rate.lrNr6   r>   )rX   r   r^   r!   r!   r'   set_lr   s   
zFP16_Optimizer.set_lrc                 C   s   | j jd d S )z!Return the current learning rate.r   r   r   rX   r!   r!   r'   get_lr   s   zFP16_Optimizer.get_lrc                 C   s2   || j krtd| j  d|  d| _|| _ d S )Nz$[deepspeed] setting loss scale from z -> T)rM   r   r|   rL   )rX   
loss_scaler!   r!   r'   override_loss_scale   s   

z"FP16_Optimizer.override_loss_scalec                 C   s0   t |tr
|jr
dS |dkrt|sdS d S d S )NTr   )hasattrr   ds_pipe_replicatedr
   )rX   r&   tensor_model_parallel_rankr!   r!   r'   _require_avoid_recompute_norm   s
   z,FP16_Optimizer._require_avoid_recompute_normc                 C   s   g }d}d}|D ]4}||   }|jdur:| |t| jr:t|dkr3||d d kr3||d d< n|||g |}qtj|t	 
 dS )aT  The function preserves the parallel information for norm
        from unflattened gradients.

        Args:
            group (Iterable[Tensor] ): params group

        Returns:
            torch.Tensor: A 2D tensor containing index ranges for each group,
                      where each row represents a [start index, end index].
        r   Nr)   rb   )numelrf   r   r   r-   r   r?   rP   tensorr   current_device_name)rX   rh   group_mask_idx_listgrad_flat_st_idxgrad_flat_en_idxr&   r!   r!   r'   _get_norm_mask_idx   s   z!FP16_Optimizer._get_norm_mask_idxc                    s  | j r|  S | t  g }t| jD ]\}}|dd |D  q| j	|| _
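
    # Worked example (a sketch with hypothetical sizes): suppose a group holds
    # three params of 4, 6, and 10 elements, and only the first two carry
    # gradients that must be masked (e.g. both are pipeline-replicated). Their
    # flat-buffer ranges are [0, 4) and [4, 10); since the second range starts
    # where the first ends, they merge, and _get_norm_mask_idx returns
    # tensor([[0, 10]]) rather than two separate rows.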

    def step(self, closure=None):
        """
        Not supporting closure.
        """

        if self.fused_adam_legacy:
            return self.step_fused_adam()

        # First determine if there is overflow.
        self.timers(OVERFLOW_CHECK_TIMER).start()
        fp16_params = []
        for i, group in enumerate(self.fp16_groups):
            fp16_params.extend([p for p in group if p.grad is not None])
        self.overflow = self.overflow_checker.has_overflow(fp16_params)
        self.timers(OVERFLOW_CHECK_TIMER).stop()
        prev_scale = self.cur_scale
        self._update_scale(self.overflow)
        if self.overflow:
            if self.verbose:
                log_dist(
                    "Overflow detected. Skipping step. Attempted loss "
                    f"scale: {prev_scale}, reducing to {self.cur_scale}",
                    ranks=[0])
            # Clear gradients
            for i, group in enumerate(self.fp16_groups):
                for p in group:
                    p.grad = None

            self.timers.log(OVERFLOW_TIMERS)
            return self.overflow

        grads_groups_flat = []
        non_experts_grads_for_norm = []
        expert_grads_for_norm = {}
        assert len(self.fp16_groups) == len(self.optimizer.param_groups)

        for i, group in enumerate(self.fp16_groups):
            data_type = self.fp32_groups_flat[i].dtype

            grads_groups_flat.append(
                _flatten_dense_tensors([
                    torch.zeros(p.size(), dtype=data_type, device=p.device) if p.grad is None else p.grad.to(data_type)
                    for p in group
                ]))

            self.fp32_groups_flat[i].grad = grads_groups_flat[i]
            param_group = self.optimizer.param_groups[i]

            # split expert and non-expert grads for the norm computation
            if self.has_moe_layers and is_moe_param_group(param_group):
                if param_group['name'] not in expert_grads_for_norm:
                    expert_grads_for_norm[param_group['name']] = []
                expert_grads_for_norm[param_group['name']].append(self.fp32_groups_flat[i])
            else:
                # collect the mask needed for the flat-grad norm only once
                if not self.has_executed_step:
                    cur_flat_grad_norm_mask = self._get_norm_mask_idx(group)
                    self.flatten_grad_norm_mask_list.append(cur_flat_grad_norm_mask)

                non_experts_grads_for_norm.append(self.fp32_groups_flat[i])

            for p in group:
                p.grad = None

        self.timers(COMPUTE_NORM_TIMER).start()

        all_groups_norm = get_flattened_grad_norm(non_experts_grads_for_norm,
                                                  mpu=self.mpu,
                                                  grad_norm_mask=self.flatten_grad_norm_mask_list)

        if self.has_moe_layers:
            all_groups_norm = get_norm_with_moe_layers(all_groups_norm,
                                                       mpu=self.mpu,
                                                       expert_tensors=expert_grads_for_norm,
                                                       norm_type=self.norm_type)

        scaled_global_grad_norm = get_global_norm(norm_list=[all_groups_norm])
        self.timers(COMPUTE_NORM_TIMER).stop()

        # Stash unscaled gradient norm
        self._global_grad_norm = scaled_global_grad_norm / self.cur_scale

        self.timers(UNSCALE_AND_CLIP_TIMER).start()
        self.unscale_and_clip_grads(grads_groups_flat, scaled_global_grad_norm)
        self.timers(UNSCALE_AND_CLIP_TIMER).stop()

        self.timers(BASIC_STEP_TIMER).start()
        self.optimizer.step()
        self.timers(BASIC_STEP_TIMER).stop()

        # get rid of the fp32 gradients; not needed anymore
        for group in self.fp32_groups_flat:
            group.grad = None

        self.timers(UPDATE_FP16_TIMER).start()
        for i in range(len(self.fp16_groups)):
            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data.copy_(q.data)
        self.has_executed_step = True
        self.timers(UPDATE_FP16_TIMER).stop()

        self.timers.log(STEP_TIMERS)

        return self.overflow

    def unscale_and_clip_grads(self, grad_groups_flat, total_norm, apply_scale=True):
        # compute combined scale factor for this group
        combined_scale = self.cur_scale
        if self.clip_grad > 0.:
            # norm is in fact norm*scale
            clip = ((total_norm / self.cur_scale) + 1e-6) / self.clip_grad
            if clip > 1:
                combined_scale = clip * self.cur_scale

        if apply_scale:
            for grad in grad_groups_flat:
                grad.data.mul_(1. / combined_scale)

        return combined_scale
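
    # Worked example (hypothetical numbers): with cur_scale=1024, clip_grad=1.0
    # and a scaled norm total_norm=5120, the true norm is 5120/1024 = 5.0, so
    # clip is about 5.0 and combined_scale about 5120. Each gradient is then
    # multiplied by 1/5120, which removes the loss scale and clips the global
    # norm to roughly 1.0 in a single multiply.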

    def backward(self, loss, create_graph=False, retain_graph=False):
        """
        :attr:`backward` performs the following steps:

        1. fp32_loss = loss.float()
        2. scaled_loss = fp32_loss*loss_scale
        3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves
        """
        if self.custom_loss_scaler:
            scaled_loss = self.external_loss_scale * loss
            scaled_loss.backward()
        else:
            scaled_loss = (loss.float()) * self.cur_scale
            scaled_loss.backward(create_graph=create_graph, retain_graph=retain_graph)
 dkrd|  j| j9  _| jrdt	d| j
 d t	d| d| j  n|rut	d	| j t	d
| j |  jd7  _d S )Nz
Grad overflow on iteration z!Reducing dynamic loss scale from z to r+   r   zNo Grad overflow for z iterationsz#Increasing dynamic loss scale from zGrad overflow on iteration: %szUsing static loss scale of: %s)rD   rH   maxrG   rJ   rE   rF   rK   r   r|   rI   )rX   skipr   stable_intervalr!   r!   r'   r{     s*   zFP16_Optimizer._update_scalec                 C      | j jS r   r6   stater   r!   r!   r'   
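
    # Timeline sketch (hypothetical run with scale_factor=2, scale_window=1000
    # and an initial scale of 2**16): an overflow at iteration 37 halves the
    # scale to 2**15 and records last_overflow_iter=37; if the next 1000
    # iterations stay overflow-free, the scale doubles back to 2**16, and the
    # cycle repeats.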

    # Promote state so it can be retrieved or set via
    # "fp16_optimizer_instance.state"
    def _get_state(self):
        return self.optimizer.state

    def _set_state(self, value):
        self.optimizer.state = value

    state = property(_get_state, _set_state)

    # Promote param_groups so they can be retrieved or set via
    # "fp16_optimizer_instance.param_groups" (for example, to adjust the
    # learning rate)
    def _get_param_groups(self):
        return self.optimizer.param_groups

    def _set_param_groups(self, value):
        self.optimizer.param_groups = value

    param_groups = property(_get_param_groups, _set_param_groups)
|t< |S )a  
        Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
        This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
        of the contained Pytorch optimizer.
        Example::
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            torch.save(checkpoint, "saved.pth")
        rD   rH   rE   rF   rG   rI   r9   )rD   rH   rE   rF   rG   rI   r6   
state_dictr   r9   rN   r   )rX   r   r!   r!   r'   r     s   







zFP16_Optimizer.state_dictc                 C   s*   t | j| jD ]\}}|j|j qd S r   )r@   r9   r8   rA   r   )rX   currentsavedr!   r!   r'   refresh_fp32_params  s   z"FP16_Optimizer.refresh_fp32_paramsc                 C   s   |d | _ |d | _|d | _|d r"|d | _|d | _|d | _|r,| j|t  |t	 | _
t| j|d D ]\}}|j|j q9dS )	a%  
        Loads a state_dict created by an earlier call to state_dict().
        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
        whose parameters in turn came from ``model``, it is expected that the user
        will call ``model.load_state_dict()`` before
        ``fp16_optimizer_instance.load_state_dict()`` is called.
        Example::
            model = torch.nn.Linear(D_in, D_out).to(get_accelerator().device_name()).half()
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
            ...
            checkpoint = torch.load("saved.pth")
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        rD   rH   rE   rF   rG   rI   r9   N)rD   rH   rE   rF   rG   rI   r6   load_state_dictr   r   rN   r@   r9   rA   r   )rX   r   load_optimizer_statesr   r   r!   r!   r'   r     s   






zFP16_Optimizer.load_state_dictc                 C   s
   t | jS r   )reprr6   r   r!   r!   r'   __repr__  s   
zFP16_Optimizer.__repr__c                 C   s   | j r| jS | jS r   )rL   rM   rH   r   r!   r!   r'   _get_loss_scale  s   zFP16_Optimizer._get_loss_scalec                 C   r   r   )loss_scalerrH   r   r!   r!   r'   _set_loss_scale  r   zFP16_Optimizer._set_loss_scale)Nr   Fr   NTNr   FFN)Tr   )FF)__name__