# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
"""
Copyright NVIDIA/apex
This file is adapted from FP16_Optimizer in NVIDIA/apex
"""

from deepspeed.moe.utils import split_params_grads_into_shared_and_expert_params
import torch
from torch._utils import _flatten_dense_tensors

from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer
from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm
from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE
from deepspeed.utils import logger
from deepspeed.utils.torch import required_torch_version
from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT
from deepspeed.accelerator import get_accelerator
from deepspeed import comm as dist


class FP16_UnfusedOptimizer(DeepSpeedOptimizer):
    """
    FP16 Optimizer without weight fusion to support LAMB optimizer

    For usage example please see, TODO:  DeepSpeed V2 Tutorial
    """

    def __init__(self,
                 init_optimizer,
                 deepspeed=None,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=True,
                 mpu=None,
                 clip_grad=0.0,
                 fused_lamb_legacy=False):

        self.fused_lamb_legacy = fused_lamb_legacy
        self._global_grad_norm = 0.

        if dist.get_rank() == 0:
            logger.info(f'Fused Lamb Legacy : {self.fused_lamb_legacy} ')

        if not get_accelerator().is_available():
            raise SystemError("Cannot use fp16 without accelerator.")
        self.optimizer = init_optimizer

        # param groups
        self.fp16_groups = []
        self.fp32_groups = []

        # loop to deal with groups
        for i, param_group in enumerate(self.optimizer.param_groups):
            # fp16 weights that represent the actual model weights
            self.fp16_groups.append(param_group['params'])

            # create an fp32 copy of the weights that is updated first and then
            # copied back into the fp16 weights
            fp32_group = [p.clone().float().detach() for p in param_group['params']]

            # in case the internal optimizer needs it
            for p in fp32_group:
                p.requires_grad = True

            # point the optimizer's param group at the fp32 copies; these are not
            # the weights used by the model -- the model keeps using the fp16
            # versions stored in fp16_groups
            self.fp32_groups.append(fp32_group)
            param_group['params'] = self.fp32_groups[i]

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            self.cur_iter = 0
            self.last_overflow_iter = -1
            self.scale_factor = 2.0

            if dynamic_loss_args is None:
                self.cur_scale = 1.0 * 2**16
                self.scale_window = 1000
                self.min_loss_scale = 0.25
            else:
                self.cur_scale = dynamic_loss_args[INITIAL_LOSS_SCALE]
                self.scale_window = dynamic_loss_args[SCALE_WINDOW]
                self.min_loss_scale = dynamic_loss_args[MIN_LOSS_SCALE]
        else:
            self.dynamic_loss_scale = False
            self.cur_iter = 0
            self.cur_scale = static_loss_scale

        self.custom_loss_scaler = False
        self.external_loss_scale = None

        self.verbose = verbose

        self.clip_grad = clip_grad
        self.norm_type = 2

        if required_torch_version(max_version=0.4):
            self.clip_grad_norm = torch.nn.utils.clip_grad_norm
        else:
            self.clip_grad_norm = torch.nn.utils.clip_grad_norm_

        self.mpu = mpu

        self.overflow = False
        self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu, deepspeed=deepspeed)

        self.initialize_optimizer_states()

    def zero_grad(self, set_to_none=True):
        """
        Zero FP16 parameter grads.
        """
        # fp32 grads should never exist outside of the step function;
        # for speed, set model fp16 grads to None by default
        for group in self.fp16_groups:
            for p in group:
                if set_to_none:
                    p.grad = None
                else:
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()

    def step_fused_lamb(self, closure=None):
        """
        Not supporting closure.
        """
        # first compute the norm for all groups so we know if there is overflow
        grads_groups_flat = []
        grads_groups = []
        norm_groups = []
        expert_norm_groups = []
        for i, group in enumerate(self.fp16_groups):
            grads = [
                torch.zeros(p.size(), dtype=p.dtype, device=p.device) if p.grad is None else p.grad for p in group
            ]
            grads_groups.append(grads)
            grads_groups_flat.append(_flatten_dense_tensors(grads))
            grads_for_norm, expert_grads_for_norm = split_params_grads_into_shared_and_expert_params(group)
            norm_group_value = 0.0
            if len(grads_for_norm) > 0:
                norm_group_value = get_weight_norm(_flatten_dense_tensors(grads_for_norm), mpu=self.mpu)
            norm_groups.append(norm_group_value)
            expert_norm_group_value = 0.0
            if len(expert_grads_for_norm) > 0:
                expert_norm_group_value = get_weight_norm(_flatten_dense_tensors(expert_grads_for_norm), mpu=self.mpu)
            expert_norm_groups.append(expert_norm_group_value)

        self.overflow = self.overflow_checker.check_using_norm(norm_groups + expert_norm_groups)
        prev_scale = self.cur_scale

        self._update_scale(self.overflow)
        if self.overflow:
            if self.verbose:
                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. "
                            "Attempted loss scale: {}, reducing to {}".format(prev_scale, self.cur_scale))
            return self.overflow

        self._global_grad_norm = get_global_norm(norm_list=norm_groups)
        combined_scale = self.unscale_and_clip_grads(self._global_grad_norm, apply_scale=False)
        self.optimizer.step(grads=grads_groups, output_params=self.fp16_groups, scale=combined_scale)

        for fp32_group, fp16_group in zip(self.fp32_groups, self.fp16_groups):
            for idx, (fp32_param, fp16_param) in enumerate(zip(fp32_group, fp16_group)):
                # remove the fp32 grad
                fp32_param.grad = None
                # copy data from fp32 to fp16
                fp16_param.data.copy_(fp32_param.data)

        return self.overflow

    def set_lr(self, lr):
        """Set the learning rate."""
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = lr

    def get_lr(self):
        """Return the current learning rate."""
        return self.optimizer.param_groups[0]["lr"]

    def override_loss_scale(self, loss_scale):
        if loss_scale != self.external_loss_scale:
            logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}')
        self.custom_loss_scaler = True
        self.external_loss_scale = loss_scale

    def step(self, closure=None):
        """
        Not supporting closure.
        """
        if self.fused_lamb_legacy:
            return self.step_fused_lamb()

        self.overflow = self.overflow_checker.check()
        prev_scale = self.cur_scale

        self._update_scale(self.overflow)
        if self.overflow:
            if self.verbose:
                logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. "
                            "Attempted loss scale: {}, reducing to {}".format(prev_scale, self.cur_scale))
            return self.overflow

        norm_groups = []
        for i, group in enumerate(self.fp16_groups):
            grads_for_norm, _ = split_params_grads_into_shared_and_expert_params(group)
            norm_group_value = 0.0
            if len(grads_for_norm) > 0:
                norm_group_value = get_weight_norm(grads_for_norm, mpu=self.mpu)
            norm_groups.append(norm_group_value)

            # copy gradients to fp32 so the wrapped optimizer works on fp32 parameters
            for fp32_param, fp16_param in zip(self.fp32_groups[i], self.fp16_groups[i]):
                if fp16_param.grad is None:
                    fp32_param.grad = torch.zeros(fp16_param.size(), dtype=fp32_param.dtype, device=fp32_param.device)
                else:
                    fp32_param.grad = fp16_param.grad.to(fp32_param.dtype)

        self._global_grad_norm = get_global_norm(norm_list=norm_groups)
        self.unscale_and_clip_grads(self._global_grad_norm)

        self.optimizer.step()

        for fp32_group, fp16_group in zip(self.fp32_groups, self.fp16_groups):
            for idx, (fp32_param, fp16_param) in enumerate(zip(fp32_group, fp16_group)):
                # remove the fp32 grad
                fp32_param.grad = None
                # copy data from fp32 to fp16
                fp16_param.data.copy_(fp32_param.data)

        return self.overflow

    def unscale_and_clip_grads(self, total_norm, apply_scale=True):
        # compute the combined scale factor for this group
        combined_scale = self.cur_scale
        if self.clip_grad > 0.:
            # norm is in fact norm*scale
            clip = ((total_norm / self.cur_scale) + 1e-6) / self.clip_grad
            if clip > 1:
                combined_scale = clip * self.cur_scale

        if apply_scale:
            for group in self.fp32_groups:
                for param in group:
                    if param.grad is not None:
                        param.grad.data.mul_(1. / combined_scale)

        return combined_scale

    def backward(self, loss, create_graph=False, retain_graph=False):
        """
        :attr:`backward` performs the following steps:

        1. fp32_loss = loss.float()
        2. scaled_loss = fp32_loss*loss_scale
        3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves
        """
        if self.custom_loss_scaler:
            scaled_loss = self.external_loss_scale * loss
            scaled_loss.backward()
        else:
            scaled_loss = (loss.float()) * self.cur_scale
            scaled_loss.backward(create_graph=create_graph, retain_graph=retain_graph)

    def _update_scale(self, skip):
        if self.dynamic_loss_scale:
            prev_scale = self.cur_scale
            if skip:
                self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_loss_scale)
                self.last_overflow_iter = self.cur_iter
                if self.verbose:
                    logger.info("Grad overflow on iteration: %s", self.cur_iter)
                    logger.info(f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}")
            else:
                # ensure self.scale_window updates since last overflow
                stable_interval = (self.cur_iter - self.last_overflow_iter) - 1
                if (stable_interval > 0) and (stable_interval % self.scale_window == 0):
                    self.cur_scale *= self.scale_factor
                    if self.verbose:
                        logger.info(f"No Grad overflow for {self.scale_window} iterations")
                        logger.info(f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}")
        else:
            if skip:
                logger.info("Grad overflow on iteration %s", self.cur_iter)
                logger.info("Using static loss scale of %s", self.cur_scale)
        self.cur_iter += 1
        return

    # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
    def _get_state(self):
        return self.optimizer.state

    def _set_state(self, value):
        self.optimizer.state = value

    state = property(_get_state, _set_state)

    # Promote param_groups so it can be retrieved or set via
    # "fp16_optimizer_instance.param_groups" (for example, to adjust the learning rate)
    def _get_param_groups(self):
        return self.optimizer.param_groups

    def _set_param_groups(self, value):
        self.optimizer.param_groups = value

    param_groups = property(_get_param_groups, _set_param_groups)

    # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
    def _get_loss_scale(self):
        if self.custom_loss_scaler:
            return self.external_loss_scale
        else:
            return self.cur_scale

    def _set_loss_scale(self, value):
        # this class has no LossScaler helper object, so update the raw scale directly
        self.cur_scale = value

    loss_scale = property(_get_loss_scale, _set_loss_scale)

    def state_dict(self):
        """
        Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
        This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
        of the contained Pytorch optimizer.
        Example::
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            torch.save(checkpoint, "saved.pth")
        """
        state_dict = {}
        state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
        state_dict['cur_scale'] = self.cur_scale
        state_dict['cur_iter'] = self.cur_iter
        if state_dict['dynamic_loss_scale']:
            state_dict['last_overflow_iter'] = self.last_overflow_iter
            state_dict['scale_factor'] = self.scale_factor
            state_dict['scale_window'] = self.scale_window
        state_dict[OPTIMIZER_STATE_DICT] = self.optimizer.state_dict()
        state_dict['fp32_groups'] = self.fp32_groups
        return state_dict

    # Refresh fp32 master params from fp16 copies
    def refresh_fp32_params(self):
        for current_group, saved_group in zip(self.fp32_groups, self.fp16_groups):
            for current, saved in zip(current_group, saved_group):
                current.data.copy_(saved.data)

    def load_state_dict(self, state_dict, load_optimizer_states=True):
        """
        Loads a state_dict created by an earlier call to state_dict().
        If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
        whose parameters in turn came from ``model``, it is expected that the user
        will call ``model.load_state_dict()`` before
        ``fp16_optimizer_instance.load_state_dict()`` is called.
        Example::
            model = torch.nn.Linear(D_in, D_out).to(get_accelerator().device_name()).half()
            optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
            optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
            ...
            checkpoint = torch.load("saved.pth")
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        """
        self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
        self.cur_scale = state_dict['cur_scale']
        self.cur_iter = state_dict['cur_iter']
        if state_dict['dynamic_loss_scale']:
            self.last_overflow_iter = state_dict['last_overflow_iter']
            self.scale_factor = state_dict['scale_factor']
            self.scale_window = state_dict['scale_window']

        if load_optimizer_states:
            self.optimizer.load_state_dict(state_dict[OPTIMIZER_STATE_DICT])

        # At this point the wrapped optimizer's hyperparameters and internal state
        # are up to date, but the fp32 master copies it holds are not; restore
        # them from the separately saved fp32 groups (avoids precision loss from
        # rebuilding masters out of the fp16 weights).
        for current_group, saved_group in zip(self.fp32_groups, state_dict['fp32_groups']):
            for current, saved in zip(current_group, saved_group):
                current.data.copy_(saved.data)

    def __repr__(self):
        return repr(self.optimizer)

    def initialize_optimizer_states(self):
        # give every fp16 and fp32 param a zero grad so the wrapped optimizer
        # can allocate its internal state, run one step, then drop the grads
        for i, group in enumerate(self.fp16_groups):
            for param in group:
                param.grad = torch.zeros(param.size(),
                                         dtype=param.dtype,
                                         device=get_accelerator().current_device_name())

        for i, group in enumerate(self.fp32_groups):
            for param in group:
                param.grad = torch.zeros(param.size(),
                                         dtype=param.dtype,
                                         device=get_accelerator().current_device_name())

        self.optimizer.step()

        for i, group in enumerate(self.fp16_groups):
            for param in group:
                param.grad = None

        for i, group in enumerate(self.fp32_groups):
            for param in group:
                param.grad = None
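

# ---------------------------------------------------------------------------
# Usage sketch: a minimal illustration of the backward/step/zero_grad protocol
# this wrapper expects. It assumes a CUDA-like accelerator is present and a
# distributed backend can be initialized; the tiny model, batch shapes, and
# hyperparameters below are hypothetical, chosen only for illustration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import deepspeed

    # the wrapper calls dist.get_rank() in __init__, so a comm backend is needed
    deepspeed.init_distributed()

    device = get_accelerator().device_name()
    model = torch.nn.Linear(16, 4).to(device).half()
    base_optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # With dynamic loss scaling, the scale is divided by scale_factor (down to
    # min_loss_scale) on overflow and multiplied by it after scale_window
    # consecutive overflow-free iterations.
    optimizer = FP16_UnfusedOptimizer(base_optimizer, dynamic_loss_scale=True)

    x = torch.randn(8, 16, dtype=torch.float16, device=device)
    loss = model(x).float().pow(2).mean()

    # backward() multiplies the fp32 loss by the current scale before autograd
    # runs; step() checks for overflow, copies grads to the fp32 masters,
    # unscales and clips them, updates the masters, and copies the result back
    # into the fp16 model weights. It returns True if the step was skipped.
    optimizer.backward(loss)
    overflow = optimizer.step()
    optimizer.zero_grad()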