"""
Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
Taken and modified for DeepSpeed from:
    https://github.com/NVIDIA/Megatron-LM/blob/master/fp16/loss_scaler.py
Commit: 93ab4bea59dc5cbf97c079d313741866af4deac9
    N)comm)logger
init_scalescale_windowdelayed_shiftconsecutive_hysteresis	min_scalec                 C   s   t | dr	|  S | d S )Nitemr   )hasattrr	   )t r   V/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.pyto_python_float$   s   
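
# Illustrative sketch (not part of the upstream module): the constants above are the
# expected keys of the ``dynamic_loss_args`` dict that ``CreateLossScaler`` (defined at
# the bottom of this file) forwards to ``DynamicLossScaler``. The values shown are
# hypothetical.
#
#   dynamic_loss_args = {
#       INITIAL_LOSS_SCALE: 2**16,
#       SCALE_WINDOW: 1000,
#       DELAYED_SHIFT: 2,
#       MIN_LOSS_SCALE: 1,
#   }
#   scaler = CreateLossScaler(torch.half, static_loss_scale=1.0,
#                             dynamic_scaling=True, dynamic_loss_args=dynamic_loss_args)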
r   c                   @   s>   e Zd ZdZdd Zedd Zdd Zdd	 ZdddZ	dS )LossScalerBasez4LossScalarBase
    Base class for a loss scaler
    c                 C   s   || _ d| _d S NF)	cur_scaledynamic)selfr   r   r   r   __init__/   s   
zLossScalerBase.__init__c                 C   s   | j S N)r   r   r   r   r   
loss_scale3   s   zLossScalerBase.loss_scalec                    s   t  fdd|D S )Nc                 3   s    | ]} j | V  qd S r   )r   ).0gr   r   r   	<genexpr>8   s    z0LossScalerBase.scale_gradient.<locals>.<genexpr>)tuple)r   modulegrad_ingrad_outr   r   r   scale_gradient7   s   zLossScalerBase.scale_gradientc                 C   s   d S r   r   )r   overflowr   r   r   update_scale:      zLossScalerBase.update_scaleFc                 C   s   || j  }|j|d d S )N)retain_graph)r   backward)r   lossr#   scaled_lossr   r   r   r$   =   s   
zLossScalerBase.backwardN)F)
__name__
__module____qualname____doc__r   propertyr   r   r!   r$   r   r   r   r   r   *   s    
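
# Illustrative sketch (not part of the upstream module): every scaler multiplies the
# loss by ``cur_scale`` before backpropagation, so gradients come out scaled by the
# same factor and must be divided by ``loss_scale`` before the optimizer step.
# ``loss`` below is a placeholder tensor that requires grad.
#
#   scaler = LossScalerBase(cur_scale=256.0)
#   scaler.backward(loss)                       # backpropagates loss * 256
#   # grads now hold d(loss)/dw * 256; unscale before stepping, e.g.:
#   #   p.grad.data.mul_(1.0 / scaler.loss_scale)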
r   c                       s2   e Zd ZdZd	 fdd	Zdd Zdd Z  ZS )

LossScalerau  
    Class that manages a static loss scale.  This class is intended to interact with
    :class:`FP16_Optimizer`, and should not be directly manipulated by the user.

    Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
    :class:`FP16_Optimizer`'s constructor.

    Args:
        scale (float, optional, default=1.0):  The loss scale.
    """

    def __init__(self, scale=1):
        super(LossScaler, self).__init__(scale)

    # `params` is an iterable of parameters (torch.Tensor)
    def has_overflow(self, params):
        return False

    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
        return False
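
# Illustrative sketch (not part of the upstream module): a static scaler never reports
# overflow and never changes its scale, so ``update_scale`` is the no-op inherited from
# the base class. ``loss`` is a placeholder tensor.
#
#   scaler = LossScaler(scale=128)
#   scaler.backward(loss)          # backpropagates loss * 128
#   scaler.update_scale(False)     # no-op: the scale stays at 128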
    r,   c                       sL   e Zd ZdZdddddddejf fdd		Zd
d Zdd Zdd Z	  Z
S )DynamicLossScalera  
    Class that manages dynamic loss scaling.  It is recommended to use :class:`DynamicLossScaler`
    indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
    :class:`FP16_Optimizer`.  However, it's important to understand how :class:`DynamicLossScaler`
    operates, because the default options can be changed using the
    ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.

    Loss scaling is designed to combat the problem of underflowing gradients encountered
    late in the training of fp16 networks.  Dynamic loss scaling begins by attempting a very high loss
    scale.  Ironically, this may result in OVERflowing gradients.  If overflowing gradients are
    encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
    occurred.
    :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
    and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
    If a certain number of iterations occur without overflowing gradients detected,
    :class:`DynamicLossScaler` increases the loss scale once more.
    In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
    always using the highest loss scale possible without incurring overflow.

    Args:
        init_scale (float, optional, default=2**32):  Initial loss scale attempted by :class:`DynamicLossScaler`.
        scale_factor (float, optional, default=2.0):  Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to ``loss_scale``/``scale_factor``.  If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to ``loss_scale``*``scale_factor``.
        scale_window (int, optional, default=1000):  Number of consecutive iterations without an overflow to wait before increasing the loss scale.
        consecutive_hysteresis (bool, optional, default=False): Whether to refill hysteresis if we reach an iteration that doesn't overflow
        min_scale (float, optional, default=1):  Lower bound below which the loss scale is never reduced.
        delayed_shift (int, optional, default=1):  Hysteresis; number of consecutive overflows tolerated before the loss scale is actually reduced.
        raise_error_at_min_scale (bool, optional, default=True):  Whether to raise an exception when an overflow occurs while already at ``min_scale``.
        dtype (torch.dtype, optional, default=torch.half):  Training dtype; detailed scale-change messages are only appended to the log for ``torch.half``.
    """

    def __init__(self,
                 init_scale=2**32,
                 scale_factor=2.,
                 scale_window=1000,
                 min_scale=1,
                 delayed_shift=1,
                 consecutive_hysteresis=False,
                 raise_error_at_min_scale=True,
                 dtype=torch.half):
        super(DynamicLossScaler, self).__init__(init_scale)
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.min_scale = min_scale
        self.delayed_shift = delayed_shift
        self.cur_hysteresis = delayed_shift
        self.consecutive_hysteresis = consecutive_hysteresis
        self.raise_error_at_min_scale = raise_error_at_min_scale
        self.dynamic = True
        self.dtype = dtype
zDynamicLossScaler.__init__c                 C   s,   |D ]}|j d ur| |j jr dS qdS )NTF)gradr6   data)r   r3   pr   r   r   has_overflow_serial   s
   z%DynamicLossScaler.has_overflow_serialc              
   C   sr   z
t |    }W n ty% } zd|jd vr W Y d }~dS d }~ww |t dt d fv s5||kr7dS dS )Nzvalue cannot be convertedr   TinfF)floatsumRuntimeErrorargs)r5   cpu_suminstancer   r   r   r6      s   z!DynamicLossScaler._has_inf_or_nanc                 C   s  |r| j dks| jdkrO| j| jkr| jrtdt| j| j | j}t	 dkrKdt	  d}| j
tjkrF|dt| j dt| 7 }t| || _n4t	 dkr|dt	  d}| j
tjkrw|dt| j d| j d	| jd  7 }t| |  jd8  _| j| _n2| jrt	 dkrd
| j  }t| | j | _| j| j | j dkr| js| j | _|  j| j9  _|  jd7  _d S )Nr-   zSCurrent loss scale already at minimum - cannot decrease scale anymore. Exiting run.r   z[deepspeed] OVERFLOW! Rank z Skipping step.z Attempted loss scale: z, reducing to z, but hysteresis is z. Reducing hysteresis to z;Consecutive hysteresis is enabled. Restoring hysteresis to )r   r=   r   r   r>   	Exceptionmaxr<   distget_rankr?   torchhalfintr   infor:   r;   r   r   )r   r    
next_scaleoverflow_msghysteresis_msgr   r   r   r!      s<   
(


zDynamicLossScaler.update_scale)r'   r(   r)   r*   rO   rP   r   rC   r6   r!   r7   r   r   r0   r   r8   [   s    r8   c                 C   sN   | t jkr|r|d u rt| dS tdd| i|S | t jkr |nd}t|dS )N)r?   r?   g      ?)r/   r   )rO   rP   r8   r,   )r?   static_loss_scaledynamic_scalingdynamic_loss_argsloss_scale_valuer   r   r   CreateLossScaler   s   

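
# Illustrative driver sketch (not part of the upstream module): how a training loop
# might use ``CreateLossScaler``. ``model``, ``optimizer``, ``loss_fn``, ``inputs`` and
# ``targets`` are placeholders, and a real run would typically let FP16_Optimizer handle
# gradient unscaling and the overflow check instead of doing it by hand.
#
#   scaler = CreateLossScaler(dtype=torch.half, static_loss_scale=1.0,
#                             dynamic_scaling=True, dynamic_loss_args=None)
#   loss = loss_fn(model(inputs), targets)
#   scaler.backward(loss)
#   overflow = any(not torch.isfinite(p.grad).all()
#                  for p in model.parameters() if p.grad is not None)
#   scaler.update_scale(overflow)
#   if not overflow:
#       for p in model.parameters():
#           if p.grad is not None:
#               p.grad.div_(scaler.loss_scale)
#       optimizer.step()
#   optimizer.zero_grad()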