o
    9wi                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZ er:d dlmZ edejdZG dd	 d	ZG d
d deZdS )    N)TYPE_CHECKINGAnyDictListLiteralOptionalUnion)compare_version)_PATH)Fabrictorchmetricsz1.0.0c                       s\  e Zd ZdZ							d/ded	 d
ededee dee dee de	f fddZ
e dddejdededdf
ddZdejde	fddZdddeddfddZdeeejf deeejf de	fd d!Zdeeejf deeejf de	fd"d#Zd$ejde	fd%d&Zd'ejddfd(d)Zdeeef fd*d+Zd,eeef ddfd-d.Z  ZS )0SpikeDetectiona  Spike Detection Callback.

    Terminates training with a ``TrainingSpikeException`` when a loss-spike was detected and
    saves the batches to skip when resuming to a file.

    We skip the current and the previous batch since it is unclear whether the previous batch
    altered the weights in a way that it causes the spike or just the current batch is corrupted somehow.

    Args:
        mode: Whether to minimize or maximize the tracked metric
        window: A running mean of metrics with ``window`` size. Serves as reference value for spikes.
        warmup: After how many batches spike-tracking should start
        atol: An absolute tolerance.  Every diff between the running mean and the current value,
            that's not an improvement and above ``atol`` will be considered a spike
        rtol: A relative tolerance. Every diff between the running mean and the current value,
            that's higher than ``rtol * running_mean`` is considered a spike
        exclude_batches_path: Where to save the file that contains the batches to exclude.
            Will default to current directory.
        finite_only: If set to ``False``, consider non-finite values like NaN, inf and -inf a spike as well.

    min
      N       @Tmode)r   maxwindowwarmupatolrtolexclude_batches_pathfinite_onlyc           
         s   t rddlm} ddlm}	 ntdt   d| _|	|ddd|d| _	d| j	_
d| j	_|| _|| _|| _|| _g | _|| _|| _d S )	Nr   )
MeanMetric)RunningzISpikeDetection requires torchmetrics>=1.0.0! Please upgrade your version!        F)dist_sync_on_stepsync_on_compute)r   )!_TORCHMETRICS_GREATER_EQUAL_1_0_0torchmetrics.aggregationr   torchmetrics.wrappersr   RuntimeErrorsuper__init__last_valrunning_meanr   r   r   r   r   r   bad_batchesr   r   )
selfr   r   r   r   r   r   r   r   r   	__class__ ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lightning/fabric/utilities/spike.pyr$   )   s    


zSpikeDetection.__init__fabricr   lossbatch	batch_idxreturnc                 C   s   |dkr| j |jj | jdu rt | _t| jds'tj	
| jd| _t|| jko1| |}|j  |jj|dd}|rJ| || dS | jp[|jjtt| dd}|re| | dS dS )z)Checks if we currently have a loss-spike.r   Nz.jsonzskip_batches.jsonF)allT)r&   tostrategyroot_devicer   osgetcwdstrendswithpathjoinboolr   	_is_spikebarrierreduce_boolean_decision_handle_spiker   torchisfiniter2   _update_stats)r(   r-   r.   r/   r0   is_spikeis_spike_globalis_finite_allr+   r+   r,   on_train_batch_endI   s"   


z!SpikeDetection.on_train_batch_endc                 C   s   t   t d | j }W d    n1 sw   Y  || j }| jr-t|s-dS | 	|r4dS | 
||o?| ||S )NignoreTF)warningscatch_warningssimplefilterr&   computer%   r   rA   rB   
_is_better_check_atol_check_rtol)r(   r.   running_val	curr_diffr+   r+   r,   r=   d   s   



zSpikeDetection._is_spikec                 C   s   | j |d |g |jdkrA| jd usJ tjtj| jdd t| jd}t	j
| j |dd W d    n1 s<w   Y  t|d)	Nr   r   T)exist_okw   )indent)r0   )r'   extendglobal_rankr   r6   makedirsr:   dirnameopenjsondumpTrainingSpikeException)r(   r-   r0   fr+   r+   r,   r@   t   s   

zSpikeDetection._handle_spikeval_aval_bc                 C   s$   | j d u ptt|| t| j kS N)r   r<   absr(   r_   r`   r+   r+   r,   rN         $zSpikeDetection._check_atolc                 C   s(   | j d u ptt|| t| j | kS ra   )r   r<   rb   rc   r+   r+   r,   rO      s   (zSpikeDetection._check_rtoldiff_valc                 C   sD   | j dkrt|dk S | j dkrt|dk S td| j  )Nr   r   r   r   z*Invalid mode. Has to be min or max, found )r   r<   r2   
ValueError)r(   re   r+   r+   r,   rM      s
   

zSpikeDetection._is_bettervalc                 C   s   | j | || _d S ra   )r&   updater%   )r(   rg   r+   r+   r,   rC      s   
zSpikeDetection._update_statsc              
   C   sL   t | jtjr| j n| j| j| j| j| j| j	| j
| j | jj d	S )N)	r%   r   r   r   r   r'   bad_batches_pathrunningmean)
isinstancer%   rA   Tensoritemr   r   r   r   r'   r   r&   
state_dictbase_metric)r(   r+   r+   r,   ro      s   
zSpikeDetection.state_dictro   c                 C   s~   | d| _| d| _| d| _| d| _| d| _| d| _| d| _| j	| d | j
j	| d	 d S )
Nr%   r   r   r   r   r'   ri   rj   rk   )popr%   r   r   r   r   r'   r   rj   load_state_dictr&   rp   )r(   ro   r+   r+   r,   rr      s   zSpikeDetection.load_state_dict)r   r   r   Nr   NT)__name__
__module____qualname____doc__r   intr   floatr
   r<   r$   rA   no_gradrm   r   rG   r=   r@   r   rN   rO   rM   rC   r   r8   ro   rr   __classcell__r+   r+   r)   r,   r      sD     "**"r   c                       s.   e Zd ZdZdededef fddZ  ZS )r]   z,Exception to be raised with Training Spikes.r0   argskwargsc                    s$   t  jd| g|R i | d S )Nz!Training spike detected in batch )r#   r$   )r(   r0   r{   r|   r)   r+   r,   r$      rd   zTrainingSpikeException.__init__)rs   rt   ru   rv   rw   r   r$   rz   r+   r+   r)   r,   r]      s    "r]   )r[   operatorr6   rI   typingr   r   r   r   r   r   r   rA    lightning_utilities.core.importsr	    lightning.fabric.utilities.typesr
   lightning.fabric.fabricr   ger   r   r"   r]   r+   r+   r+   r,   <module>   s    $ 