o
    8wi                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
Z
d dlmZ d dlmZ er6d dlmZ edejdZG dd	 d	ZG d
d deZdS )    N)TYPE_CHECKINGAnyLiteralOptionalUnion)compare_version)_PATH)Fabrictorchmetricsz1.0.0c                       s\  e Zd ZdZ							d/ded	 d
ededee dee dee de	f fddZ
e dddejdededdf
ddZdejde	fddZdddeddfddZdeeejf deeejf de	fd d!Zdeeejf deeejf de	fd"d#Zd$ejde	fd%d&Zd'ejddfd(d)Zdeeef fd*d+Zd,eeef ddfd-d.Z  ZS )0SpikeDetectiona  Spike Detection Callback.

    Terminates training with a ``TrainingSpikeException`` when a loss-spike was detected and
    saves the batches to skip when resuming to a file.

    We skip the current and the previous batch since it is unclear whether the previous batch
    altered the weights in a way that it causes the spike or just the current batch is corrupted somehow.

    Args:
        mode: Whether to minimize or maximize the tracked metric
        window: A running mean of metrics with ``window`` size. Serves as reference value for spikes.
        warmup: After how many batches spike-tracking should start
        atol: An absolute tolerance.  Every diff between the running mean and the current value,
            that's not an improvement and above ``atol`` will be considered a spike
        rtol: A relative tolerance. Every diff between the running mean and the current value,
            that's higher than ``rtol * running_mean`` is considered a spike
        exclude_batches_path: Where to save the file that contains the batches to exclude.
            Will default to current directory.
        finite_only: If set to ``False``, consider non-finite values like NaN, inf and -inf a spike as well.

    min
      N       @Tmode)r   maxwindowwarmupatolrtolexclude_batches_pathfinite_onlyc           
         s   t rddlm} ddlm}	 ntdt   d| _|	|ddd|d| _	d| j	_
d| j	_|| _|| _|| _|| _g | _|| _|| _d S )	Nr   )
MeanMetric)RunningzJSpikeDetection requires `torchmetrics>=1.0.0` Please upgrade your version.        F)dist_sync_on_stepsync_on_compute)r   )!_TORCHMETRICS_GREATER_EQUAL_1_0_0torchmetrics.aggregationr   torchmetrics.wrappersr   RuntimeErrorsuper__init__last_valrunning_meanr   r   r   r   r   r   bad_batchesr   r   )
selfr   r   r   r   r   r   r   r   r   	__class__ ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lightning_fabric/utilities/spike.pyr"   )   s    


zSpikeDetection.__init__fabricr	   lossbatch	batch_idxreturnc                 C   s   |dkr| j |jj | jdu rt | _t| jds'tj	
| jd| _t|| jko1| |}|j  |jj|dd}|rJ| || dS | jp[|jjtt| dd}|re| | dS dS )z)Checks if we currently have a loss-spike.r   Nz.jsonzskip_batches.jsonF)allT)r$   tostrategyroot_devicer   osgetcwdstrendswithpathjoinboolr   	_is_spikebarrierreduce_boolean_decision_handle_spiker   torchisfiniter0   _update_stats)r&   r+   r,   r-   r.   is_spikeis_spike_globalis_finite_allr)   r)   r*   on_train_batch_endI   s"   


z!SpikeDetection.on_train_batch_endc                 C   s   t   t d | j }W d    n1 sw   Y  || j }| jr-t|s-dS | 	|r4dS | 
||o?| ||S )NignoreTF)warningscatch_warningssimplefilterr$   computer#   r   r?   r@   
_is_better_check_atol_check_rtol)r&   r,   running_val	curr_diffr)   r)   r*   r;   d   s   



zSpikeDetection._is_spikec                 C   s   | j |d |g |jdkrA| jd usJ tjtj| jdd t| jd}t	j
| j |dd W d    n1 s<w   Y  t|d)	Nr   r   T)exist_okw   )indent)r.   )r%   extendglobal_rankr   r4   makedirsr8   dirnameopenjsondumpTrainingSpikeException)r&   r+   r.   fr)   r)   r*   r>   t   s   

zSpikeDetection._handle_spikeval_aval_bc                 C   s$   | j d u ptt|| t| j kS N)r   r:   absr&   r]   r^   r)   r)   r*   rL         $zSpikeDetection._check_atolc                 C   s(   | j d u ptt|| t| j | kS r_   )r   r:   r`   ra   r)   r)   r*   rM      s   (zSpikeDetection._check_rtoldiff_valc                 C   sD   | j dkrt|dk S | j dkrt|dk S td| j  )Nr   r   r   r   z*Invalid mode. Has to be min or max, found )r   r:   r0   
ValueError)r&   rc   r)   r)   r*   rK      s
   

zSpikeDetection._is_bettervalc                 C   s   | j | || _d S r_   )r$   updater#   )r&   re   r)   r)   r*   rA      s   
zSpikeDetection._update_statsc              
   C   sL   t | jtjr| j n| j| j| j| j| j| j	| j
| j | jj d	S )N)	r#   r   r   r   r   r%   bad_batches_pathrunningmean)
isinstancer#   r?   Tensoritemr   r   r   r   r%   r   r$   
state_dictbase_metric)r&   r)   r)   r*   rm      s   
zSpikeDetection.state_dictrm   c                 C   s~   | d| _| d| _| d| _| d| _| d| _| d| _| d| _| j	| d | j
j	| d	 d S )
Nr#   r   r   r   r   r%   rg   rh   ri   )popr#   r   r   r   r   r%   r   rh   load_state_dictr$   rn   )r&   rm   r)   r)   r*   rp      s   zSpikeDetection.load_state_dict)r   r   r   Nr   NT)__name__
__module____qualname____doc__r   intr   floatr   r:   r"   r?   no_gradrk   r   rE   r;   r>   r   rL   rM   rK   rA   dictr6   rm   rp   __classcell__r)   r)   r'   r*   r      sD     "**"r   c                       s.   e Zd ZdZdededef fddZ  ZS )r[   z,Exception to be raised with Training Spikes.r.   argskwargsc                    s$   t  jd| g|R i | d S )Nz!Training spike detected in batch )r!   r"   )r&   r.   rz   r{   r'   r)   r*   r"      rb   zTrainingSpikeException.__init__)rq   rr   rs   rt   ru   r   r"   ry   r)   r)   r'   r*   r[      s    "r[   )rY   operatorr4   rG   typingr   r   r   r   r   r?    lightning_utilities.core.importsr    lightning_fabric.utilities.typesr   lightning_fabric.fabricr	   ger   r   r    r[   r)   r)   r)   r*   <module>   s     