o
    }oiu                     @   sD   d dl Z d dlZd dlZd dlmZ d dlmZ G dd deZdS )    N)Callback)loggingc                   @   sL   e Zd ZdZdddZedd Zdd Zd	d
 Zde	fddZ
dd ZdS )PreemptionCallbackaH  
    PreemptionCallback class creates a callback that checks for preemption during training at the end of every step.
    Upon preemption the callback provides a function to gracefully exit the training immediately and also saves the current state in a checkpoint as *last.ckpt.
    (to be able to start from the same step without wasting any compute while resuming the next time).

    PreemptionCallback is always enabled by default via the arg create_preemption_callback under ExpManagerConfig. To disable please pass
    create_preemption_callback: False in your config file.
    Nc                 C   s(   || _ | j d u rtj| _ || _d| _d S )NF)sigsignalSIGTERMcheckpoint_callbackpreemption_enabled)selfr   r    r   S/home/ubuntu/.local/lib/python3.10/site-packages/nemo/utils/callbacks/preemption.py__init__"   s
   

zPreemptionCallback.__init__c                 C   s8   t j| jt j t jd}t j|d t|	 }|S )N)devicedtyper   )
torchtensor_interruptedcudacurrent_deviceint32distributed	broadcastboolitem)r
   interruptedr   r   r   r   )   s   zPreemptionCallback.interruptedc                    s   t j r
t j std  S d _d _d _t	
 j _ fdd} fdd}t j  _ jdkrAt		 j|  S t		 j|  S )	z
        Defines custom handlers at the beginning of training to be executed when the
        preemption signal is received.
        zMPreemption requires torch distributed to be initialized, disabling preemptionTFc                    s       d _d S )NT)releaser   signumframer
   r   r   master_handlerA   s   
z9PreemptionCallback.on_train_start.<locals>.master_handlerc                    s       d S N)r   r   r   r   r   ignoring_handlerF   s   z;PreemptionCallback.on_train_start.<locals>.ignoring_handlerr   )r   r   is_availableis_initializedr   infor	   r   releasedr   	getsignalr   original_handlerget_rankprivate_rank)r
   trainer	pl_moduler    r"   r   r   r   on_train_start0   s   

z!PreemptionCallback.on_train_startc                 C   s   | j r	|   d S d S r!   )r	   r   )r
   r+   r,   r   r   r   on_train_endQ   s   zPreemptionCallback.on_train_end	batch_idxc                 C   sJ   | j r!| j}|r#td | j|}| j|| td d S d S d S )Nz/Received SIGTERM, saving checkpoint and exitingr   )	r	   r   r   r%   r   _monitor_candidates_save_last_checkpointsysexit)r
   r+   r,   outputsbatchr/   r   monitor_candidatesr   r   r   on_train_batch_endU   s   
z%PreemptionCallback.on_train_batch_endc                 C   s$   | j rdS t| j| j d| _ dS )NFT)r&   r   r   r(   r   r   r   r   r   c   s
   zPreemptionCallback.releaser!   )__name__
__module____qualname____doc__r   propertyr   r-   r.   intr7   r   r   r   r   r   r      s    
	
!r   )r   r2   r   lightning.pytorch.callbacksr   
nemo.utilsr   r   r   r   r   r   <module>   s   