o
    }oi                     @   s~   d dl Z d dlZd dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ G dd	 d	eeZdS )
    N)Optional)Callback)Trainer)IOMixin)logging)AsyncFinalizableCheckpointIOc                   @   s   e Zd ZdZddee fddZdeddfdd	Zded
eddfddZ	deddfddZ
ded
eddfddZejdd ZedefddZdefddZedefddZdS )PreemptionCallbacka  
    PreemptionCallback checks for preemption during training at the end of every step.
    Upon preemption, it signals the trainer to stop gracefully.

    Args:
        sig (int, optional): The signal to listen for. Defaults to signal.SIGTERM.

    Example:
        >>> from nemo.lightning.pytorch.callbacks import PreemptionCallback
        >>> callback = PreemptionCallback()
        >>> trainer = Trainer(callbacks=[callback])
    Nsigc                 C   s*   |d ur|nt j| _d| _d | _d | _d S )NF)signalSIGTERMr	   _interrupted_handler_context_preemption_supported)selfr	    r   _/home/ubuntu/.local/lib/python3.10/site-packages/nemo/lightning/pytorch/callbacks/preemption.py__init__+   s   
zPreemptionCallback.__init__trainerreturnc                 C   s"   | j r|  | _| j  d S d S N)preemption_supported_preemption_handlerr   	__enter__r   r   	pl_moduler   r   r   on_train_start1   s   
z!PreemptionCallback.on_train_start	batch_idxc                 C   s6   | j s|  | _| j r|  | _| j  d S d S d S r   )r   _check_preemption_supportr   r   r   r   )r   r   r   batchr   r   r   r   on_train_batch_start6   s   

z'PreemptionCallback.on_train_batch_startc                 C   s   | j r| j d d d  d S d S r   )r   __exit__r   r   r   r   on_train_end=   s   zPreemptionCallback.on_train_endc                 C   st   | j r6td d|_|jr8|j|}|j|| t|jj	t
r/td |jj	jdd td d S d S d S )Nz2Preemption detected, saving checkpoint and exitingTz8Async checkpointing detected, waiting for it to complete)blockingr   )interruptedr   infoshould_stopcheckpoint_callback_monitor_candidates_save_last_checkpoint
isinstancestrategycheckpoint_ior   maybe_finalize_save_checkpointsysexit)r   r   r   outputsr   r   monitor_candidatesr   r   r   on_train_batch_endA   s   

z%PreemptionCallback.on_train_batch_endc              
   #   s     j std d V  d S t j} fdd}dd }ztj }t j|dkr.|n| d V  W t j| d S t j| w )NzSPreemption requires torch distributed to be initialized, preemption may be disabledc                    s   t d|  d d _d S )NReceived signal z, initiating graceful stopT)r   r$   r   signumframer   r   r   master_handlerV   s   
z>PreemptionCallback._preemption_handler.<locals>.master_handlerc                 S   s   t d|  d d S )Nr2   z on non-master rank, ignoring)r   debugr3   r   r   r   ignoring_handlerZ   s   z@PreemptionCallback._preemption_handler.<locals>.ignoring_handlerr   )	r   r   warningr
   	getsignalr	   torchdistributedget_rank)r   original_handlerr7   r9   private_rankr   r6   r   r   M   s   

"z&PreemptionCallback._preemption_handlerc                 C   s   | j d u r
|  | _ | j S r   )r   r   r6   r   r   r   r   d   s   

z'PreemptionCallback.preemption_supportedc                 C   s   t j o	t j S r   )r<   r=   is_availableis_initializedr6   r   r   r   r   j   s   z,PreemptionCallback._check_preemption_supportc                 C   s>   | j sdS tj| jtj tjd}tj|d t	|
 S )NF)devicedtyper   )r   r<   tensorr   cudacurrent_deviceint32r=   	broadcastboolitem)r   r#   r   r   r   r#   m   s
   zPreemptionCallback.interruptedr   )__name__
__module____qualname____doc__r   intr   r   r   r   r!   r1   
contextlibcontextmanagerr   propertyrJ   r   r   r#   r   r   r   r   r      s    
r   )rQ   r
   r-   typingr   r<   lightning.pytorch.callbacksr   !lightning.pytorch.trainer.trainerr   nemo.lightning.io.mixinr   
nemo.utilsr   !nemo.utils.callbacks.dist_ckpt_ior   r   r   r   r   r   <module>   s   