o
    ci                     @   s   d dl Z d dlmZmZ d dlmZmZmZmZm	Z	 G dd de
ZG dd deZG dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    N)ListOptional)$DEFAULT_WORKER_GROUP_START_TIMEOUT_S%DEFAULT_WORKER_HEALTH_CHECK_TIMEOUT_S REPORT_BARRIER_TIMEOUT_S_ENV_VAR$WORKER_GROUP_START_TIMEOUT_S_ENV_VAR%WORKER_HEALTH_CHECK_TIMEOUT_S_ENV_VARc                   @      e Zd ZdZdS )RayTrainErrorz(Base class for all Ray Train exceptions.N__name__
__module____qualname____doc__ r   r   U/home/ubuntu/.local/lib/python3.10/site-packages/ray/train/v2/_internal/exceptions.pyr
          r
   c                       s    e Zd ZdZ fddZ  ZS )WorkerHealthCheckTimeoutErrorzBException raised when a worker health check hangs for long enough.c                    s2   t tt}|dt d| d7 }t | d S )Nz	
Set the z> environment variable to increase the timeout (current value: z
 seconds).)osgetenvr   r   super__init__)selfmessagetimeout	__class__r   r   r      s   z&WorkerHealthCheckTimeoutError.__init__)r   r   r   r   r   __classcell__r   r   r   r   r      s    r   c                       s6   e Zd ZdZdef fddZdd Zdd Z  ZS )	WorkerHealthCheckFailedErrorz2Exception raised when a worker health check fails.failurec                    s   t  | || _|| _d S N)r   r   _messagehealth_check_failure)r   r   r   r   r   r   r   $   s   
z%WorkerHealthCheckFailedError.__init__c                 C      | j | j| jffS r    )r   r!   r"   r   r   r   r   
__reduce__)      z'WorkerHealthCheckFailedError.__reduce__c                 C   s   | j d t| j S )N
)r!   strr"   r$   r   r   r   __str__,   s   z$WorkerHealthCheckFailedError.__str__)	r   r   r   r   	Exceptionr   r%   r)   r   r   r   r   r   r   !   s
    r   c                       s.   e Zd ZdZdef fddZdd Z  ZS )WorkerGroupStartupTimeoutErrora1  Exception raised when the worker group startup times out.

    Example scenario: 4 GPUs are detected in the cluster, but when the worker
    are actually scheduled, one of the nodes goes down and only 3 GPUs are
    available. One of the worker tasks may be stuck pending, until a timeout is reached.
    num_workersc              	      s<   t tjtt}|| _t d| d| dt d d S )Nz)The worker group startup timed out after z seconds waiting for a   workers. Potential causes include: (1) temporary insufficient cluster resources while waiting for autoscaling (ignore this warning in this case), (2) infeasible resource request where the provided `ScalingConfig` cannot be satisfied), and (3) transient network issues. Set the z. environment variable to increase the timeout.)	floatr   environgetr   r   r,   r   r   )r   r,   r   r   r   r   r   8   s   z'WorkerGroupStartupTimeoutError.__init__c                 C   s   | j | jffS r    )r   r,   r$   r   r   r   r%   M   s   z)WorkerGroupStartupTimeoutError.__reduce__)r   r   r   r   intr   r%   r   r   r   r   r   r+   0   s    r+   c                   @   r	   )WorkerGroupStartupFailedErrorzException raised when the worker group fails to start.

    Example scenario: A worker is scheduled onto a node that dies while
    the worker actor is initializing.
    Nr   r   r   r   r   r1   Q   r   r1   c                   @   r	   )$CheckpointManagerInitializationErrora  Exception raised when the checkpoint manager fails to initialize from a snapshot.

    Example scenarios:
    1. The checkpoint manager snapshot version is old and
        incompatible with the current version of Ray Train.
    2. The checkpoint manager snapshot JSON file is corrupted.
    3. The checkpoint manager snapshot references checkpoints that cannot be found
        in the run storage path.
    Nr   r   r   r   r   r2   Y   r   r2   c                   @   r	   )CollectiveTimeoutErrorzhException raised when an internal Ray Train collective operation of
    the worker group times out.
    Nr   r   r   r   r   r3   e   r   r3   c                       s>   e Zd ZdZdee dee def fddZdd Z	  Z
S )	BroadcastCollectiveTimeoutErrora  Exception raised when the broadcast operation times out.

    There are two main timeout examples:
    1. If not all workers call `ray.train.report`, the entire worker group will
        hang until the timeout before raising. This prevents indefinite worker
        group hangs.
    2. If a worker is slow in the training loop and fails to reach the broadcast
        time, the collective will time out.
    time_elapsedmissing_ranks	timeout_sc              	      sD   || _ || _|| _d|dd| dt d|dd	}t | d S )Nz(The broadcast operation timed out after z.2fzn seconds. Please make sure all worker ranks call `ray.train.report`. 
The following ranks have not called it: z#
You can set this timeout with the z& environment variable (current value: z s).)_time_elapsed_missing_ranks
_timeout_sr   r   r   )r   r5   r6   r7   r   r   r   r   r   v   s   
z(BroadcastCollectiveTimeoutError.__init__c                 C   s   | j | j| j| jffS r    )r   r8   r9   r:   r$   r   r   r   r%      s   z*BroadcastCollectiveTimeoutError.__reduce__)r   r   r   r   r   r-   r   r0   r   r%   r   r   r   r   r   r4   k   s    
r4   c                   @   s2   e Zd ZdZdedefddZdd Zdd	 Zd
S )UserExceptionWithTracebacka"  This class wraps a user code exception raised on the worker
    with its original traceback string, for logging and debugging purposes.

    This is needed because the original exception traceback is not serialized
    with the exception when it is *returned* back to the main process.
    exctraceback_strc                 C   s   || _ || _d S r    )	_base_exc_traceback_str)r   r<   r=   r   r   r   r      s   
z#UserExceptionWithTraceback.__init__c                 C   r#   r    )r   r>   r?   r$   r   r   r   r%      r&   z%UserExceptionWithTraceback.__reduce__c                 C   s   | j S r    )r?   r$   r   r   r   r)      s   z"UserExceptionWithTraceback.__str__N)	r   r   r   r   BaseExceptionr(   r   r%   r)   r   r   r   r   r;      s
    r;   )r   typingr   r    ray.train.v2._internal.constantsr   r   r   r   r   r*   r
   r   r   r+   r1   r2   r3   r4   r;   r   r   r   r   <module>   s    
!"