o
    wOi%                     @   s   d dl Z d dlmZmZmZmZmZ G dd deZG dd deZ	G dd deZ
G d	d
 d
eZG dd de jZG dd dZeegef ZG dd dZdS )    N)AnyCallableDictOptionalTuplec                   @      e Zd ZdZdS )RendezvousExceptionz=
    Represents the base type for rendezvous exceptions.
    N__name__
__module____qualname____doc__ r   r   O/home/ubuntu/.local/lib/python3.10/site-packages/torchelastic/rendezvous/api.pyr      s    r   c                   @   r   )RendezvousClosedExceptionzo
    Raised when a rendezvous is closed.

    This is used to signal completion to nodes that arrive late.
    Nr	   r   r   r   r   r      s    r   c                   @   r   )RendezvousTimeoutExceptionz
    Raised to signal that a rendezvous did not succeed within the allocated
    time.

    This is a non-retryable type of failure.
    Nr	   r   r   r   r   r      s    r   c                   @   r   )RendezvousNonRetryableErrorzf
    Raised when a failure occured that should not be retried within the same
    worker process.
    Nr	   r   r   r   r   r   (   s    r   c                   @   s   e Zd ZdZejdefddZejdede	e	f fddZ
ejdefdd	Zejd
d Zejde	fddZejdefddZdefddZdS )RendezvousHandlera  
    Main rendezvous interface.

    .. note:: torchelastic users normally **do not** need to implement their
              own ``RendezvousHandler``. An implementation based on
              `etcd <https://etcd.io/>`__ is already provided, and is recommended
              for most users, provided they can deploy it in their environment.

    .. warning:: torchelastic is currently considered experimental,
                 so the APIs may change!
    returnc                 C      dS )zM
        Return the string representation of the rendezvous handler.
        Nr   selfr   r   r   get_backend>      zRendezvousHandler.get_backendztorch.distributed.Storec                 C   r   )a  
        Main entry-point into the rendezvous barrier.
        Blocks until the rendezvous is complete (and the current
        process is included in the formed worker group), or a timeout occurs, or
        rendezvous was marked closed.

        Returns: a tuple of (``c10d Store``, ``rank``, ``world size``)

        Raises:
            RendezvousClosedException - if rendezvous for the current
               job is closed.
            RendezvousTimeoutException - on timeout
        Nr   r   r   r   r   next_rendezvousE   s   z!RendezvousHandler.next_rendezvousc                 C   r   )aL  
        Checks whether rendezvous for current job has been closed,
        which means all future attempts to re-rendezvous (within same job) will
        fail.

        .. note:: ``is_closed`` and ``set_closed`` have semantics of eventual
                  propagation, and should not be used for synchronization.
                  The intention here is that if at least one worker decides
                  the job is finished, it will close the rendezvous, and
                  other workers will soon observe this and stop
                  training/rendezvous-ing as well.
        Nr   r   r   r   r   	is_closedY   s   zRendezvousHandler.is_closedc                 C   r   )zJ
        Used to mark the rendezvous (for current job) as closed.
        Nr   r   r   r   r   
set_closedi   r   zRendezvousHandler.set_closedc                 C   r   )ue  
        Returns number of workers who *arrived late* at
        the rendezvous barrier, hence weren’t included in the current worker
        group.

        Callers should periodically call this method to check whether
        new members are waiting to join the job and if so admit them by
        calling ``next_rendezvous()`` (re-rendezvous).
        Nr   r   r   r   r   num_nodes_waitingp   s   z#RendezvousHandler.num_nodes_waitingc                 C   r   )a#  
        Returns the run_id of this rendezvous handler. The run_id is a user-defined
        id that uniquely identifies an instance of a distributed application.
        It typically maps to a job id and is used to allow workers to join the
        correct distributed application.
        Nr   r   r   r   r   
get_run_id}   s   zRendezvousHandler.get_run_idc                 C   r   )a2  
        Closes all resources that were open for rendezvous run.

        Usage:

        ::

         def main():
             rdzv_handler = ...
             try:
               rank, world_size, store = rdzv_handler.next_rendezvous()
             finally:
               rdzv_handler.shutdown()
        Nr   r   r   r   r   shutdown   s   zRendezvousHandler.shutdownN)r
   r   r   r   abcabstractmethodstrr   r   intr   boolr   r   r   r   r   r   r   r   r   r   1   s"    
	r   c                
   @   s   e Zd ZU dZdZeed< dZeed< dededed	ed
ef
ddZ	e
dd Ze
dd ZddededefddZddedee dee fddZddedee dee fddZdS )RendezvousParameterszR
    The data object holding parameters to construct a ``RendezvousHandler``.
    iX  _DEFAULT_TIMEOUT   _DEFAULT_LAST_CALL_TIMEOUTbackendendpointrun_id	min_nodes	max_nodesc                 K   sX   |du rt d|dk rt d||k rt d|| _|| _|| _|| _|| _|| _dS )a  
        Args:
            backend: The backend that is used to register the rendezvous.
            endpoint: The endpoint of the rendezvous. Usually it is a string in the format
                <hostname>:<port>.
            run_id: The id of the rendezvous.
            min_nodes: The minimum number of nodes required to complete the rendezvous.
            max_nodes: The maximum number of nodes that are allowed to join the rendezvous.
            **kwargs: Additional parameters for the specified backend.
        NzThe backend cannot be None.   z6The minimum number of nodes must be greater than zero.zYThe maximum number of nodes must be greater than or equal to the minimum number of nodes.)
ValueErrorr)   r*   r+   r,   r-   config)r   r)   r*   r+   r,   r-   kwargsr   r   r   __init__   s   
zRendezvousParameters.__init__c                 C      |  d| jS )z6
        Gets the timeout for the rendezvous.
        timeout)
get_as_intr&   r   r   r   r   r4      s   zRendezvousParameters.timeoutc                 C   r3   )z
        Gets additional waiting time after reaching the minimum number of nodes
        in case the rendezvous is elastic (min != max).
        last_call_timeout)r5   r(   r   r   r   r   r6      s   z&RendezvousParameters.last_call_timeoutNkeydefaultr   c                 C   s   | j ||S )zT
        Returns the value for ``key`` if ``key`` exists, else ``default``.
        )r0   get)r   r7   r8   r   r   r   r9      s   zRendezvousParameters.getc                 C   s^   |  ||}|du r|S t|tst|tr|rdS dS t|tr'| dv S td| d)zP
        Returns the value for ``key`` as a ``bool`` if ``key`` exists.
        NTF)1truetyesyThe 'z=' rendezvous config does not represent a valid boolean value.)r9   
isinstancer#   r$   r"   lowerr/   r   r7   r8   valr   r   r   get_as_bool   s   

z RendezvousParameters.get_as_boolc                 C   sB   |  ||}|du r|S zt|W S  ty    td| dw )zP
        Returns the value for ``key`` as an ``int`` if ``key`` exists.
        Nr?   z=' rendezvous config does not represent a valid integer value.)r9   r#   r/   rB   r   r   r   r5      s   

zRendezvousParameters.get_as_intN)r
   r   r   r   r&   r#   __annotations__r(   r"   r2   propertyr4   r6   r   r9   r   r$   rD   r5   r   r   r   r   r%      s,   
 
$

 $r%   c                   @   s<   e Zd ZdZdd ZdedefddZded	e	fd
dZ
dS )RendezvousHandlerFactoryzT
    Creates ``RendezvousHandler`` instances for supported rendezvous backends.
    c                 C   s
   i | _ d S rE   )	_registryr   r   r   r   r2     s   
z!RendezvousHandlerFactory.__init__r)   creatorc                 C   sl   z| j | }W n ty   d}Y nw |dur/td| d|j d|j d|j d|j d|| j |< dS )z5
        Registers a new rendezvous backend.
        NThe rendezvous backend 'z' cannot be registered with '.z$' as it is already registered with ''.)rI   KeyErrorr/   r   r
   )r   r)   rJ   current_creatorr   r   r   register  s   *z!RendezvousHandlerFactory.registerparamsr   c              	   C   sr   z| j |j }W n ty   td|j d| jj dw ||}| |jkr7td|  d|j d|S )zY
        Creates a new ``RendezvousHandler`` instance for the specified backend.
        rK   z,' is not registered. Did you forget to call ?z The rendezvous handler backend 'z(' does not match the requested backend 'rM   )rI   r)   rN   r/   rP   r
   r   RuntimeError)r   rQ   rJ   handlerr   r   r   create_handler  s   z'RendezvousHandlerFactory.create_handlerN)r
   r   r   r   r2   r"   RendezvousHandlerCreatorrP   r%   r   rU   r   r   r   r   rH      s
    rH   )r    typingr   r   r   r   r   	Exceptionr   r   r   r   ABCr   r%   rV   rH   r   r   r   r   <module>   s   
	hc