o
    wOi                     @   s   d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZ ddlmZ defdd	Zzdd
lT W n	 eyX   Y nw dd Z ddee defddZ!dS )a  
Below is a simple program that locally launches a multi-role
(trainer, parameter server, reader) distributed application.
Each ``Role`` runs multiple replicas. In reality each replica
runs on its own container on a host. An ``Application`` is made up
to one or more such ``Roles``.

.. code:: python

 import getpass
 import torchelastic.tsm.driver as tsm

 username = getpass.getuser()
 train_project_dir = tsm.Container(image=f"/home/{username}/pytorch_trainer")
 reader_project_dir = tsm.Container(image=f"/home/{username}/pytorch_reader")

 trainer = tsm.ElasticRole(name="trainer", nprocs_per_node=2, nnodes="4:4")
              .runs("train_main.py", "--epochs", "50", MY_ENV_VAR="foobar")
              .on(train_project_dir)
              .replicas(4)

 ps = tsm.Role(name="parameter_server")
         .run("ps_main.py")
         .on(train_project_dir)
         .replicas(10)

 reader = tsm.Role(name="reader")
             .runs("reader/reader_main.py", "--buffer_size", "1024")
             .on(reader_project_dir)
             .replicas(1)

 app = tsm.Application(name="my_train_job").of(trainer, ps, reader)

 session = tsm.session(name="my_session")
 app_id = session.run(app, scheduler="local")
 session.wait(app_id)

In the example above, we have done a few things:

#. Created and ran a distributed training application that runs a total of
   4 + 10 + 1 = 15 containers (just processes since we used a ``local`` scheduler).
#. ``trainer`` run wrapped with TorchElastic.
#. The ``trainer`` and ``ps`` run from the same image (but different containers):
   ``/home/$USER/pytorch_trainer`` and the reader runs from the image:
   ``/home/$USER/pytorch_reader``. The images map to a local directory
   because we are using a local scheduler. For other non-trivial schedulers
   a container could map to a Docker image, tarball, rpm, etc.
#. The main entrypoints are relative to the container image's root dir.
   For example, the trainer runs ``/home/$USER/pytorch_trainer/train_main.py``.
#. Arguments to each role entrypoint are passed as ``*args`` after the entrypoint CMD.
#. Environment variables to each role entrypoint are passed as ``**kwargs``
   after the arguments.
#. The ``session`` object has action APIs on the app (see :class:`Session`).


    N)Optional)AppDryRunInfo	AppHandleApplicationAppState	AppStatus	ContainerDescribeAppResponseElasticRoleResourceRetryPolicyRole	RunConfig	SchedulerSchedulerBackendSessionis_terminalmacrosparse_app_handlerunopts)get_schedulers)StandaloneSessionreturnc                   C   s   t  S )N)getpassgetuser r   r   T/home/ubuntu/.local/lib/python3.10/site-packages/torchelastic/tsm/driver/__init__.py	get_owner]   s   r   )*c                   C   s   dt   S )Ntsm_)r   r   r   r   r   _gen_session_nameg   s   r    
standalonenamebackendc                 K   s@   |dkrt d| d| st } | |d< t| tdi |dS )Nr!   zUnsupported session backend: z. Supported values: standalonesession_name)r"   
schedulersr   )
ValueErrorr    r   r   )r"   r#   scheduler_argsr   r   r   sessionk   s   
r(   )Nr!   )"__doc__r   typingr   torchelastic.tsm.driver.apir   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   "torchelastic.tsm.driver.schedulersr   *torchelastic.tsm.driver.standalone_sessionr   strr   $torchelastic.tsm.driver.api_extendedModuleNotFoundErrorr    r(   r   r   r   r   <module>   s   8T