o
    zim                     @   s   d dl Z d dlZd dlZd dlmZmZmZmZ d dlm	Z	 d dl
mZ d dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ e eZe	d
ZG dd deZdS )    N)AnyCallableListOptional)RequirementCache)override)ClusterEnvironment)_basic_subprocess_cmd_hydra_subprocess_cmd_launch_process_observer)_set_num_threads_if_needed)	_Launcher)_SIGNUMz
hydra-corec                       s   e Zd ZdZdedededdf fddZeede	fd	d
Z
edddededed dedef
ddZededdfddZdddZdddZ  ZS )_SubprocessScriptLaunchera  A process launcher that invokes the current script as many times as desired in a single node.

    This launcher needs to be invoked on each node.
    In its default behavior, the main process in each node then spawns N-1 child processes via :func:`subprocess.Popen`,
    where N is the number of devices (e.g. GPU) per node. It is very similar to how :mod:`torch.distributed.run`
    launches processes.

    For example, if the script gets invoked with the command

    .. code-block:: bash

        python train.py --devices 4

    The launcher will create three additional subprocesses that get called like so:

    .. code-block:: bash

        LOCAL_RANK=1 python train.py --devices 4
        LOCAL_RANK=2 python train.py --devices 4
        LOCAL_RANK=3 python train.py --devices 4

    It is implied that the main process which launched the others has ``LOCAL_RANK=0``.
    Beside the local rank, the following other environment variables also get set, but unlike the local rank, these
    get determined by the cluster environment:

    1. `MASTER_ADDR`: The IP address of the main node.
    2. `MASTER_PORT`: The port number of the main node through which all processes communicate.
    3. `NODE_RANK`: The index of the node the current process is running on. Ranges from 0 to ``num_nodes - 1``.
    4. `WORLD_SIZE`: The total number of processes across all nodes, i.e., ``num_processes * num_nodes``.

    Arguments:
        cluster_environment: A cluster environment that provides access to world size, node rank, etc.
        num_processes: The number of processes to launch in the current node.
        num_nodes: The total number of nodes that participate in this process group.

    cluster_environmentnum_processes	num_nodesreturnNc                    s&   t    || _|| _|| _g | _d S )N)super__init__r   r   r   procs)selfr   r   r   	__class__ l/home/ubuntu/.local/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.pyr   K   s
   

z"_SubprocessScriptLauncher.__init__c                 C   s   dS )NFr   r   r   r   r   is_interactive_compatibleR   s   z3_SubprocessScriptLauncher.is_interactive_compatible)trainerfunctionargsr   z
pl.Trainerkwargsc                O   sH   | j j| j| jd | j js|   t| j t| jd ||i |S )a0  Creates new processes, then calls the given function.

        Arguments:
            function: A callback function to execute after all processes have been created.
                It is up to the implementation of this function to synchronize the processes, e.g., with barriers.
            *args: Optional positional arguments to be passed to the given function.
            trainer: Optional reference to the :class:`~pytorch_lightning.trainer.trainer.Trainer`.
            **kwargs: Optional keyword arguments to be passed to the given function.

        )num_devicesr   )r   )	r   validate_settingsr   r   creates_processes_externally_call_children_scriptsr   r   r   )r   r   r   r    r!   r   r   r   launchW   s   
z _SubprocessScriptLauncher.launchsignumc              	   C   s<   | j D ]}tdt  d|j d|  || qd S )NzProcess z is terminating z with )r   logdebugosgetpidpidsend_signal)r   r'   procr   r   r   killk   s   
"z_SubprocessScriptLauncher.killc                 C   s  |    g | _| jjtjd< t| jjtjd< t| j tjd< t| j	 tjd< | j
| j  tjd< td| j
D ]E}tj }| |d< tjdd u rVd|v rV|d= d}d }trfd	d
lm} | }|rot|\}}nt }tj|||d}| j| q;d S )NMASTER_ADDRMASTER_PORT	NODE_RANK
LOCAL_RANK
WORLD_SIZE   PL_GLOBAL_SEEDFr   )HydraConfig)envcwd)_check_can_spawn_childrenr   r   main_addressr*   environstr	main_port	node_rank
local_rankr   r   rangecopyget_HYDRA_AVAILABLEhydra.core.hydra_configr7   initializedr
   r	   
subprocessPopenappend)r   r@   env_copyhydra_in_user9   r7   commandnew_processr   r   r   r%   r   s.   

z0_SubprocessScriptLauncher._call_children_scriptsc                 C   s0   t | jdkrtd| j dkrtdd S )Nr   z/The launcher can only create subprocesses once.a  Lightning attempted to launch new distributed processes with `local_rank > 0`. This should not happen. Possible reasons: 1) LOCAL_RANK environment variable was incorrectly modified by the user, 2) `ClusterEnvironment.creates_processes_externally` incorrectly implemented.)lenr   RuntimeErrorr   r@   r   r   r   r   r:      s   z3_SubprocessScriptLauncher._check_can_spawn_children)r   N)__name__
__module____qualname____doc__r   intr   propertyr   boolr   r   r   r   r&   r   r/   r%   r:   __classcell__r   r   r   r   r   %   s    %*
%r   )loggingr*   rG   typingr   r   r   r    lightning_utilities.core.importsr   typing_extensionsr   pytorch_lightningpllightning_fabric.pluginsr   7lightning_fabric.strategies.launchers.subprocess_scriptr	   r
   r   &lightning_fabric.utilities.distributedr   /pytorch_lightning.strategies.launchers.launcherr   5pytorch_lightning.trainer.connectors.signal_connectorr   	getLoggerrP   r(   rD   r   r   r   r   r   <module>   s   
