o
    „o™iõ"  ã                   @   s²   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ e  e¡ZG dd	„ d	eƒZd
efdd„Zd
efdd„ZdS )é    N)ÚOptional)Úoverride)ÚClusterEnvironment)Ú_IS_WINDOWS)Úrank_zero_warn)ÚPossibleUserWarningc                       sz  e Zd ZdZd.dedeej ddf‡ fdd„Ze	e
defd	d
„ƒƒZe	e
defdd„ƒƒZe	e
defdd„ƒƒZee
defdd„ƒƒZedee fdd„ƒZedee fdd„ƒZe
defdd„ƒZe
deddfdd„ƒZe
defdd„ƒZe
deddfdd„ƒZe
defdd „ƒZe
defd!d"„ƒZe
d#ed$eddfd%d&„ƒZed'edefd(d)„ƒZed/d*d+„ƒZed/d,d-„ƒZ‡  ZS )0ÚSLURMEnvironmenta'  Cluster environment for training on a cluster managed by SLURM.

    You can configure the `main_address` and `main_port` properties via the env variables `MASTER_ADDR` and
    `MASTER_PORT`, respectively.

    Args:
        auto_requeue: Whether automatic job resubmission is enabled or not. How and under which conditions a job gets
            rescheduled gets determined by the owner of this plugin.
        requeue_signal: The signal that SLURM will send to indicate that the job should be requeued. Defaults to
            SIGUSR1 on Unix.

    TNÚauto_requeueÚrequeue_signalÚreturnc                    s<   t ƒ  ¡  || _|d u rtstj}|| _|  ¡  |  ¡  d S )N)	ÚsuperÚ__init__r	   r   ÚsignalÚSIGUSR1r
   Ú_validate_srun_usedÚ_validate_srun_variables)Úselfr	   r
   ©Ú	__class__© ú_/home/ubuntu/.local/lib/python3.10/site-packages/lightning/fabric/plugins/environments/slurm.pyr   /   s   
zSLURMEnvironment.__init__c                 C   s   dS )NTr   ©r   r   r   r   Úcreates_processes_externally8   s   z-SLURMEnvironment.creates_processes_externallyc                 C   sP   t j d¡}|d u rt j dd¡}|  |¡}|t jd< t dt jd › ¡ |S )NÚMASTER_ADDRÚSLURM_NODELISTz	127.0.0.1zMASTER_ADDR: )ÚosÚenvironÚgetÚresolve_root_node_addressÚlogÚdebug)r   Ú	root_nodeÚnodelistr   r   r   Úmain_address=   s   

zSLURMEnvironment.main_addressc                 C   st   t j d¡}|d ur|dd … }t|ƒd }nd}dt jv r&tt jd ƒ}nt|ƒt jd< t dt jd › ¡ |S )NÚSLURM_JOB_IDéüÿÿÿi˜:  in2  ÚMASTER_PORTzMASTER_PORT: )r   r   r   ÚintÚstrr   r    )r   Újob_idÚdefault_portr   r   r   Ú	main_portI   s   
zSLURMEnvironment.main_portc                   C   s   t  ¡  tƒ S )aÒ  Returns ``True`` if the current process was launched on a SLURM cluster.

        It is possible to use the SLURM scheduler to request resources and then launch processes manually using a
        different environment. For this, the user can set the job name in SLURM to 'bash' or 'interactive' (srun --job-
        name=interactive). This will then avoid the detection of ``SLURMEnvironment`` and another environment can be
        detected automatically.

        )r   r   Ú_is_srun_usedr   r   r   r   Údetecte   s   zSLURMEnvironment.detectc                   C   s   t j d¡S )NÚSLURM_JOB_NAME)r   r   r   r   r   r   r   Újob_names   s   zSLURMEnvironment.job_namec                  C   sB   t ƒ rd S tj d¡} | d u rd S zt| ƒW S  ty    Y d S w )Nr$   ©Ú_is_slurm_interactive_moder   r   r   r'   Ú
ValueError)r)   r   r   r   r)   w   s   
ÿzSLURMEnvironment.job_idc                 C   ó   t tjd ƒS ©NÚSLURM_NTASKS©r'   r   r   r   r   r   r   Ú
world_size…   ó   zSLURMEnvironment.world_sizeÚsizec                 C   ó   t  d¡ d S )Nz[SLURMEnvironment.set_world_size was called, but setting world size is not allowed. Ignored.©r   r    )r   r9   r   r   r   Úset_world_size‰   r8   zSLURMEnvironment.set_world_sizec                 C   r3   )NÚSLURM_PROCIDr6   r   r   r   r   Úglobal_rank   r8   zSLURMEnvironment.global_rankÚrankc                 C   r:   )Nz]SLURMEnvironment.set_global_rank was called, but setting global rank is not allowed. Ignored.r;   )r   r?   r   r   r   Úset_global_rank‘   r8   z SLURMEnvironment.set_global_rankc                 C   r3   )NÚSLURM_LOCALIDr6   r   r   r   r   Ú
local_rank•   r8   zSLURMEnvironment.local_rankc                 C   r3   )NÚSLURM_NODEIDr6   r   r   r   r   Ú	node_rank™   r8   zSLURMEnvironment.node_rankÚnum_devicesÚ	num_nodesc                 C   sŠ   t ƒ rd S tj d¡}|d ur#t|ƒ|kr#td|› d|› d|› dƒ‚tj d¡}|d urAt|ƒ|krCtd|› d|› d	|› dƒ‚d S d S )
NÚSLURM_NTASKS_PER_NODEzYou set `devices=zX` in Lightning, but the number of tasks per node configured in SLURM `--ntasks-per-node=z%` does not match. HINT: Set `devices=z`.ÚSLURM_NNODESzYou set `num_nodes=zE` in Lightning, but the number of nodes configured in SLURM `--nodes=z'` does not match. HINT: Set `num_nodes=r0   )r   rE   rF   Úntasks_per_nodeÚnnodesr   r   r   Úvalidate_settings   s*   ÿÿÿÿÿÿÿz"SLURMEnvironment.validate_settingsÚnodesc                 C   s4   t  dd| ¡} t  dd| ¡} |  d¡d  d¡d S )a¢  The node selection format in SLURM supports several formats.

        This function selects the first host name from

        - a space-separated list of host names, e.g., 'host0 host1 host3' yields 'host0' as the root
        - a comma-separated list of host names, e.g., 'host0,host1,host3' yields 'host0' as the root
        - the range notation with brackets, e.g., 'host[5-9]' yields 'host5' as the root

        z\[(.*?)[,-].*\]z\1z	\[(.*?)\]ú r   ú,)ÚreÚsubÚsplit)rL   r   r   r   r   ®   s   z*SLURMEnvironment.resolve_root_node_addressc                  C   sf   t rdS t d¡du} | r/tƒ s1d dtj tj	¡gtj
¢¡dd… }td|› dtd dS dS dS )aU  Checks if the `srun` command is available and used.

        Parallel jobs (multi-GPU, multi-node) in SLURM are launched by prepending `srun` in front of the Python command.
        Not doing so will result in processes hanging, which is a frequent user error. Lightning will emit a warning if
        `srun` is found but not used.

        NÚsrunrM   é@   z§The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: z ...)Úcategory)r   ÚshutilÚwhichr,   Újoinr   ÚpathÚbasenameÚsysÚ
executableÚargvr   r   )Úsrun_existsÚhintr   r   r   r   ½   s   	
&ÿ
ýþz$SLURMEnvironment._validate_srun_usedc                  C   sB   t tj dd¡ƒ} | dkrdtjvrtd| › d| › dƒ‚dS dS )	a2  Checks for conflicting or incorrectly set variables set through `srun` and raises a useful error message.

        Right now, we only check for the most common user errors. See
        `the srun docs <https://slurm.schedmd.com/srun.html>`_
        for a complete list of supported srun variables.

        r5   Ú1é   rG   zYou set `--ntasks=z^` in your SLURM bash script, but this variable is not supported. HINT: Use `--ntasks-per-node=z
` instead.N)r'   r   r   r   ÚRuntimeError)Úntasksr   r   r   r   Ò   s   	ÿÿÿz)SLURMEnvironment._validate_srun_variables)TN)r   N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úboolr   r   ÚSignalsr   Úpropertyr   r   r(   r#   r'   r+   Ústaticmethodr-   r/   r)   r7   r<   r>   r@   rB   rD   rK   r   r   r   Ú__classcell__r   r   r   r   r   !   sN    "	
r   r   c                   C   s   dt jv otƒ  S r4   )r   r   r1   r   r   r   r   r,   ã   s   r,   c                   C   s   t  ¡ dv S )N)ÚbashÚinteractive)r   r/   r   r   r   r   r1   ç   s   r1   )Úloggingr   rO   rU   r   rZ   Útypingr   Útyping_extensionsr   Ú9lightning.fabric.plugins.environments.cluster_environmentr   Ú"lightning.fabric.utilities.importsr   Ú$lightning.fabric.utilities.rank_zeror   Ú#lightning.fabric.utilities.warningsr   Ú	getLoggerrc   r   r   rg   r,   r1   r   r   r   r   Ú<module>   s"   
 C