o
    rri                     @   s   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZddl	m
Z
 eeZeddZdd Zd	d
 ZdddZdd Zdd Zdd ZdS )    )
namedtupleN   )get_xpDistribSpecz5rank world_size local_rank node_rank num_nodes sourcec            	      C   s  t  } | jdkr
dS dtjvr8dtjv sJ dtjd }tjd |g dddj  }|d	 }|tjd< d
tjvrjt	 }|j
}zt }W n	 tyR   Y nw ||j7 }t|}|dd}t|tjd
< dtjvrt| jtjd< t| jtjd< t| jtjd< dS dS )a  Calling this function will set the distributed environement
    including the master addr, master port etc. You shouldn't call
    this if you call `dora.distrib.init`, but it can be useful if you need to let
    some other framework handle the distributed initialization.
    r   NMASTER_ADDRSLURM_JOB_NODELISTzcase not handledzscontrol show hostnamesT)capture_outputcheckr   MASTER_PORTi N  i`  
WORLD_SIZERANK
LOCAL_RANK)get_distrib_spec
world_sizeosenvironsprunsplitstdoutdecoder   sigsubmititJobEnvironmentRuntimeErrorjob_idrandomRandomrandintstrrank
local_rank)	specnodelistnodesmaster_nodexpseedenvrngmaster_port r+   @/home/ubuntu/.local/lib/python3.10/site-packages/dora/distrib.pyset_distrib_env   s<   








r-   c                  C   s   dt jv r)tt jd } tt jd }dt jv r tt jd }n| }d}d}d}n-zt }W n tyD   d} d}d}d}d}d}Y nw |j} |j}|j}|j	}|j
}d}t| |||||S )	zReturn information on the distributed setup, i.e. world size, rank etc.
    This can be used even before distributed training is initialized, which is useful for
    PytorchLightning for instance.
    r   r   r   r   r   r(   emptyr   )r   r   intr   r   r   global_rank	num_tasksr!   node	num_nodesr   )r    r   r!   	node_rankr3   sourcer(   r+   r+   r,   r   >   s4   

r   ncclc                 C   s   t j rdS t }|jdkrtd dS t }t j	 r&t j
|j n| dks,J |jjr:dtj|j }nt  d}t jj| ||j|jd td|j|j|j|j |jjrmt j  t d	kro|j  dS dS dS )
z
    Initialize DDP.
    Nr   zworld_size is 1, skipping init.r6   zfile://zenv://)backendinit_methodr   r    z*Distributed init: %d/%d (local %d) from %sr   )torchdistributedis_initializedr   r   loggerinfor   cudais_available
set_devicer!   dorause_rendezvousr   pathabspathrendezvous_filer-   init_process_groupr    r5   barrierunlink)r7   r"   r&   r8   r+   r+   r,   inita   s<   





rI   c                   C   s
   t  dkS Nr   )r    r+   r+   r+   r,   	is_master   s   
rK   c                   C      t j r
t j S dS rJ   )r9   r:   r;   get_rankr+   r+   r+   r,   r          

r    c                   C   rL   )Nr   )r9   r:   r;   get_world_sizer+   r+   r+   r,   r      rN   r   )r6   )collectionsr   loggingr   r   
subprocessr   r   r9   r&   r   	getLogger__name__r<   r   r-   r   rI   rK   r    r   r+   r+   r+   r,   <module>   s$   
%
#$