o
    wOi                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ e Z	dZ
dZdZdZdZ			
		ddededededef
ddZdd Zdd ZdejfddZdS )    N)closing)
get_loggerzAddress already in usezconnect() timed out.zSocket Timeoutz_tcp_store/num_membersz_tcp_store/last_member   X     	is_serverserver_addrserver_port
world_sizetimeoutc           
      C   sh  |dkr|dkrt d| d| |dkrtd| d |dkr%|nd}	 |dkr/|}nt }td| d	| d
| d|  d| d ztj|||| tj|dd}t|| td |W S  t	y }	 zDt
|	tkr}| s}td| d	| |	t
|	tkr||k rtd| d| d| d |d7 }ntd| d| d|	 W Y d }	~	nd }	~	ww q()Nr   r   zCserver_port must be specified when world_size > 1, got server_port=z, world_size=zsever_port: z, specified, ignoring retriesTzCreating c10d store on :z
  world_size  : z
  is_server   : z
  timeout(sec): 
)seconds)	host_nameportr   	is_masterr   zSuccessfully created c10d storez*timed out waiting for tcp store's server: zport: z already in use, attempt: [/]zon z, port: z already in use)
ValueErrorloginfoget_free_portdistTCPStoredatetime	timedelta_check_full_rankRuntimeErrorstr_CONNECT_TIMEOUTTimeoutError_ADDRESS_IN_USEwarningIOError)
r   r	   r
   r   r   retriesattemptr   storee r)   R/home/ubuntu/.local/lib/python3.10/site-packages/torchelastic/utils/distributed.pycreate_c10d_store   sl   



r+   c              
   C   sj   |  td}||kr| td z| t W d S  ty4 } zt|tkr/td| d| d }~ww )Nr   z<val_ignored>ztimed out waiting for all z members to join)	add_MEMBER_CHECKINset_LAST_MEMBER_CHECKINgetr   r   _SOCKET_TIMEOUTr!   )r'   r   idxr(   r)   r)   r*   r   \   s   
r   c                  C   s>   t  } t|  |  d W  d    S 1 sw   Y  d S )Nr   )get_socket_with_portr   getsockname)sockr)   r)   r*   r   l   s   

$r   returnc                  C   s   t jddt jt jd} | D ]<}|\}}}}}t  |||}z|d |d |W   S  tyI } z|  tj	d|d W Y d}~qd}~ww t
d)	a  
    Returns a free port on localhost that is "reserved" by binding a temporary
    socket on it. Close the socket before passing the port to the entity
    that requires it. Usage example

    ::

    sock = _get_socket_with_port()
    with closing(sock):
        port = sock.getsockname()[1]
        sock.close()
        # there is still a race-condition that some other process
        # may grab this port before func() runs
        func(port)
    	localhostN)hostr   familytype)r7   r   r   zSocket creation attempt failed.)exc_infozFailed to create a socket)socketgetaddrinfo	AF_UNSPECSOCK_STREAMbindlistenOSErrorcloser   r   r   )addrsaddrr9   r:   proto_sr(   r)   r)   r*   r3   r   s    


r3   )r   r   r   r   )r   r<   
contextlibr   torch.distributeddistributedr   torchelastic.utils.loggingr   r   r"   r    r1   r-   r/   boolr   intfloatr+   r   r   r3   r)   r)   r)   r*   <module>   s:   
B