o
    $i                     @   s   d dl Z d dlmZ d dlmZmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlZd dlmZ d dlmZmZ d d	lmZmZ d d
lmZ eddeG dd deZG dd deZdededefddZeG dd dZdS )    N)	dataclass)OptionalSet)Coordinator)detect_nicsnics_to_env_var)secrettimeout)update_env_vars)WorkerWorkerGroup)BackendBackendConfig)	PublicAPIbeta)	stabilityc                   @   s   e Zd ZU dZdZeee  ed< dZ	e
ed< dZee ed< dZee
 ed< dZee ed< dZee ed	< d
Ze
ed< dZe
ed< edd Zdd Zedd ZdS )HorovodConfiga  Configurations for Horovod setup.

    See https://github.com/horovod/horovod/blob/master/horovod/runner/common/util/settings.py # noqa: E501

    Args:
        nics (Optional[Set[str]): Network interfaces that can be used for
            communication.
        verbose: Horovod logging verbosity.
        key (Optional[str]): Secret used for communication between workers.
        ssh_port (Optional[int]): Port for SSH server running on worker nodes.
        ssh_identity_file (Optional[str]): Path to the identity file to
            ssh into different hosts on the cluster.
        ssh_str (Optional[str]): CAUTION WHEN USING THIS. Private key
            file contents. Writes the private key to ssh_identity_file.
        timeout_s: Timeout parameter for Gloo rendezvous.
        placement_group_timeout_s: Timeout parameter for Ray
            Placement Group creation. Currently unused.
    Nnics   verbosekeyssh_portssh_identity_filessh_stri,  	timeout_sd   placement_group_timeout_sc                 C   s   t j| jddS )NzTimed out waiting for {activity}. Please check connectivity between servers. You may need to increase the --start-timeout parameter if you have too many servers.)message)r	   Timeoutr   self r!   U/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/train/horovod/config.pystart_timeout/   s   zHorovodConfig.start_timeoutc                 C   sv   | j r-tj| js-t| jd}t| jd || j  W d    n1 s(w   Y  | jd u r9t	
 | _d S d S )Nwi  )r   ospathexistsr   openchmodwriter   r   make_secret_key)r    fr!   r!   r"   __post_init__9   s   
zHorovodConfig.__post_init__c                 C   s   t S N)_HorovodBackendr   r!   r!   r"   backend_clsB   s   zHorovodConfig.backend_cls)__name__
__module____qualname____doc__r   r   r   str__annotations__r   intr   r   r   r   r   r   propertyr#   r-   r0   r!   r!   r!   r"   r      s   
 
		r   c                   @   s,   e Zd ZU dZeed< dedefddZdS )r/   Tshare_cuda_visible_devicesworker_groupbackend_configc              
      s\  g }t tD ]}j| jj}||t|t| qt	| t
|| _dd jD  dd jD }tt| D ]\}\}}| j||| qA| j }	g }|	 D ]\}}
||t|
 q[t	| | j } fddt D }fdd|D }t|t| jjksJ t|t| jj|d}|t| t| d S )Nc                 S      g | ]}|j jqS r!   )metadatanode_id.0r$   r!   r!   r"   
<listcomp>e       z,_HorovodBackend.on_start.<locals>.<listcomp>c                 S   r<   r!   )r=   hostnamer?   r!   r!   r"   rA   f   rB   c                    s   g | ]}  |qS r!   )index)r@   r>   )node_idsr!   r"   rA   y   s    c                    s   g | ]	}t  j| qS r!   )_HorovodWorkerWrapperworkers)r@   worker_index)r:   r!   r"   rA   z   s    )all_host_namesnode_workers)rangelenrG   r=   r>   appendexecute_single_async_init_env_varsraygetr   coordinator	enumeratezipregisterfinalize_registrationitemsr
   establish_rendezvousset	hostnamesr   listupdater   execute)r    r:   r;   setup_futuresrankworker_node_idrZ   rC   r>   all_infolocal_cross_env_varcoordinator_envsnode_worker_indexesrJ   r   r!   )rE   r:   r"   on_startJ   sP   
	





z_HorovodBackend.on_startN)	r1   r2   r3   r9   boolr6   r   r   re   r!   r!   r!   r"   r/   G   s   
 r/   
world_rank
world_sizer>   c                 C   s*   |t jd< t| t jd< t|t jd< dS )z)Initialize Horovod environment variables.HOROVOD_HOSTNAMEHOROVOD_RANKHOROVOD_SIZEN)r%   environr5   )rg   rh   r>   r!   r!   r"   rO      s   
rO   c                   @   s"   e Zd ZU eed< edd ZdS )rF   r$   c                    s   | j  G  fddd}| S )Nc                       s   e Zd Z fddZdS )z4_HorovodWorkerWrapper.execute.<locals>.ExecuteHandlec                    s"   d } j jj||g|R i |S r.   )actor_RayTrainWorker__executeremote)r    funcargskwargs_r$   r!   r"   ro      s   z;_HorovodWorkerWrapper.execute.<locals>.ExecuteHandle.remoteN)r1   r2   r3   ro   r!   rt   r!   r"   ExecuteHandle   s    ru   rt   )r    ru   r!   rt   r"   r]      s   z_HorovodWorkerWrapper.executeN)r1   r2   r3   r   r6   r8   r]   r!   r!   r!   r"   rF      s   
 rF   )r%   dataclassesr   typingr   r   horovod.ray.runnerr   horovod.ray.utilsr   r   horovod.runner.common.utilr   r	   rP   ray.train._internal.utilsr
    ray.train._internal.worker_groupr   r   ray.train.backendr   r   ray.utilr   r   r/   r7   r5   rO   rF   r!   r!   r!   r"   <module>   s$    5C