o
    $i                     @   s   d dl Z d dlZd dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZ d dlmZ e eZed	d
eG dd deZdedededededefddZdd ZG dd deZdS )    N)	dataclass)ray_constants)get_address_and_port)WorkerGroup)BackendBackendConfig)*DEFAULT_JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S"JAX_DISTRIBUTED_SHUTDOWN_TIMEOUT_S)	PublicAPIalpha)	stabilityc                   @   s2   e Zd ZU dZeed< dZeed< edd ZdS )	JaxConfigFuse_tpuuse_gpuc                 C   s   t S N)_JaxBackend)self r   T/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/train/v2/jax/config.pybackend_cls   s   zJaxConfig.backend_clsN)	__name__
__module____qualname__r   bool__annotations__r   propertyr   r   r   r   r   r      s
   
 r   master_addr_with_portnum_workersindexr   r   resources_per_workerc           
      C   s   t jdd }|s|rdt jd< d}|s|rdt jd< d}d|dv r;|dd}ddd	 t|D t jd
< ddl}d|dv rS|j	| || t
d d|dv rw|dkrett|}	nd}	|j	| |||	 t
d dS dS )a  Set up distributed Jax training information.

    This function should be called on each worker. It sets JAX environment
    variables and initializes JAX distributed training.

    Args:
        master_addr_with_port: The master address with port for coordination.
        num_workers: Total number of workers.
        index: Index of this worker.
        use_tpu: Whether to configure for TPU. If True and JAX_PLATFORMS is not
            already set, it will be set to "tpu".
        use_gpu: Whether to configure for GPU. If True and JAX_PLATFORMS is not
            already set, it will be set to "cuda".
        resources_per_worker: The resources per worker.
    JAX_PLATFORMS tpucuda,GPUr   c                 s   s    | ]}t |V  qd S r   )str).0ir   r   r   	<genexpr>B   s    
z5_setup_jax_distributed_environment.<locals>.<genexpr>CUDA_VISIBLE_DEVICESNz#Initialized JAX distributed on TPU.z$Initialized JAX distributed on CUDA.)osenvirongetlowersplitjoinrangejaxdistributed
initializeloggerinfolist)
r   r   r   r   r   r   jax_platformsnum_gpus_per_workerr2   local_device_idsr   r   r   "_setup_jax_distributed_environment   s2   



r;   c               
   C   sP   zddl } | j  W dS  ty' } ztd|  W Y d}~dS d}~ww )zShutdown JAX distributed environment.

    This function should be called on each worker during cleanup.
    If JAX distributed was not initialized, this is a no-op.
    r   N'Error during JAX distributed shutdown: )r2   r3   shutdown	Exceptionr5   warning)r2   er   r   r   _shutdown_jax_distributedW   s   rA   c                   @   s0   e Zd ZdedefddZdedefddZdS )r   worker_groupbackend_configc                 C   s~   |j s|jsd S |dt\}}| d| }g }tt|D ]}||j|t|t|||j |j|	 d qt
| d S )Nr   :)r   r   r   r   r   r   )r   r   execute_singler   r1   lenappendexecute_single_asyncr;   get_resources_per_workerrayr-   )r   rB   rC   master_addrmaster_portr   setup_futuresr(   r   r   r   on_startf   s&   z_JaxBackend.on_startc              
   C   s   |j s|jsdS |t}ttt}ztj	||d t
d W dS  tjjy7   t
d| d Y dS  tyQ } zt
d|  W Y d}~dS d}~ww )zBCleanup JAX distributed resources when shutting down worker group.N)timeoutz"JAX distributed shutdown completedz)JAX distributed shutdown timed out after z= seconds. This may indicate workers are hung or unresponsive.r<   )r   r   execute_asyncrA   r   env_integerr	   r   rJ   r-   r5   debug
exceptionsGetTimeoutErrorr?   r>   )r   rB   rC   shutdown_futures	timeout_sr@   r   r   r   on_shutdown~   s$   


z_JaxBackend.on_shutdownN)r   r   r   r   r   rN   rW   r   r   r   r   r   e   s    r   )loggingr+   dataclassesr   rJ   ray._privater   ray.train._internal.utilsr    ray.train._internal.worker_groupr   ray.train.backendr   r   ray.train.constantsr   r	   ray.utilr
   	getLoggerr   r5   r   r&   intr   dictr;   rA   r   r   r   r   r   <module>   s:    
	
9