import logging
import os
import re
import shutil
import uuid
from dataclasses import dataclass

import ray
from ray.train._internal.base_worker_group import BaseWorkerGroup
from ray.train._internal.utils import get_address_and_port
from ray.train.backend import Backend
from ray.train.torch import TorchConfig
from ray.util import PublicAPI

logger = logging.getLogger(__name__)


@PublicAPI(stability="alpha")
@dataclass
class TorchXLAConfig(TorchConfig):
    """
    Configuration for torch XLA setup.
    See https://pytorch.org/xla/release/1.13/index.html for more info.
    Currently, only "neuron_cores" accelerator (AwsNeuronXLABackend)
    is supported with xrt runtime.
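
    If ``neuron_parallel_compile`` is True, the backend extracts the XLA
    graphs during training and compiles them with the Neuron parallel
    compiler on shutdown.

    Example (an illustrative sketch, not part of the original module; the
    trainer wiring, worker count, and ``neuron_cores`` resource numbers are
    assumptions for a Trainium cluster):

    .. code-block:: python

        from ray.train import ScalingConfig
        from ray.train.torch import TorchTrainer
        from ray.train.torch.xla import TorchXLAConfig

        def train_loop_per_worker():
            ...  # torch_xla training code runs here on each worker

        trainer = TorchTrainer(
            train_loop_per_worker,
            torch_config=TorchXLAConfig(),
            scaling_config=ScalingConfig(
                num_workers=2, resources_per_worker={"neuron_cores": 2}
            ),
        )
        trainer.fit()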
    Fneuron_parallel_compilec                 C   s   t S )N)_TorchAwsNeuronXLABackendself r   W/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/train/torch/xla/config.pybackend_cls   s   zTorchXLAConfig.backend_clsN)	__name__
__module____qualname____doc__r   bool__annotations__propertyr   r   r   r   r   r
      s
   
 r
   c                  C   s   dd l } | g d d S )Nr   )pkillz-fxrt_run_server)
subprocesscall)r   r   r   r   _kill_xrt_server#   s   r   c                  C   s   t j } t|  tjd< t|  tjd< t|  tjd< t| 	 tjd< t| 
 tjd< t| 	 |   tjd< t|  tjd< t|  tjd< t| 	 tjd	< d
tjd< dtjd< dtjd< dtjd< dtjd< d S )N
LOCAL_RANKRANKLOCAL_WORLD_SIZE
WORLD_SIZE
GROUP_RANKGROUP_WORLD_SIZE	ROLE_RANKROLE_WORLD_RANKROLE_WORLD_SIZEefaFI_PROVIDER1FI_EFA_USE_DEVICE_RDMAFI_EFA_FORK_SAFEXLA_TRANSFER_SEED_ASYNCNCCL_ASYNC_ERROR_HANDLING)raytrainget_contextstrget_local_rankosenvironget_world_rankget_local_world_sizeget_world_sizeget_node_rank)contextr   r   r   _set_xla_env_vars)   s"   





r:   c                  C   sN   zdd l m}  dd lm  m} dd l}| d W d S  ty&   tdw )Nr   xlaz5torch_xla must be installed to use torch_xla backend.)torch.distributeddistributedtorch_xla.core.xla_modelcore	xla_model!torch_xla.distributed.xla_backendinit_process_groupImportError)distxm	torch_xlar   r   r   _setup_xla_torch_process_groupC   s   rG   c                   C   s"   dt jd< dt jd< dt jd< d S )Nr)   NEURON_PARALLEL_COMPILENEURON_EXTRACT_GRAPHS_ONLYNEURON_FALL_BACK_TO_NULL_NEFF)r3   r4   r   r   r   r   %_set_neuron_parallel_compile_env_varsP   s   

rK   c                  C   s   zddl m}  ddlm} W n ty   tdw tjddkret	d dtjd	d
 d}tj
|r=t| tj|dd d }tjd }r[td| }r[|d}||| | d S d S )Nr   )CacheUrl)parallel_compilezBlibneuronxla must be installed to use Neuron parallel compilation.r   0z0Compiling extracted graphs on local rank0 workerz/tmp/USERzno-userz/parallel_compile_workdir/T)exist_okNEURON_CC_FLAGSz--cache_dir[= ](\S+)   )libneuronxla.neuron_cc_cacherL   $libneuronxla.neuron_parallel_compilerM   rC   r3   r4   getloggerinfopathexistsshutilrmtreemakedirsresearchgroupget_cache_url)rL   rM   parallel_compile_workdirexplicit_cache_dirneuron_cc_flagssr   r   r    _neuron_compile_extracted_graphsW   s0   


re   c                   @   sX   e Zd ZU ee Zeed< dede	fddZ
dede	fddZdede	fdd	Zd
S )r   unique_run_idworker_groupbackend_configc                    sX   | t |dt\}} fdd}|j |||d |jr*td | t dS dS )z+Logic ran right before training is started.r   c                    s(   | t jd< t|t jd<  jt jd< d S )NMASTER_ADDRMASTER_PORTTORCHELASTIC_RUN_ID)r3   r4   r1   rf   addrportr   r   r   set_env_vars   s   
z8_TorchAwsNeuronXLABackend.on_start.<locals>.set_env_varsrl   z1Extracting graphs for Neuron parallel compilationN)executer   execute_singler   r   rV   rW   rK   )r   rg   rh   master_addrmaster_portro   r   r   r   on_start{   s   

z"_TorchAwsNeuronXLABackend.on_startc                 C   s   | t | t dS )z
        Configure the environment variables for the worker group
        and initialize the XLA distributed process group.
        TODO: The current setup only supports a homogeneous cluster with
        the neuron_cores accelerator and xrt runtime.
        """
        worker_group.execute(_set_xla_env_vars)
        worker_group.execute(_setup_xla_torch_process_group)

    def on_shutdown(
        self, worker_group: BaseWorkerGroup, backend_config: TorchXLAConfig
    ):
        """
        Logic run right after training is finished.
        This is a sanity cleanup: kill the xrt server and, optionally,
        run Neuron parallel graph compilation.
        """
        worker_group.execute(_kill_xrt_server)
        if backend_config.neuron_parallel_compile:
            worker_group.execute(_neuron_compile_extracted_graphs)