o
    bi8                     @   s&  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlZddlZddlZddlmZ ddlZddlmZ dd Zdd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z	d<ddZedkre \ZZ Z!Z"Z#Z$e
eZee e%de  e%de   e%d e!  e%d!e"  e&e' Z(e(d" d# e)e*e  e(d"< e%d$ e%d%e$  e$ree(e$ e%d$ e%d&e#  ee#Z+e%d'e+  e+ree(e+ e(,d(i ,d)Z-d*e(d( d+< e-d,kre  nYe-d-kr@e  ej.g d.ej/d/d0j01d12 Z3e%d2e3 ej.g d3ej/d/d0j01d12 Z4e%d4e4 d5 e4e(d( d6< ne-d7krNe%d$ e%d8 ne%d$ e%d9 e5d ej6d:d;%Z7e78e9e(:d1 e7;  e
e7j<Zee(ee e!e" W d   dS 1 sw   Y  dS dS )=a  
This script automates the process of launching and verifying a Ray cluster using a given
cluster configuration file. It also handles cluster cleanup before and after the
verification process. The script requires one command-line argument: the path to the
cluster configuration file.

Usage:
    python launch_and_verify_cluster.py [--no-config-cache] [--retries NUM_RETRIES]
        [--num-expected-nodes NUM_NODES] [--docker-override DOCKER_OVERRIDE]
        [--wheel-override WHEEL_OVERRIDE]
        <cluster_configuration_file_path>
    N)Path)storage)RAYc                  C   s   t jdd} | jdddd | jdtdd	d
 | jdtddd
 | jdg dddd | jdtddd
 | jdtdd |  }|jdkrN|jdkrNJ d|j|j	|j
|j|j|jfS )z
    Check command line arguments and return the cluster configuration file path, the
    number of retries, the number of expected nodes, and the value of the
    --no-config-cache flag.
    zLaunch and verify a Ray cluster)description--no-config-cache
store_truez3Pass the --no-config-cache flag to Ray CLI commands)actionhelpz	--retries   z;Number of retries for verifying Ray is running (default: 3))typedefaultr	   z--num-expected-nodes   z9Number of nodes for verifying Ray is running (default: 1)z--docker-override)disablelatestnightlycommitr   zAOverride the docker image used for the head node and worker nodes)choicesr   r	   z--wheel-override z:Override the wheel used for the head node and worker nodescluster_configz&Path to the cluster configuration file)r   r	   z%Cannot override both docker and wheel)argparseArgumentParseradd_argumentintstr
parse_argsdocker_overridewheel_overrider   retriesno_config_cachenum_expected_nodes)parserargs r"   \/home/ubuntu/.local/lib/python3.10/site-packages/ray/autoscaler/launch_and_verify_cluster.pycheck_arguments    sX   r$   c                 C   sj   | dkrdS | dkrdS | dkr3t dtjr&dtj dtjd	d
  dS tdtj  td d	S )a  
    Get the docker image to use for the head node and worker nodes.

    Args:
        docker_override: The value of the --docker-override flag.

    Returns:
        The docker image to use for the head node and worker nodes, or None if not
        applicable.
    r   zrayproject/ray:latest-py39r   zrayproject/ray:nightly-py39r   z^[0-9]+.[0-9]+.[0-9]+$zrayproject/ray:.N   z-py39zGError: docker image is only available for release version, but we get: r   )rematchray__version__
__commit__printsysexit)r   r"   r"   r#   get_docker_imageW   s   
r/   c                 C   s6   |   rt| tjstd|   td dS dS )z
    Check if the provided file path is valid and readable.

    Args:
        file_path: The path of the file to check.

    Raises:
        SystemExit: If the file is not readable or does not exist.
    z/Error: Cannot read cluster configuration file: r   N)is_fileosaccessR_OKr,   r-   r.   )	file_pathr"   r"   r#   
check_filer   s   
r5   c                 C   s*   |  dg }|d| d || d< d S )Nsetup_commandsz9pip3 uninstall -y ray && pip3 install -U "ray[default] @ ")getappend)config_yaml	wheel_urlr6   r"   r"   r#   override_wheels_url   s
   
r<   c                 C   sT   |  di }||d< d|d< | dd u sJ d| dd u s$J d|| d< d S )	Ndockerimageray_containercontainer_name
head_imagezCannot override head_imageworker_imagezCannot override worker_image)r8   )r:   docker_imagedocker_configr"   r"   r#   override_docker_image   s   rE   c                  C   s|   t d t d tjddd} d}d}tjd| }tjtj|s/ttj| | 	||| t
|d	 d
S )z=Download the ssh key from the S3 bucket to the local machine.&======================================zDownloading ssh key...s3	us-west-2region_namezaws-cluster-launcher-testzray-autoscaler_59_us-west-2.pem~/.ssh/   N)r,   boto3clientr1   path
expanduserexistsdirnamemakedirsdownload_filechmod)	s3_clientbucket_namekey_namelocal_key_pathr"   r"   r#   download_ssh_key_aws   s   rZ   c                  C   s   t d t d t } d}d}| |}||}tjd| }tjtj	|s6t
tj	| || t|d dS )zGDownload the ssh key from the google cloud bucket to the local machine.rF   zDownloading ssh key from GCP...z*gcp-cluster-launcher-release-test-ssh-keyszAray-autoscaler_gcp_us-west1_anyscale-bridge-cd812d38_ubuntu_0.pemrK   rL   N)r,   r   Client
get_bucketget_blobr1   rO   rP   rQ   rR   rS   download_to_filenamerU   )rN   rW   rX   bucketblobrY   r"   r"   r#   download_ssh_key_gcp   s   


ra   c                 C   s   t d t d d}d}t|D ]]}ztjddddt|gd	d	d
 t|  W  dS  tjym } z7t d|d  d| d t |jd t	
  t d|jd  t d|jd  |}W Y d}~qd}~ww |)z
    Clean up the cluster using the given cluster configuration file.

    Args:
        cluster_config: The path of the cluster configuration file.
    rF   zCleaning up cluster...Nr
   r)   down-v-yTcheckcapture_outputzray down fails[r   /z]: utf-8stdout:
stderr:
)r,   range
subprocessrunr   cleanup_security_groupsCalledProcessErroroutputdecode	traceback	print_excstdoutstderr)r:   r   
last_error	num_triesier"   r"   r#   cleanup_cluster   s.   r{   c              
   C   s   d}|dk rYz	| j |d W d S  tjjyR } z5|jd d dkr:d| }td| d	|  t| |d
7 }ntd|  W Y d }~d S W Y d }~nd }~ww |dk sd S d S )Nr   
   )GroupIdErrorCodeDependencyViolation   zWaiting zGs for the instance to be terminated before deleting the security group r   zError deleting security group: )delete_security_groupbotocore
exceptionsClientErrorresponser,   timesleep)
ec2_clientidretryrz   
sleep_timer"   r"   r#   cleanup_security_group   s&   

r   c              
   C   s   |  di  d}|dkrd S z*tjddd}|jdtgdd	| d
 gdgd}|d D ]	}t||d  q-W d S  tyS } ztd|  W Y d }~d S d }~ww )Nproviderr   awsec2rH   rI   ztag-key)NameValuesztag:ray-cluster-namecluster_name)FiltersSecurityGroupsr}   z#Error cleaning up security groups: )r8   rM   rN   describe_security_groupsr   r   	Exceptionr,   )configprovider_typer   r   security_grouprz   r"   r"   r#   ro      s*   ro   r   c           	   
   C   s  t d t d g d}|r|d |t| t d| z
tj|ddd W n* tjyU } zt |j t d|j	d	  t d
|j
	d	  |d}~ww t d t d d}d}||k rz!dddt|dd| d g}|r}|d tj|dd d}W n" tjy   |d7 }t d| d| d td Y nw ||k sf|st d t d| d t| | t d t d td t d t d t| | t d t d dS ) ak  
    Run the necessary Ray commands to start a cluster, verify Ray is running, and clean
    up the cluster.

    Args:
        cluster_config: The path of the cluster configuration file.
        retries: The number of retries for the verification step.
        no_config_cache: Whether to pass the --no-config-cache flag to the ray CLI
            commands.
    rF   zStarting new cluster...)r)   uprc   rd   r    Tre   rj   ri   rk   NzVerifying Ray is running...Fr   r)   execrc   z2python -c 'import ray; ray.init("localhost:6379");z assert len(ray.nodes()) >= ')rf   r   z#Verification failed. Retry attempt z of z...<   z!Error: Verification failed after z0 attempts. Cleaning up cluster before exiting...zExiting script.zRay verification successful.z'Finished executing script successfully.)r,   r9   r   joinrm   rn   rp   rq   ru   rr   rv   r   r   r{   r-   r.   )	r:   r   r   r   r   cmdrz   successcountr"   r"   r#   run_ray_commands  sn   








r   __main__z"Using cluster configuration file: z4Number of retries for 'verify ray is running' step: zUsing --no-config-cache flag: z6Number of expected nodes for 'verify ray is running': r   -rF   zOverriding ray wheel...: zOverriding docker image...: zUsing docker image: r   r   Fcache_stopped_nodesr   gcp)gcloudr   	get-valueaccountT)ru   rf   ri   zActive account email:)r   r   r   projectzInjecting GCP project 'z$' into cluster configuration file...
project_idvspherezVSPHERE provider detected.z-Provider type not recognized. Exiting script.z.yaml)suffix)r   )=__doc__r   r1   r'   rm   r-   tempfiler   rs   pathlibr   rM   r   yamlgoogle.cloudr   r)   "ray.autoscaler._private.aws.configr   r$   r/   r5   r<   rE   rZ   ra   r{   r   ro   r   __name__r   r   r   r   r   r   r,   	safe_load	read_textr:   r   r   docker_override_imager8   r   rn   PIPEru   rr   stripaccount_emailr   r.   NamedTemporaryFiletempwritedumpencodeflushnamer"   r"   r"   r#   <module>   s    7	*

O




		





$