o
    `۷i;E                     @   s`  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlZddlZddlZddlmZ ddlZddlmZ dd Zdd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z	dBdd Z e!d!kre \Z"Z#Z$Z%Z&Z'ee"Z"ee" e(d"e"  e(d#e#  e(d$e$  e(d%e%  e)e"* Z+e+d& d' e,e-e  e+d&< e(d( e(d)e'  e'ree+e' e(d( e(d*e&  ee&Z.e(d+e.  e.ree+e. e+/d,i /d-Z0d.e+d, d/< e0d0kre  nje0d1kre  e+d, 1d2d nYe0d3kr]e  ej2g d4ej3d5d6j45d76 Z7e(d8e7 ej2g d9ej3d5d6j45d76 Z8e(d:e8 d; e8e+d, d<< ne0d=krke(d( e(d> ne(d( e(d? e9d ej:d@dA%Z;e;<e=e+>d7 e;?  ee;j@Z"e e+e"e#e$e% W d   dS 1 sw   Y  dS dS )Ca  
This script automates the process of launching and verifying a Ray cluster using a given
cluster configuration file. It also handles cluster cleanup before and after the
verification process. The script requires one command-line argument: the path to the
cluster configuration file.

Usage:
    python launch_and_verify_cluster.py [--no-config-cache] [--retries NUM_RETRIES]
        [--num-expected-nodes NUM_NODES] [--docker-override DOCKER_OVERRIDE]
        [--wheel-override WHEEL_OVERRIDE]
        <cluster_configuration_file_path>
    N)Path)storage)RAYc                  C   s   t jdd} | jdddd | jdtdd	d
 | jdtddd
 | jdg dddd | jdtddd
 | jdtdd |  }|jdkrN|jdkrNJ d|j|j	|j
|j|j|jfS )z
    Check command line arguments and return the cluster configuration file path, the
    number of retries, the number of expected nodes, and the value of the
    --no-config-cache flag.
    zLaunch and verify a Ray cluster)description--no-config-cache
store_truez3Pass the --no-config-cache flag to Ray CLI commands)actionhelpz	--retries   z;Number of retries for verifying Ray is running (default: 3))typedefaultr	   z--num-expected-nodes   z9Number of nodes for verifying Ray is running (default: 1)z--docker-override)disablelatestnightlycommitr   zAOverride the docker image used for the head node and worker nodes)choicesr   r	   z--wheel-override z:Override the wheel used for the head node and worker nodescluster_configz&Path to the cluster configuration file)r   r	   z%Cannot override both docker and wheel)argparseArgumentParseradd_argumentintstr
parse_argsdocker_overridewheel_overrider   retriesno_config_cachenum_expected_nodes)parserargs r"   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/autoscaler/launch_and_verify_cluster.pycheck_arguments!   sX   r$   c                 C   sj   | dkrdS | dkrdS | dkr3t dtjr&dtj dtjd	d
  dS tdtj  td d	S )a  
    Get the docker image to use for the head node and worker nodes.

    Args:
        docker_override: The value of the --docker-override flag.

    Returns:
        The docker image to use for the head node and worker nodes, or None if not
        applicable.
    r   zrayproject/ray:latest-py310r   zrayproject/ray:nightly-py310r   z^[0-9]+.[0-9]+.[0-9]+$zrayproject/ray:.N   z-py310zGError: docker image is only available for release version, but we get: r   )rematchray__version__
__commit__printsysexit)r   r"   r"   r#   get_docker_imageX   s   
r/   c                 C   s6   |   rt| tjstd|   td dS dS )z
    Check if the provided file path is valid and readable.

    Args:
        file_path: The path of the file to check.

    Raises:
        SystemExit: If the file is not readable or does not exist.
    z/Error: Cannot read cluster configuration file: r   N)is_fileosaccessR_OKr,   r-   r.   )	file_pathr"   r"   r#   
check_files   s   
r5   c                 C   s*   |  dg }|d| d || d< d S )Nsetup_commandsz9pip3 uninstall -y ray && pip3 install -U "ray[default] @ ")getappend)config_yaml	wheel_urlr6   r"   r"   r#   override_wheels_url   s
   
r<   c                 C   sT   |  di }||d< d|d< | dd u sJ d| dd u s$J d|| d< d S )	Ndockerimageray_containercontainer_name
head_imagezCannot override head_imageworker_imagezCannot override worker_image)r8   )r:   docker_imagedocker_configr"   r"   r#   override_docker_image   s   rE   c            
      C   s   t d t d tjddd} | jdd}|d }t|d	 }t|d
 }| jdd}|d }t }tj	
|d}t|d}	|	| W d   n1 sQw   Y  tdddd|d|d|g	 dS )zRGet Azure service principal credentials from AWS Secrets Manager and authenticate.&======================================z5Getting Azure credentials from AWS Secrets Manager...secretsmanager	us-west-2region_namez#azure-service-principal-oss-release)SecretIdSecretString	client_id	tenant_idz#azure-service-principal-certificatezazure_cert.pemwNazloginz--service-principalz
--usernamez--certificatez--tenant)r,   boto3clientget_secret_valuejsonloadstempfilemkdtempr1   pathjoinopenwrite
subprocess
check_call)
secrets_clientsecret_responsesecretrM   rN   cert_responsecerttmp_dir	cert_pathfr"   r"   r#   azure_authenticate   s<   rg   c                  C   s|   t d t d tjddd} d}d}tjd| }tjtj|s/ttj| | 	||| t
|d	 d
S )z=Download the ssh key from the S3 bucket to the local machine.rF   zDownloading ssh key...s3rH   rI   zaws-cluster-launcher-testzray-autoscaler_59_us-west-2.pem~/.ssh/   N)r,   rR   rS   r1   rY   
expanduserexistsdirnamemakedirsdownload_filechmod)	s3_clientbucket_namekey_namelocal_key_pathr"   r"   r#   download_ssh_key_aws   s   ru   c                  C   s   t d t d t } d}d}| |}||}tjd| }tjtj	|s6t
tj	| || t|d dS )zGDownload the ssh key from the google cloud bucket to the local machine.rF   zDownloading ssh key from GCP...z*gcp-cluster-launcher-release-test-ssh-keyszAray-autoscaler_gcp_us-west1_anyscale-bridge-cd812d38_ubuntu_0.pemri   rj   N)r,   r   Client
get_bucketget_blobr1   rY   rk   rl   rm   rn   download_to_filenamerp   )rS   rr   rs   bucketblobrt   r"   r"   r#   download_ssh_key_gcp   s   


r|   c                 C   s   t d t d d}d}t|D ]i}z#tj }|dd tjdddd	t|gd
d
|d t	|  W  dS  tj
yy } z7t d|d  d| d t |jd t  t d|jd  t d|jd  |}W Y d}~qd}~ww |)z
    Clean up the cluster using the given cluster configuration file.

    Args:
        cluster_config: The path of the cluster configuration file.
    rF   zCleaning up cluster...Nr
   
PYTHONPATHr)   down-v-yTcheckcapture_outputenvzray down fails[r   /z]: utf-8stdout:
stderr:
)r,   ranger1   environcopypopr]   runr   cleanup_security_groupsCalledProcessErroroutputdecode	traceback	print_excstdoutstderr)r:   r   
last_error	num_triesir   er"   r"   r#   cleanup_cluster   s4   
r   c              
   C   s   d}|dk rYz	| j |d W d S  tjjyR } z5|jd d dkr:d| }td| d	|  t| |d
7 }ntd|  W Y d }~d S W Y d }~nd }~ww |dk sd S d S )Nr   
   )GroupIdErrorCodeDependencyViolation   zWaiting zGs for the instance to be terminated before deleting the security group r   zError deleting security group: )delete_security_groupbotocore
exceptionsClientErrorresponser,   timesleep)
ec2_clientidretryr   
sleep_timer"   r"   r#   cleanup_security_group  s&   

r   c                  C   s  t d t d tjd} tjd}tj| r&tj|r&t d dS t d tj| }tj|s=tj|dd	 ztjd
ddddd| ddddgddd t d W dS  tj	y } z&t d t d|j
d  t d|jd  td W Y d}~dS d}~ww )zX
    Ensure that the SSH keys for Azure tests exist, and create them if they don't.
    rF   z Ensuring Azure SSH keys exist...z#~/.ssh/ray-autoscaler-tests-ssh-keyz'~/.ssh/ray-autoscaler-tests-ssh-key.pubzAzure SSH keys already exist.Nz.Azure SSH keys not found. Creating new keys...T)exist_okz
ssh-keygenz-trsaz-b4096z-fz-Nr   z-Czray-autoscaler-azure)r   r   z$Successfully created Azure SSH keys.zError creating SSH keys:r   r   r   r   )r,   r1   rY   rk   rl   rm   rn   r]   r   r   r   r   r   r-   r.   )private_key_pathpublic_key_pathssh_dirr   r"   r"   r#   ensure_ssh_keys_azure/  sH   r   c              
   C   s   |  di  d}|dkrd S z*tjddd}|jdtgdd	| d
 gdgd}|d D ]	}t||d  q-W d S  tyS } ztd|  W Y d }~d S d }~ww )Nproviderr   awsec2rH   rI   ztag-key)NameValuesztag:ray-cluster-namecluster_name)FiltersSecurityGroupsr   z#Error cleaning up security groups: )r8   rR   rS   describe_security_groupsr   r   	Exceptionr,   )configprovider_typer   r   security_groupr   r"   r"   r#   r   [  s*   r   r   c              
   C   s  |  di  d}|dkrt  td td g d}|r#|d |t| tj }|dd	 zt	j
|d
d
|d W n* t	jyj } zt|j td|jd  td|jd  |d	}~ww td td d}	d}
|
|k rz"dddt|dd| d g}|r|d t	j
|d
|d d
}	W n" t	jy   |
d7 }
td|
 d| d td Y nw |
|k s{|	std td| d t| | td td  td td td! t| | td td" d	S )#ak  
    Run the necessary Ray commands to start a cluster, verify Ray is running, and clean
    up the cluster.

    Args:
        cluster_config: The path of the cluster configuration file.
        retries: The number of retries for the verification step.
        no_config_cache: Whether to pass the --no-config-cache flag to the ray CLI
            commands.
    r   r   azurerF   zStarting new cluster...)r)   upr   r   r   r}   NTr   r   r   r   zVerifying Ray is running...Fr   r)   execr   z2python -c 'import ray; ray.init("localhost:6379");z assert len(ray.nodes()) >= ')r   r   r   z#Verification failed. Retry attempt z of z...<   z!Error: Verification failed after z0 attempts. Cleaning up cluster before exiting...zExiting script.zRay verification successful.z'Finished executing script successfully.)r8   rg   r,   r9   r   r1   r   r   r   r]   r   r   r   r   r   r   r   r   r   r-   r.   )r:   r   r   r   r   r   cmdr   r   successcountr"   r"   r#   run_ray_commandst  sv   









r   __main__z"Using cluster configuration file: z4Number of retries for 'verify ray is running' step: zUsing --no-config-cache flag: z6Number of expected nodes for 'verify ray is running': r   -rF   zOverriding ray wheel...: zOverriding docker image...: zUsing docker image: r   r   Fcache_stopped_nodesr   r   availability_zonegcp)gcloudr   	get-valueaccountT)r   r   r   zActive account email:)r   r   r   projectzInjecting GCP project 'z$' into cluster configuration file...
project_idvspherezVSPHERE provider detected.z-Provider type not recognized. Exiting script.z.yaml)suffix)r   )A__doc__r   rU   r1   r'   r]   r-   rW   r   r   pathlibr   rR   r   yamlgoogle.cloudr   r)   "ray.autoscaler._private.aws.configr   r$   r/   r5   r<   rE   rg   ru   r|   r   r   r   r   r   __name__r   r   r   r   r   r   r,   	safe_load	read_textr:   r   r   docker_override_imager8   r   r   r   PIPEr   r   stripaccount_emailr   r.   NamedTemporaryFiletempr\   dumpencodeflushnamer"   r"   r"   r#   <module>   s    7	--,

T





		





$