diff --git a/python/ray/autoscaler/aws/example-full.yaml b/python/ray/autoscaler/aws/example-full.yaml index 18d7b88ef5a70..f3d05695bf1ec 100644 --- a/python/ray/autoscaler/aws/example-full.yaml +++ b/python/ray/autoscaler/aws/example-full.yaml @@ -85,7 +85,7 @@ available_node_types: ray.worker.default: # The minimum number of worker nodes of this type to launch. # This number should be >= 0. - min_workers: 0 + min_workers: 1 # The maximum number of worker nodes of this type to launch. # This takes precedence over min_workers. max_workers: 2 diff --git a/python/ray/autoscaler/aws/tests/aws_cluster.yaml b/python/ray/autoscaler/aws/tests/aws_cluster.yaml index b226c723129c4..73a4ff87c1265 100644 --- a/python/ray/autoscaler/aws/tests/aws_cluster.yaml +++ b/python/ray/autoscaler/aws/tests/aws_cluster.yaml @@ -1,6 +1,6 @@ # An unique identifier for the head node and workers of this cluster. cluster_name: nightly-test-minimal -max_workers: 0 +max_workers: 1 idle_timeout_minutes: 2 # Cloud-provider specific configuration. 
@@ -14,3 +14,9 @@ available_node_types: resources: {} node_config: InstanceType: t3.large + ray.worker.default: + resources: {} + min_workers: 1 + max_workers: 1 + node_config: + InstanceType: t3.large diff --git a/python/ray/autoscaler/aws/tests/aws_config.yaml b/python/ray/autoscaler/aws/tests/aws_config.yaml index 028cc7ce2a32a..0d35ec3a3e976 100644 --- a/python/ray/autoscaler/aws/tests/aws_config.yaml +++ b/python/ray/autoscaler/aws/tests/aws_config.yaml @@ -1,5 +1,7 @@ base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }} debian_packages: [] +env_vars: + RAY_WHEEL_URL: {{ env["RAY_WHEELS"] | default("") }} python: pip_packages: [] diff --git a/python/ray/autoscaler/gcp/example-full.yaml b/python/ray/autoscaler/gcp/example-full.yaml index 97c1772cd597a..f5b30613aed95 100644 --- a/python/ray/autoscaler/gcp/example-full.yaml +++ b/python/ray/autoscaler/gcp/example-full.yaml @@ -86,7 +86,7 @@ available_node_types: ray_worker_small: # The minimum number of worker nodes of this type to launch. # This number should be >= 0. - min_workers: 0 + min_workers: 1 # The maximum number of worker nodes of this type to launch. # This takes precedence over min_workers. 
max_workers: 2 diff --git a/python/ray/autoscaler/gcp/tests/gce_config.yaml b/python/ray/autoscaler/gcp/tests/gce_config.yaml index 028cc7ce2a32a..0d35ec3a3e976 100644 --- a/python/ray/autoscaler/gcp/tests/gce_config.yaml +++ b/python/ray/autoscaler/gcp/tests/gce_config.yaml @@ -1,5 +1,7 @@ base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }} debian_packages: [] +env_vars: + RAY_WHEEL_URL: {{ env["RAY_WHEELS"] | default("") }} python: pip_packages: [] diff --git a/python/ray/autoscaler/launch_and_verify_cluster.py b/python/ray/autoscaler/launch_and_verify_cluster.py index 14be7ec65388a..6786283746776 100644 --- a/python/ray/autoscaler/launch_and_verify_cluster.py +++ b/python/ray/autoscaler/launch_and_verify_cluster.py @@ -6,14 +6,13 @@ Usage: python launch_and_verify_cluster.py [--no-config-cache] [--retries NUM_RETRIES] + [--num-expected-nodes NUM_NODES] [--docker-override DOCKER_OVERRIDE] + [--wheel-override WHEEL_OVERRIDE] - -Example: - python launch_and_verify_cluster.py --retries 5 --no-config-cache - /path/to/cluster_config.yaml """ import argparse import os +import re import subprocess import sys import tempfile @@ -23,15 +22,14 @@ import boto3 import yaml +import ray + def check_arguments(): """ Check command line arguments and return the cluster configuration file path, the - number of retries, and the value of the --no-config-cache flag. - - Returns: - A tuple containing the cluster config file path, the number of retries, and the - value of the --no-config-cache flag. + number of retries, the value of the --no-config-cache flag, the number of + expected nodes, and the docker and wheel override settings. 
""" parser = argparse.ArgumentParser(description="Launch and verify a Ray cluster") parser.add_argument( @@ -45,12 +43,68 @@ def check_arguments(): default=3, help="Number of retries for verifying Ray is running (default: 3)", ) + parser.add_argument( + "--num-expected-nodes", + type=int, + default=1, + help="Number of nodes for verifying Ray is running (default: 1)", + ) + parser.add_argument( + "--docker-override", + choices=["disable", "latest", "nightly", "commit"], + default="disable", + help="Override the docker image used for the head node and worker nodes", + ) + parser.add_argument( + "--wheel-override", + type=str, + default="", + help="Override the wheel used for the head node and worker nodes", + ) parser.add_argument( "cluster_config", type=str, help="Path to the cluster configuration file" ) args = parser.parse_args() - return args.cluster_config, args.retries, args.no_config_cache + assert not ( + args.docker_override != "disable" and args.wheel_override != "" + ), "Cannot override both docker and wheel" + + return ( + args.cluster_config, + args.retries, + args.no_config_cache, + args.num_expected_nodes, + args.docker_override, + args.wheel_override, + ) + + +def get_docker_image(docker_override): + """ + Get the docker image to use for the head node and worker nodes. + + Args: + docker_override: The value of the --docker-override flag. + + Returns: + The docker image to use for the head node and worker nodes, or None if not + applicable. 
+ """ + if docker_override == "latest": + return "rayproject/ray:latest-py38" + elif docker_override == "nightly": + return "rayproject/ray:nightly-py38" + elif docker_override == "commit": + if re.match("^[0-9]+.[0-9]+.[0-9]+$", ray.__version__): + return f"rayproject/ray:{ray.__version__}.{ray.__commit__[:6]}-py38" + else: + print( + "Error: docker image is only available for " + f"release version, but we get: {ray.__version__}" + ) + sys.exit(1) + return None def check_file(file_path): @@ -68,6 +122,23 @@ def check_file(file_path): sys.exit(1) +def override_wheels_url(config_yaml, wheel_url): + setup_commands = config_yaml.get("setup_commands", []) + setup_commands.append( + f'pip3 uninstall -y ray && pip3 install -U "ray[default] @ {wheel_url}"' + ) + config_yaml["setup_commands"] = setup_commands + + +def override_docker_image(config_yaml, docker_image): + docker_config = config_yaml.get("docker", {}) + docker_config["image"] = docker_image + docker_config["container_name"] = "ray_container" + assert docker_config.get("head_image") is None, "Cannot override head_image" + assert docker_config.get("worker_image") is None, "Cannot override worker_image" + config_yaml["docker"] = docker_config + + def download_ssh_key(): """Download the ssh key from the S3 bucket to the local machine.""" print("======================================") @@ -101,7 +172,7 @@ def cleanup_cluster(cluster_config): subprocess.run(["ray", "down", "-v", "-y", str(cluster_config)], check=True) -def run_ray_commands(cluster_config, retries, no_config_cache): +def run_ray_commands(cluster_config, retries, no_config_cache, num_expected_nodes=1): """ Run the necessary Ray commands to start a cluster, verify Ray is running, and clean up the cluster. 
@@ -135,7 +206,10 @@ def run_ray_commands(cluster_config, retries, no_config_cache): "exec", "-v", str(cluster_config), - "python -c 'import ray; ray.init(\"localhost:6379\")'", + ( + 'python -c \'import ray; ray.init("localhost:6379");' + + f" assert len(ray.nodes()) >= {num_expected_nodes}'" + ), ] if no_config_cache: cmd.append("--no-config-cache") @@ -145,7 +219,7 @@ def run_ray_commands(cluster_config, retries, no_config_cache): except subprocess.CalledProcessError: count += 1 print(f"Verification failed. Retry attempt {count} of {retries}...") - time.sleep(5) + time.sleep(60) if not success: print("======================================") @@ -168,19 +242,40 @@ def run_ray_commands(cluster_config, retries, no_config_cache): if __name__ == "__main__": - cluster_config, retries, no_config_cache = check_arguments() + ( + cluster_config, + retries, + no_config_cache, + num_expected_nodes, + docker_override, + wheel_override, + ) = check_arguments() cluster_config = Path(cluster_config) check_file(cluster_config) print(f"Using cluster configuration file: {cluster_config}") print(f"Number of retries for 'verify ray is running' step: {retries}") print(f"Using --no-config-cache flag: {no_config_cache}") + print(f"Number of expected nodes for 'verify ray is running': {num_expected_nodes}") config_yaml = yaml.safe_load(cluster_config.read_text()) # Make the cluster name unique config_yaml["cluster_name"] = ( config_yaml["cluster_name"] + "-" + str(int(time.time())) ) + + print("======================================") + print(f"Overriding ray wheel...: {wheel_override}") + if wheel_override: + override_wheels_url(config_yaml, wheel_override) + + print("======================================") + print(f"Overriding docker image...: {docker_override}") + docker_override_image = get_docker_image(docker_override) + print(f"Using docker image: {docker_override_image}") + if docker_override_image: + override_docker_image(config_yaml, docker_override_image) + provider_type = 
config_yaml.get("provider", {}).get("type") if provider_type == "aws": download_ssh_key() @@ -222,4 +317,4 @@ def run_ray_commands(cluster_config, retries, no_config_cache): temp.write(yaml.dump(config_yaml).encode("utf-8")) temp.flush() cluster_config = Path(temp.name) - run_ray_commands(cluster_config, retries, no_config_cache) + run_ray_commands(cluster_config, retries, no_config_cache, num_expected_nodes) diff --git a/release/release_tests.yaml b/release/release_tests.yaml index a4dd04d630d3c..51da1e490736c 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -6910,8 +6910,61 @@ cluster_compute: aws/tests/aws_compute.yaml run: - timeout: 1200 - script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml + timeout: 2400 + script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 + + +- name: aws_cluster_launcher_nightly_image + group: cluster-launcher-test + working_dir: ../python/ray/autoscaler/ + + frequency: nightly + team: core + python: "3.8" + cluster: + byod: {} + cluster_env: aws/tests/aws_config.yaml + cluster_compute: aws/tests/aws_compute.yaml + + run: + timeout: 2400 + script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override nightly + + +- name: aws_cluster_launcher_latest_image + group: cluster-launcher-test + working_dir: ../python/ray/autoscaler/ + + frequency: nightly + team: core + python: "3.8" + cluster: + byod: {} + cluster_env: aws/tests/aws_config.yaml + cluster_compute: aws/tests/aws_compute.yaml + + run: + timeout: 2400 + script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override latest + + +- name: aws_cluster_launcher_release_image + group: cluster-launcher-test + working_dir: ../python/ray/autoscaler/ + + frequency: manual + team: core + python: "3.8" + cluster: + byod: {} + cluster_env: aws/tests/aws_config.yaml + 
cluster_compute: aws/tests/aws_compute.yaml + + run: + timeout: 2400 + script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override commit + + - name: aws_cluster_launcher_minimal group: cluster-launcher-test @@ -6942,8 +6995,8 @@ cluster_compute: aws/tests/aws_compute.yaml run: - timeout: 1200 - script: python launch_and_verify_cluster.py aws/example-full.yaml + timeout: 3000 + script: python launch_and_verify_cluster.py aws/example-full.yaml --num-expected-nodes 2 --retries 20 - name: gcp_cluster_launcher_minimal group: cluster-launcher-test @@ -6976,5 +7029,57 @@ cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml run: - timeout: 2400 - script: python launch_and_verify_cluster.py gcp/example-full.yaml + timeout: 3600 + script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 + +- name: gcp_cluster_launcher_latest_image + group: cluster-launcher-test + working_dir: ../python/ray/autoscaler/ + + stable: true + + env: gce + frequency: nightly + team: core + cluster: + cluster_env: gcp/tests/gce_config.yaml + cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml + + run: + timeout: 3600 + script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override nightly + +- name: gcp_cluster_launcher_nightly_image + group: cluster-launcher-test + working_dir: ../python/ray/autoscaler/ + + stable: true + + env: gce + frequency: nightly + team: core + cluster: + cluster_env: gcp/tests/gce_config.yaml + cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml + + run: + timeout: 3600 + script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest + + +- name: gcp_cluster_launcher_release_image + group: cluster-launcher-test + working_dir: ../python/ray/autoscaler/ + + stable: true + + env: gce + frequency: manual + team: core + cluster: + cluster_env: 
gcp/tests/gce_config.yaml + cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml + + run: + timeout: 3600 + script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override commit