Skip to content

Commit

Permalink
[CI][cluster-launcher] release test that covers the autoscaling case (ra…
Browse files Browse the repository at this point in the history
…y-project#37680)

Why are these changes needed?
Add a release test that checks both that the cluster is alive and that the number of (autoscaled) nodes matches expectations.
  • Loading branch information
scv119 committed Jul 23, 2023
1 parent 5495176 commit c7f5440
Show file tree
Hide file tree
Showing 7 changed files with 234 additions and 24 deletions.
2 changes: 1 addition & 1 deletion python/ray/autoscaler/aws/example-full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ available_node_types:
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
min_workers: 1
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
Expand Down
8 changes: 7 additions & 1 deletion python/ray/autoscaler/aws/tests/aws_cluster.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: nightly-test-minimal
max_workers: 0
max_workers: 1
idle_timeout_minutes: 2

# Cloud-provider specific configuration.
Expand All @@ -14,3 +14,9 @@ available_node_types:
resources: {}
node_config:
InstanceType: t3.large
ray.worker.default:
resources: {}
min_workers: 1
max_workers: 1
node_config:
InstanceType: t3.large
2 changes: 2 additions & 0 deletions python/ray/autoscaler/aws/tests/aws_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}
debian_packages: []
env_vars:
RAY_WHEEL_URL: {{ env["RAY_WHEELS"] | default("") }}

python:
pip_packages: []
Expand Down
2 changes: 1 addition & 1 deletion python/ray/autoscaler/gcp/example-full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ available_node_types:
ray_worker_small:
# The minimum number of worker nodes of this type to launch.
# This number should be >= 0.
min_workers: 0
min_workers: 1
# The maximum number of worker nodes of this type to launch.
# This takes precedence over min_workers.
max_workers: 2
Expand Down
2 changes: 2 additions & 0 deletions python/ray/autoscaler/gcp/tests/gce_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }}
debian_packages: []
env_vars:
RAY_WHEEL_URL: {{ env["RAY_WHEELS"] | default("") }}

python:
pip_packages: []
Expand Down
125 changes: 110 additions & 15 deletions python/ray/autoscaler/launch_and_verify_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@
Usage:
python launch_and_verify_cluster.py [--no-config-cache] [--retries NUM_RETRIES]
[--num-expected-nodes NUM_NODES] [--docker-override DOCKER_OVERRIDE]
[--wheel-override WHEEL_OVERRIDE]
<cluster_configuration_file_path>
Example:
python launch_and_verify_cluster.py --retries 5 --no-config-cache
/path/to/cluster_config.yaml
"""
import argparse
import os
import re
import subprocess
import sys
import tempfile
Expand All @@ -23,15 +22,14 @@
import boto3
import yaml

import ray


def check_arguments():
"""
Check command line arguments and return the cluster configuration file path, the
number of retries, and the value of the --no-config-cache flag.
Returns:
A tuple containing the cluster config file path, the number of retries, and the
value of the --no-config-cache flag.
number of retries, the number of expected nodes, and the value of the
--no-config-cache flag.
"""
parser = argparse.ArgumentParser(description="Launch and verify a Ray cluster")
parser.add_argument(
Expand All @@ -45,12 +43,68 @@ def check_arguments():
default=3,
help="Number of retries for verifying Ray is running (default: 3)",
)
parser.add_argument(
"--num-expected-nodes",
type=int,
default=1,
help="Number of nodes for verifying Ray is running (default: 1)",
)
parser.add_argument(
"--docker-override",
choices=["disable", "latest", "nightly", "commit"],
default="disable",
help="Override the docker image used for the head node and worker nodes",
)
parser.add_argument(
"--wheel-override",
type=str,
default="",
help="Override the wheel used for the head node and worker nodes",
)
parser.add_argument(
"cluster_config", type=str, help="Path to the cluster configuration file"
)
args = parser.parse_args()

return args.cluster_config, args.retries, args.no_config_cache
assert not (
args.docker_override != "disable" and args.wheel_override != ""
), "Cannot override both docker and wheel"

return (
args.cluster_config,
args.retries,
args.no_config_cache,
args.num_expected_nodes,
args.docker_override,
args.wheel_override,
)


def get_docker_image(docker_override):
    """
    Get the docker image to use for the head node and worker nodes.

    Args:
        docker_override: The value of the --docker-override flag; one of
            "disable", "latest", "nightly", or "commit".

    Returns:
        The docker image to use for the head node and worker nodes, or None
        if no override is requested ("disable" or any unrecognized value).

    Raises:
        SystemExit: if "commit" is requested but the installed Ray is not a
            release version (docker images are only published for releases).
    """
    if docker_override == "latest":
        return "rayproject/ray:latest-py38"
    elif docker_override == "nightly":
        return "rayproject/ray:nightly-py38"
    elif docker_override == "commit":
        # Raw string with escaped dots: an unescaped "." matches any
        # character, which would wrongly accept strings like "1a2b3c4".
        if re.match(r"^[0-9]+\.[0-9]+\.[0-9]+$", ray.__version__):
            return f"rayproject/ray:{ray.__version__}.{ray.__commit__[:6]}-py38"
        else:
            print(
                "Error: docker image is only available for "
                f"release version, but we get: {ray.__version__}"
            )
            sys.exit(1)
    return None


def check_file(file_path):
Expand All @@ -68,6 +122,23 @@ def check_file(file_path):
sys.exit(1)


def override_wheels_url(config_yaml, wheel_url):
    """Append a setup command that reinstalls Ray from *wheel_url*.

    Mutates *config_yaml* in place: the command is appended to its
    ``setup_commands`` list (creating the list if absent), so it runs on
    every node after the base image is set up.
    """
    reinstall_cmd = (
        f'pip3 uninstall -y ray && pip3 install -U "ray[default] @ {wheel_url}"'
    )
    commands = config_yaml.get("setup_commands", [])
    commands.append(reinstall_cmd)
    config_yaml["setup_commands"] = commands


def override_docker_image(config_yaml, docker_image):
    """Point the cluster config's docker section at *docker_image*.

    Mutates *config_yaml* in place, setting the shared ``image`` and a fixed
    ``container_name``. Refuses (via assert) configs that already declare
    per-role images (``head_image`` / ``worker_image``), since a single
    override cannot express those.
    """
    docker_section = config_yaml.get("docker", {})
    docker_section["image"] = docker_image
    docker_section["container_name"] = "ray_container"
    assert docker_section.get("head_image") is None, "Cannot override head_image"
    assert docker_section.get("worker_image") is None, "Cannot override worker_image"
    config_yaml["docker"] = docker_section


def download_ssh_key():
"""Download the ssh key from the S3 bucket to the local machine."""
print("======================================")
Expand Down Expand Up @@ -101,7 +172,7 @@ def cleanup_cluster(cluster_config):
subprocess.run(["ray", "down", "-v", "-y", str(cluster_config)], check=True)


def run_ray_commands(cluster_config, retries, no_config_cache):
def run_ray_commands(cluster_config, retries, no_config_cache, num_expected_nodes=1):
"""
Run the necessary Ray commands to start a cluster, verify Ray is running, and clean
up the cluster.
Expand Down Expand Up @@ -135,7 +206,10 @@ def run_ray_commands(cluster_config, retries, no_config_cache):
"exec",
"-v",
str(cluster_config),
"python -c 'import ray; ray.init(\"localhost:6379\")'",
(
'python -c \'import ray; ray.init("localhost:6379");'
+ f" assert len(ray.nodes()) >= {num_expected_nodes}'"
),
]
if no_config_cache:
cmd.append("--no-config-cache")
Expand All @@ -145,7 +219,7 @@ def run_ray_commands(cluster_config, retries, no_config_cache):
except subprocess.CalledProcessError:
count += 1
print(f"Verification failed. Retry attempt {count} of {retries}...")
time.sleep(5)
time.sleep(60)

if not success:
print("======================================")
Expand All @@ -168,19 +242,40 @@ def run_ray_commands(cluster_config, retries, no_config_cache):


if __name__ == "__main__":
cluster_config, retries, no_config_cache = check_arguments()
(
cluster_config,
retries,
no_config_cache,
num_expected_nodes,
docker_override,
wheel_override,
) = check_arguments()
cluster_config = Path(cluster_config)
check_file(cluster_config)

print(f"Using cluster configuration file: {cluster_config}")
print(f"Number of retries for 'verify ray is running' step: {retries}")
print(f"Using --no-config-cache flag: {no_config_cache}")
print(f"Number of expected nodes for 'verify ray is running': {num_expected_nodes}")

config_yaml = yaml.safe_load(cluster_config.read_text())
# Make the cluster name unique
config_yaml["cluster_name"] = (
config_yaml["cluster_name"] + "-" + str(int(time.time()))
)

print("======================================")
print(f"Overriding ray wheel...: {wheel_override}")
if wheel_override:
override_wheels_url(config_yaml, wheel_override)

print("======================================")
print(f"Overriding docker image...: {docker_override}")
docker_override_image = get_docker_image(docker_override)
print(f"Using docker image: {docker_override_image}")
if docker_override_image:
override_docker_image(config_yaml, docker_override_image)

provider_type = config_yaml.get("provider", {}).get("type")
if provider_type == "aws":
download_ssh_key()
Expand Down Expand Up @@ -222,4 +317,4 @@ def run_ray_commands(cluster_config, retries, no_config_cache):
temp.write(yaml.dump(config_yaml).encode("utf-8"))
temp.flush()
cluster_config = Path(temp.name)
run_ray_commands(cluster_config, retries, no_config_cache)
run_ray_commands(cluster_config, retries, no_config_cache, num_expected_nodes)
117 changes: 111 additions & 6 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6910,8 +6910,61 @@
cluster_compute: aws/tests/aws_compute.yaml

run:
timeout: 1200
script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml
timeout: 2400
script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10


- name: aws_cluster_launcher_nightly_image
group: cluster-launcher-test
working_dir: ../python/ray/autoscaler/

frequency: nightly
team: core
python: "3.8"
cluster:
byod: {}
cluster_env: aws/tests/aws_config.yaml
cluster_compute: aws/tests/aws_compute.yaml

run:
timeout: 2400
script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override nightly


- name: aws_cluster_launcher_latest_image
group: cluster-launcher-test
working_dir: ../python/ray/autoscaler/

frequency: nightly
team: core
python: "3.8"
cluster:
byod: {}
cluster_env: aws/tests/aws_config.yaml
cluster_compute: aws/tests/aws_compute.yaml

run:
timeout: 2400
script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override latest


- name: aws_cluster_launcher_release_image
group: cluster-launcher-test
working_dir: ../python/ray/autoscaler/

frequency: manual
team: core
python: "3.8"
cluster:
byod: {}
cluster_env: aws/tests/aws_config.yaml
cluster_compute: aws/tests/aws_compute.yaml

run:
timeout: 2400
script: python launch_and_verify_cluster.py aws/tests/aws_cluster.yaml --num-expected-nodes 2 --retries 10 --docker-override commit



- name: aws_cluster_launcher_minimal
group: cluster-launcher-test
Expand Down Expand Up @@ -6942,8 +6995,8 @@
cluster_compute: aws/tests/aws_compute.yaml

run:
timeout: 1200
script: python launch_and_verify_cluster.py aws/example-full.yaml
timeout: 3000
script: python launch_and_verify_cluster.py aws/example-full.yaml --num-expected-nodes 2 --retries 20

- name: gcp_cluster_launcher_minimal
group: cluster-launcher-test
Expand Down Expand Up @@ -6976,5 +7029,57 @@
cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml

run:
timeout: 2400
script: python launch_and_verify_cluster.py gcp/example-full.yaml
timeout: 3600
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20

- name: gcp_cluster_launcher_latest_image
group: cluster-launcher-test
working_dir: ../python/ray/autoscaler/

stable: true

env: gce
frequency: nightly
team: core
cluster:
cluster_env: gcp/tests/gce_config.yaml
cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml

run:
timeout: 3600
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override nightly

- name: gcp_cluster_launcher_nightly_image
group: cluster-launcher-test
working_dir: ../python/ray/autoscaler/

stable: true

env: gce
frequency: nightly
team: core
cluster:
cluster_env: gcp/tests/gce_config.yaml
cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml

run:
timeout: 3600
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest


- name: gcp_cluster_launcher_release_image
group: cluster-launcher-test
working_dir: ../python/ray/autoscaler/

stable: true

env: gce
frequency: manual
team: core
cluster:
cluster_env: gcp/tests/gce_config.yaml
cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml

run:
timeout: 3600
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override commit

0 comments on commit c7f5440

Please sign in to comment.