[release] move release testing end to end script to main ray repo (ra…
krfricke committed Jul 14, 2021
1 parent 92f1917 commit ed131f8
Showing 12 changed files with 2,823 additions and 0 deletions.
307 changes: 307 additions & 0 deletions release/.buildkite/build_pipeline.py
@@ -0,0 +1,307 @@
import copy
import logging
import os
import sys

import yaml

# Env variables:

# RAY_REPO Repo to use for finding the wheel
# RAY_BRANCH Branch to find the wheel
# RAY_TEST_REPO Repo to use for test scripts
# RAY_TEST_BRANCH Branch for test scripts
# FILTER_FILE File filter
# FILTER_TEST Test name filter
# RELEASE_TEST_SUITE Release test suite (e.g. manual, nightly)
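#
# Example invocation (a hypothetical sketch; the exact values depend on your
# setup):
#
#   RAY_BRANCH=master RELEASE_TEST_SUITE=nightly FILTER_FILE=xgboost \
#       python release/.buildkite/build_pipeline.py > pipeline.yaml
#
# The script reads these variables, assembles the selected test suite, and
# dumps the resulting Buildkite steps as YAML to stdout (see the __main__
# block below). From there the output could be piped into e.g.
# `buildkite-agent pipeline upload` (an assumption about how the output is
# consumed; this script does not upload anything itself).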


class ReleaseTest:
    def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
        self.name = name
        self.smoke_test = smoke_test
        self.retry = retry

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def __contains__(self, item):
        return self.name.__contains__(item)

    def __iter__(self):
        return iter(self.name)

    def __len__(self):
        return len(self.name)


class SmokeTest(ReleaseTest):
    def __init__(self, name: str, retry: int = 0):
        super(SmokeTest, self).__init__(
            name=name, smoke_test=True, retry=retry)
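
# The suite dictionaries below map a test config file to the tests to run from
# it. Plain strings are wrapped into ReleaseTest instances by build_pipeline();
# SmokeTest entries additionally pass --smoke-test to release/e2e.py, and a
# non-zero retry adds an automatic Buildkite retry. For example (illustrative;
# the retry value is hypothetical):
#
#   SmokeTest("stress_test_many_tasks", retry=2)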


CORE_NIGHTLY_TESTS = {
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "shuffle_10gb",
        "shuffle_50gb",
        "shuffle_50gb_large_partition",
        "shuffle_100gb",
        "non_streaming_shuffle_100gb",
        "non_streaming_shuffle_50gb_large_partition",
        "non_streaming_shuffle_50gb",
        "dask_on_ray_10gb_sort",
        "dask_on_ray_100gb_sort",
        "dask_on_ray_large_scale_test_no_spilling",
        "dask_on_ray_large_scale_test_spilling",
        "stress_test_placement_group",
        "shuffle_1tb_1000_partition",
        "non_streaming_shuffle_1tb_1000_partition",
        "shuffle_1tb_5000_partitions",
        "non_streaming_shuffle_1tb_5000_partitions",
        "decision_tree_autoscaling",
        "autoscaling_shuffle_1tb_1000_partitions",
        SmokeTest("stress_test_many_tasks"),
        SmokeTest("stress_test_dead_actors"),
    ],
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "single_node",
        "object_store",
    ],
}

NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
        "dask_xgboost_test",
        "modin_xgboost_test",
        "torch_tune_serve_test",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("actor_deaths"),
        SmokeTest("apex"),
        SmokeTest("impala"),
        SmokeTest("many_actor_tasks"),
        SmokeTest("many_drivers"),
        SmokeTest("many_ppo"),
        SmokeTest("many_tasks"),
        SmokeTest("many_tasks_serialized_ids"),
        SmokeTest("node_failures"),
        SmokeTest("pbt"),
        # SmokeTest("serve"),
        # SmokeTest("serve_failure"),
    ],
    "~/ray/release/microbenchmark/microbenchmark.yaml": [
        "microbenchmark",
    ],
    "~/ray/release/sgd_tests/sgd_tests.yaml": [
        "sgd_gpu",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "bookkeeping_overhead",
        "durable_trainable",
        SmokeTest("long_running_large_checkpoints"),
        SmokeTest("network_overhead"),
        "result_throughput_cluster",
        "result_throughput_single_node",
        "xgboost_sweep",
    ],
    "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
        "train_small",
        "train_moderate",
        "train_gpu",
        "tune_small",
        "tune_4x32",
        "tune_32x4",
        "ft_small_elastic",
        "ft_small_non_elastic",
        "distributed_api_test",
    ],
}

WEEKLY_TESTS = {
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "distributed",
    ],
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "stress_test_many_tasks",
        "stress_test_dead_actors",
    ],
    "~/ray/release/horovod_tests/horovod_tests.yaml": [
        "horovod_test",
    ],
    "~/ray/release/long_running_distributed_tests"
    "/long_running_distributed.yaml": [
        "pytorch_pbt_failure",
    ],
    # Full long running tests (1 day runtime)
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        "actor_deaths",
        "apex",
        "impala",
        "many_actor_tasks",
        "many_drivers",
        "many_ppo",
        "many_tasks",
        "many_tasks_serialized_ids",
        "node_failures",
        "pbt",
        # "serve",
        # "serve_failure",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "network_overhead",
        "long_running_large_checkpoints",
    ],
}

MANUAL_TESTS = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        "learning_tests",
        "example_scripts_on_gpu_tests",
        "stress_tests",
    ],
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("serve"),
        SmokeTest("serve_failure"),
    ]
}

SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "nightly": NIGHTLY_TESTS,
    "weekly": WEEKLY_TESTS,
    "manual": MANUAL_TESTS,
}
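# RELEASE_TEST_SUITE selects one of these suites at pipeline build time; it
# defaults to "nightly" in the __main__ block below.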

DEFAULT_STEP_TEMPLATE = {
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2"
    },
    "agents": {
        "queue": "runner_queue_branch"
    },
    "plugins": [{
        "docker#v3.8.0": {
            "image": "rayproject/ray",
            "propagate-environment": True
        }
    }],
    "commands": []
}
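# build_pipeline() deep-copies this template for every selected test and fills
# in "commands", "label" and, where requested, "retry". A rendered step might
# look roughly like this (illustrative values only):
#
#   label: "shuffle_10gb (master) - master/nightly_tests.yaml"
#   commands:
#     - pip install -q -r release/requirements.txt
#     - pip install -U boto3 botocore
#     - git clone -b master https://github.com/ray-project/ray.git ~/ray
#     - python release/e2e.py --ray-branch master --category master
#       --test-config ~/ray/release/nightly_tests/nightly_tests.yaml
#       --test-name shuffle_10gb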


def build_pipeline(steps):
    all_steps = []

    RAY_BRANCH = os.environ.get("RAY_BRANCH", "master")
    RAY_REPO = os.environ.get("RAY_REPO",
                              "https://github.com/ray-project/ray.git")

    RAY_TEST_BRANCH = os.environ.get("RAY_TEST_BRANCH", RAY_BRANCH)
    RAY_TEST_REPO = os.environ.get("RAY_TEST_REPO", RAY_REPO)

    FILTER_FILE = os.environ.get("FILTER_FILE", "")
    FILTER_TEST = os.environ.get("FILTER_TEST", "")

    logging.info(
        f"Building pipeline\n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO = {RAY_REPO}\n"
        f" RAY_BRANCH = {RAY_BRANCH}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f" RAY_TEST_REPO = {RAY_TEST_REPO}\n"
        f" RAY_TEST_BRANCH = {RAY_TEST_BRANCH}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {FILTER_FILE}\n"
        f" FILTER_TEST = {FILTER_TEST}\n\n")

    for test_file, test_names in steps.items():
        if FILTER_FILE and FILTER_FILE not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for test_name in test_names:
            if FILTER_TEST and FILTER_TEST not in test_name:
                continue

            if not isinstance(test_name, ReleaseTest):
                test_name = ReleaseTest(name=test_name)

            logging.info(f"Adding test: {test_base}/{test_name}")

            cmd = (f"python release/e2e.py "
                   f"--ray-branch {RAY_BRANCH} "
                   f"--category {RAY_BRANCH} "
                   f"--test-config {test_file} "
                   f"--test-name {test_name}")

            if test_name.smoke_test:
                logging.info("This test will run as a smoke test.")
                cmd += " --smoke-test"

            step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

            if test_name.retry:
                logging.info(f"This test will be retried up to "
                             f"{test_name.retry} times.")
                step_conf["retry"] = {
                    "automatic": [{
                        "exit_status": "*",
                        "limit": test_name.retry
                    }]
                }

            step_conf["commands"] = [
                "pip install -q -r release/requirements.txt",
                "pip install -U boto3 botocore",
                f"git clone -b {RAY_TEST_BRANCH} {RAY_TEST_REPO} ~/ray",
                cmd,
            ]

            step_conf["label"] = f"{test_name} ({RAY_BRANCH}) - " \
                                 f"{RAY_TEST_BRANCH}/{test_base}"
            all_steps.append(step_conf)

    return all_steps


def alert_pipeline(stats: bool = False):
    step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

    cmd = "python release/alert.py"
    if stats:
        cmd += " --stats"

    step_conf["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        cmd,
    ]
    step_conf["label"] = f"Send periodic alert (stats_only = {stats})"
    return [step_conf]


if __name__ == "__main__":
alert = os.environ.get("RELEASE_ALERT", "0")

if alert in ["1", "stats"]:
steps = alert_pipeline(alert == "stats")
else:
TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
PIPELINE_SPEC = SUITES[TEST_SUITE]

steps = build_pipeline(PIPELINE_SPEC)

yaml.dump({"steps": steps}, sys.stdout)
Empty file added release/__init__.py
Empty file.