forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[release] move release testing end to end script to main ray repo (ra…
- Loading branch information
Showing
12 changed files
with
2,823 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,307 @@ | ||
import copy | ||
import logging | ||
import os | ||
import sys | ||
|
||
import yaml | ||
|
||
# Env variables:

# RAY_REPO            Repo to use for finding the wheel
# RAY_BRANCH          Branch to find the wheel
# RAY_TEST_REPO       Repo to use for test scripts (defaults to RAY_REPO)
# RAY_TEST_BRANCH     Branch for test scripts (defaults to RAY_BRANCH)
# FILTER_FILE         File filter (substring of the test config path)
# FILTER_TEST         Test name filter (substring of the test name)
# RELEASE_TEST_SUITE  Release test suite (e.g. manual, nightly)
# RELEASE_ALERT       Set to "1" or "stats" to emit the alert step instead
|
||
|
||
class ReleaseTest:
    """A named release test together with its scheduling options.

    An instance stands in for its ``name`` string: ``str()``, ``repr()``,
    ``in``, iteration and ``len()`` are all answered by the name. This lets
    code that was written for plain test-name strings (for example substring
    filters) accept ``ReleaseTest`` objects as well.

    Args:
        name: Name of the test.
        smoke_test: Flag stored on the instance; marks the test as a smoke
            test.
        retry: Retry count stored on the instance.
    """

    def __init__(self, name: str, smoke_test: bool = False, retry: int = 0):
        self.name, self.smoke_test, self.retry = name, smoke_test, retry

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def __contains__(self, item):
        # Membership is tested against the name (substring check for str).
        return item in self.name

    def __iter__(self):
        # Iterates over the characters of the name.
        return iter(self.name)

    def __len__(self):
        return len(self.name)
|
||
|
||
class SmokeTest(ReleaseTest):
    """A ``ReleaseTest`` whose ``smoke_test`` flag is always ``True``.

    Args:
        name: Name of the test.
        retry: Retry count forwarded to ``ReleaseTest``.
    """

    def __init__(self, name: str, retry: int = 0):
        super().__init__(name, smoke_test=True, retry=retry)
|
||
|
||
# Suite spec registered in SUITES under "core-nightly".
# Maps a test config YAML path to the list of tests to run from that file.
# Each entry is either a plain test name or a SmokeTest(...) wrapper.
CORE_NIGHTLY_TESTS = {
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "shuffle_10gb",
        "shuffle_50gb",
        "shuffle_50gb_large_partition",
        "shuffle_100gb",
        "non_streaming_shuffle_100gb",
        "non_streaming_shuffle_50gb_large_partition",
        "non_streaming_shuffle_50gb",
        "dask_on_ray_10gb_sort",
        "dask_on_ray_100gb_sort",
        "dask_on_ray_large_scale_test_no_spilling",
        "dask_on_ray_large_scale_test_spilling",
        "stress_test_placement_group",
        "shuffle_1tb_1000_partition",
        "non_streaming_shuffle_1tb_1000_partition",
        "shuffle_1tb_5000_partitions",
        "non_streaming_shuffle_1tb_5000_partitions",
        "decision_tree_autoscaling",
        "autoscaling_shuffle_1tb_1000_partitions",
        # These two run in full in WEEKLY_TESTS; here only as smoke tests.
        SmokeTest("stress_test_many_tasks"),
        SmokeTest("stress_test_dead_actors"),
    ],
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "single_node",
        "object_store",
    ],
}
|
||
# Suite spec registered in SUITES under "nightly" (the default suite when
# RELEASE_TEST_SUITE is unset).
# Maps a test config YAML path to the list of tests to run from that file.
# Each entry is either a plain test name or a SmokeTest(...) wrapper.
NIGHTLY_TESTS = {
    # "~/ray/release/horovod_tests/horovod_tests.yaml": [
    #     SmokeTest("horovod_test"),
    # ],  # Should we enable this?
    "~/ray/release/golden_notebook_tests/golden_notebook_tests.yaml": [
        "dask_xgboost_test",
        "modin_xgboost_test",
        "torch_tune_serve_test",
    ],
    # Smoke-test variants; the same names run without the smoke flag in
    # WEEKLY_TESTS.
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("actor_deaths"),
        SmokeTest("apex"),
        SmokeTest("impala"),
        SmokeTest("many_actor_tasks"),
        SmokeTest("many_drivers"),
        SmokeTest("many_ppo"),
        SmokeTest("many_tasks"),
        SmokeTest("many_tasks_serialized_ids"),
        SmokeTest("node_failures"),
        SmokeTest("pbt"),
        # SmokeTest("serve"),
        # SmokeTest("serve_failure"),
    ],
    "~/ray/release/microbenchmark/microbenchmark.yaml": [
        "microbenchmark",
    ],
    "~/ray/release/sgd_tests/sgd_tests.yaml": [
        "sgd_gpu",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "bookkeeping_overhead",
        "durable_trainable",
        SmokeTest("long_running_large_checkpoints"),
        SmokeTest("network_overhead"),
        "result_throughput_cluster",
        "result_throughput_single_node",
        "xgboost_sweep",
    ],
    "~/ray/release/xgboost_tests/xgboost_tests.yaml": [
        "train_small",
        "train_moderate",
        "train_gpu",
        "tune_small",
        "tune_4x32",
        "tune_32x4",
        "ft_small_elastic",
        "ft_small_non_elastic",
        "distributed_api_test",
    ],
}
|
||
# Suite spec registered in SUITES under "weekly".
# Maps a test config YAML path to the list of tests to run from that file.
# All entries here are plain test names, i.e. none carries the smoke-test flag.
WEEKLY_TESTS = {
    "~/ray/benchmarks/benchmark_tests.yaml": [
        "distributed",
    ],
    "~/ray/release/nightly_tests/nightly_tests.yaml": [
        "stress_test_many_tasks",
        "stress_test_dead_actors",
    ],
    "~/ray/release/horovod_tests/horovod_tests.yaml": [
        "horovod_test",
    ],
    # The key below is one path, split across two adjacent string literals.
    "~/ray/release/long_running_distributed_tests"
    "/long_running_distributed.yaml": [
        "pytorch_pbt_failure",
    ],
    # Full long running tests (1 day runtime)
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        "actor_deaths",
        "apex",
        "impala",
        "many_actor_tasks",
        "many_drivers",
        "many_ppo",
        "many_tasks",
        "many_tasks_serialized_ids",
        "node_failures",
        "pbt",
        # "serve",
        # "serve_failure",
    ],
    "~/ray/release/tune_tests/scalability_tests/tune_tests.yaml": [
        "network_overhead",
        "long_running_large_checkpoints",
    ],
}
|
||
# Suite spec registered in SUITES under "manual".
# Maps a test config YAML path to the list of tests to run from that file.
# Each entry is either a plain test name or a SmokeTest(...) wrapper.
MANUAL_TESTS = {
    "~/ray/release/rllib_tests/rllib_tests.yaml": [
        "learning_tests",
        "example_scripts_on_gpu_tests",
        "stress_tests",
    ],
    # The serve tests are commented out of the nightly and weekly suites and
    # only listed here.
    "~/ray/release/long_running_tests/long_running_tests.yaml": [
        SmokeTest("serve"),
        SmokeTest("serve_failure"),
    ]
}
|
||
# Lookup table from suite name to suite spec. The script entry point indexes
# it with the value of the RELEASE_TEST_SUITE environment variable.
SUITES = {
    "core-nightly": CORE_NIGHTLY_TESTS,
    "nightly": NIGHTLY_TESTS,
    "weekly": WEEKLY_TESTS,
    "manual": MANUAL_TESTS,
}
|
||
# Base config shared by every generated step. build_pipeline() and
# alert_pipeline() deep-copy it and then fill in "commands" and "label"
# (build_pipeline() may also add "retry"), so this dict itself is never
# mutated.
# NOTE(review): the "agents"/"plugins" keys look like Buildkite step syntax;
# confirm against the CI system that consumes the YAML output.
DEFAULT_STEP_TEMPLATE = {
    "env": {
        "ANYSCALE_CLOUD_ID": "cld_4F7k8814aZzGG8TNUGPKnc",
        "ANYSCALE_PROJECT": "prj_2xR6uT6t7jJuu1aCwWMsle",
        "RELEASE_AWS_BUCKET": "ray-release-automation-results",
        "RELEASE_AWS_LOCATION": "dev",
        "RELEASE_AWS_DB_NAME": "ray_ci",
        "RELEASE_AWS_DB_TABLE": "release_test_result",
        "AWS_REGION": "us-west-2"
    },
    "agents": {
        "queue": "runner_queue_branch"
    },
    "plugins": [{
        "docker#v3.8.0": {
            "image": "rayproject/ray",
            "propagate-environment": True
        }
    }],
    # Placeholder; replaced per step by the pipeline builders.
    "commands": []
}
|
||
|
||
def build_pipeline(steps):
    """Build the list of pipeline step configs for one test suite spec.

    Reads these environment variables:

    * ``RAY_BRANCH`` (default ``"master"``) and ``RAY_REPO`` (default the
      upstream Ray GitHub URL): the Ray build under test.
    * ``RAY_TEST_BRANCH`` / ``RAY_TEST_REPO``: where the test scripts are
      cloned from; they default to ``RAY_BRANCH`` / ``RAY_REPO``.
    * ``FILTER_FILE`` / ``FILTER_TEST``: optional substring filters applied
      to the test config path and the test name respectively. An empty
      value disables the filter.

    Args:
        steps: Mapping of test config file path to a list of tests. Each
            test is either a plain test-name string or a ``ReleaseTest``.

    Returns:
        A list with one step dict per selected test. Each dict is a deep
        copy of ``DEFAULT_STEP_TEMPLATE`` with ``commands`` and ``label``
        filled in, plus a ``retry`` section when the test requests retries.
    """
    ray_branch = os.environ.get("RAY_BRANCH", "master")
    ray_repo = os.environ.get("RAY_REPO",
                              "https://github.com/ray-project/ray.git")

    # Test scripts default to the same repo/branch as the build under test.
    ray_test_branch = os.environ.get("RAY_TEST_BRANCH", ray_branch)
    ray_test_repo = os.environ.get("RAY_TEST_REPO", ray_repo)

    filter_file = os.environ.get("FILTER_FILE", "")
    filter_test = os.environ.get("FILTER_TEST", "")

    # Each heading ends with a newline so the values listed under it start on
    # their own line.
    logging.info(
        f"Building pipeline \n"
        f"Ray repo/branch to test:\n"
        f" RAY_REPO   = {ray_repo}\n"
        f" RAY_BRANCH = {ray_branch}\n\n"
        f"Ray repo/branch containing the test configurations and scripts:\n"
        f" RAY_TEST_REPO   = {ray_test_repo}\n"
        f" RAY_TEST_BRANCH = {ray_test_branch}\n\n"
        f"Filtering for these tests:\n"
        f" FILTER_FILE = {filter_file}\n"
        f" FILTER_TEST = {filter_test}\n\n")

    all_steps = []
    for test_file, test_names in steps.items():
        if filter_file and filter_file not in test_file:
            continue

        test_base = os.path.basename(test_file)
        for entry in test_names:
            # Plain strings are shorthand for a ReleaseTest with defaults.
            test = (entry if isinstance(entry, ReleaseTest) else
                    ReleaseTest(name=entry))

            if filter_test and filter_test not in test.name:
                continue

            logging.info(f"Adding test: {test_base}/{test}")

            # The Ray branch is passed both as the branch to test and as the
            # result category.
            cmd = (f"python release/e2e.py "
                   f"--ray-branch {ray_branch} "
                   f"--category {ray_branch} "
                   f"--test-config {test_file} "
                   f"--test-name {test}")

            if test.smoke_test:
                logging.info("This test will run as a smoke test.")
                cmd += " --smoke-test"

            # Deep copy so per-step changes never leak into the shared
            # template or into other steps.
            step_conf = copy.deepcopy(DEFAULT_STEP_TEMPLATE)

            if test.retry:
                logging.info(f"This test will be retried up to "
                             f"{test.retry} times.")
                step_conf["retry"] = {
                    "automatic": [{
                        "exit_status": "*",
                        "limit": test.retry
                    }]
                }

            # The test repo is cloned to ~/ray, which is the prefix used by
            # the test config paths in the suite specs.
            step_conf["commands"] = [
                "pip install -q -r release/requirements.txt",
                "pip install -U boto3 botocore",
                f"git clone -b {ray_test_branch} {ray_test_repo} ~/ray",
                cmd,
            ]

            step_conf["label"] = (f"{test} ({ray_branch}) - "
                                  f"{ray_test_branch}/{test_base}")
            all_steps.append(step_conf)

    return all_steps
|
||
|
||
def alert_pipeline(stats: bool = False):
    """Build the single-step pipeline that runs ``release/alert.py``.

    Args:
        stats: When true, ``--stats`` is appended to the alert command.

    Returns:
        A one-element list holding the step config: a deep copy of
        ``DEFAULT_STEP_TEMPLATE`` with ``commands`` and ``label`` filled in.
    """
    alert_cmd = "python release/alert.py" + (" --stats" if stats else "")

    # Work on a copy so the shared template stays untouched.
    step = copy.deepcopy(DEFAULT_STEP_TEMPLATE)
    step["commands"] = [
        "pip install -q -r release/requirements.txt",
        "pip install -U boto3 botocore",
        alert_cmd,
    ]
    step["label"] = f"Send periodic alert (stats_only = {stats})"
    return [step]
|
||
|
||
if __name__ == "__main__":
    # RELEASE_ALERT set to "1" or "stats" switches to the alert pipeline;
    # any other value builds the test pipeline for the selected suite.
    alert = os.environ.get("RELEASE_ALERT", "0")

    if alert not in ("1", "stats"):
        TEST_SUITE = os.environ.get("RELEASE_TEST_SUITE", "nightly")
        # An unknown suite name raises KeyError here.
        PIPELINE_SPEC = SUITES[TEST_SUITE]
        steps = build_pipeline(PIPELINE_SPEC)
    else:
        steps = alert_pipeline(alert == "stats")

    # The generated pipeline is written to stdout as YAML.
    yaml.dump({"steps": steps}, sys.stdout)
Empty file.
Oops, something went wrong.