Skip to content

Commit

Permalink
[CI][Green-Ray][1] Automated retry of infra-error release tests (ray-…
Browse files Browse the repository at this point in the history
…project#34057)

This PR is a part of my effort to make OSS release test runs greener, starting with reducing infra error rates. Other work, such as [this from Lonnie](https://docs.google.com/document/d/1hF7h8F19qFWFxH9WVeT8fWwVuNyUyHLTx-7LP3uxD50/edit#heading=h.i0cvl0u8jbfu), fixes systematic issues such as an unstable Anyscale staging environment. This PR addresses transient issues with Anyscale that are hard to avoid in a distributed system. On a day when Anyscale behaves well, transient issues seem to account for around [2-3%](https://b534fd88.us1a.app.preset.io/superset/dashboard/43/?force=false&native_filters_key=MoYaGptJfGwbkF60A7RSzfoRLL_ypDf_JvNFxp2YGQ8Ls4CNgbAWEBh0WcOkOLsS) of runs, i.e. ~4 random failures for a test suite of 200 tests — annoying!

Concretely it will:

- First, classify an infra test run as a transient infra issue
- Instruct buildkite to automatically retry on transient issue
- If retries run out, classify the infra test run as an infra issue

Some other limitations that will be addressed in followup PRs:
- Move infra-failure retry configuration into LaunchDarkly?
- Limit auto-retry based on test cost or test runtime

Signed-off-by: Cuong Nguyen <[email protected]>
  • Loading branch information
can-anyscale committed Apr 17, 2023
1 parent dced575 commit 20aa7b7
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 50 deletions.
8 changes: 8 additions & 0 deletions release/ray_release/buildkite/step.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@
],
"artifact_paths": [f"{DEFAULT_ARTIFACTS_DIR_HOST}/**/*"],
"priority": 0,
"retry": {
"automatic": [
{
"exit_status": os.environ.get("BUILDKITE_RETRY_CODE", 79),
"limit": os.environ.get("BUILDKITE_MAX_RETRIES", 1),
}
]
},
}


Expand Down
4 changes: 2 additions & 2 deletions release/ray_release/command_runner/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,12 @@ def run_prepare_command(
max_retries=3,
)

def get_last_logs(self) -> str:
def get_last_logs(self) -> Optional[str]:
try:
return self.get_last_logs_ex()
except Exception as e:
logger.exception(f"Error fetching logs: {e}")
return "No logs could be retrieved."
return None

def get_last_logs_ex(self):
raise NotImplementedError
Expand Down
37 changes: 21 additions & 16 deletions release/ray_release/glue.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import time
import traceback
from typing import Optional, List, Tuple

from ray_release.alerts.handle import handle_result, require_result
Expand Down Expand Up @@ -453,21 +454,21 @@ def run_release_test(
) -> Result:
old_wd = os.getcwd()
start_time = time.monotonic()

buildkite_group(":spiral_note_pad: Loading test configuration")
cluster_manager, command_runner, artifact_path = _load_test_configuration(
test,
anyscale_project,
result,
ray_wheels_url,
smoke_test,
no_terminate,
)

command_runner = None
cluster_manager = None
pipeline_exception = None
# non critical for some tests. So separate it from the general one.
fetch_result_exception = None
try:
buildkite_group(":spiral_note_pad: Loading test configuration")
cluster_manager, command_runner, artifact_path = _load_test_configuration(
test,
anyscale_project,
result,
ray_wheels_url,
smoke_test,
no_terminate,
)
buildkite_group(":nut_and_bolt: Setting up cluster environment")
(
prepare_cmd,
Expand Down Expand Up @@ -525,7 +526,6 @@ def run_release_test(
smoke_test,
start_time_unix,
)

except Exception as e:
logger.exception(e)
buildkite_open_last()
Expand All @@ -540,9 +540,9 @@ def run_release_test(
result.job_url = command_runner.job_manager.job_url
result.job_id = command_runner.job_manager.job_id

result.last_logs = command_runner.get_last_logs()
result.last_logs = command_runner.get_last_logs() if command_runner else None

if not no_terminate:
if not no_terminate and cluster_manager:
buildkite_group(":earth_africa: Terminating cluster")
cluster_manager.terminate_cluster(wait=False)

Expand Down Expand Up @@ -570,12 +570,17 @@ def run_release_test(

if pipeline_exception:
buildkite_group(":rotating_light: Handling errors")
exit_code, error_type, runtime = handle_exception(pipeline_exception)
exit_code, result_status, runtime = handle_exception(pipeline_exception)

result.return_code = exit_code.value
result.status = error_type
result.status = result_status.value
if runtime is not None:
result.runtime = runtime
try:
raise pipeline_exception
except Exception:
if not result.last_logs:
result.last_logs = traceback.format_exc()

buildkite_group(":memo: Reporting results", open=True)
for reporter in reporters or []:
Expand Down
71 changes: 44 additions & 27 deletions release/ray_release/result.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,29 @@
import enum
import os
from dataclasses import dataclass
from typing import Optional, Dict, Tuple


class ResultStatus(enum.Enum):
    """
    Overall status of a release-test run.

    The string values are what get persisted into ``Result.status`` and
    consumed by downstream reporting/dashboards, so they must stay stable.
    """

    SUCCESS = "success"
    # Default / could not be classified (e.g. non-ReleaseTestError exceptions).
    UNKNOWN = "unknown"
    # Test command itself failed (exit codes 1-9).
    RUNTIME_ERROR = "runtime_error"
    # Infra error on a run that Buildkite will automatically retry;
    # reported separately so retried runs are not counted as hard failures.
    TRANSIENT_INFRA_ERROR = "transient_infra_error"
    # Infra error with no retries remaining (exit codes 10-19).
    INFRA_ERROR = "infra_error"
    # Infra-side timeout, e.g. cluster startup (exit codes 30-39).
    INFRA_TIMEOUT = "infra_timeout"
    # Generic test error (exit codes 40+ and any unclassified code).
    ERROR = "error"
    # The test command timed out (ExitCode.COMMAND_TIMEOUT).
    TIMEOUT = "timeout"


@dataclass
class Result:
results: Optional[Dict] = None

status: str = "invalid"
status: str = ResultStatus.UNKNOWN.value
return_code: int = 0
last_logs: Optional[str] = None

Expand Down Expand Up @@ -64,33 +80,34 @@ class ExitCode(enum.Enum):
PREPARE_ERROR = 43


def handle_exception(e: Exception) -> Tuple[ExitCode, ResultStatus, Optional[int]]:
    """Map a pipeline exception to an exit code, result status, and runtime.

    Args:
        e: The exception raised while running the release test pipeline.

    Returns:
        Tuple of (exit_code, result_status, runtime). ``runtime`` is ``None``
        when the failure happened before the test command ran, and ``0`` when
        the command ran but its runtime is unknown/not meaningful.
    """
    # Local import to avoid a circular dependency between result and exception.
    from ray_release.exception import ReleaseTestError

    if not isinstance(e, ReleaseTestError):
        return ExitCode.UNKNOWN, ResultStatus.UNKNOWN, 0

    exit_code = e.exit_code
    if 1 <= exit_code.value < 10:
        result_status = ResultStatus.RUNTIME_ERROR
        runtime = None
    elif 10 <= exit_code.value < 20:
        result_status = ResultStatus.INFRA_ERROR
        runtime = None
    elif 30 <= exit_code.value < 40:
        result_status = ResultStatus.INFRA_TIMEOUT
        runtime = None
    elif exit_code == ExitCode.COMMAND_TIMEOUT:
        result_status = ResultStatus.TIMEOUT
        runtime = 0
    else:
        # Fallback for exit codes 40+ AND any otherwise-unmatched code
        # (e.g. 20-29 values other than COMMAND_TIMEOUT). Without this
        # explicit else, result_status/runtime would be unbound for such
        # codes and the return below would raise UnboundLocalError.
        result_status = ResultStatus.ERROR
        runtime = 0

    # If this run is going to be retried by Buildkite, mark its status as
    # transient so dashboards don't count it as a hard infra failure.
    # NOTE: this logic must stay in sync with run_release_test.sh.
    if result_status in (ResultStatus.INFRA_ERROR, ResultStatus.INFRA_TIMEOUT):
        retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", 0))
        max_retry = int(os.environ.get("BUILDKITE_MAX_RETRIES", 1))
        if retry_count < max_retry:
            result_status = ResultStatus.TRANSIENT_INFRA_ERROR

    return exit_code, result_status, runtime
3 changes: 1 addition & 2 deletions release/ray_release/tests/test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ def testFetchResultFailsReqNonEmptyResult(self):
self._run(result)
self.assertTrue(any("Could not fetch results" in o for o in cm.output))
self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value)
self.assertEqual(result.status, "infra_error")
self.assertEqual(result.status, "transient_infra_error")

# Ensure cluster was terminated, no matter what
self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
Expand All @@ -638,7 +638,6 @@ def testLastLogsFails(self):
self.assertTrue(any("Error fetching logs" in o for o in cm.output))
self.assertEqual(result.return_code, ExitCode.SUCCESS.value)
self.assertEqual(result.status, "finished")
self.assertIn("No logs", result.last_logs)

# Ensure cluster was terminated
self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
Expand Down
2 changes: 1 addition & 1 deletion release/ray_release/tests/test_run_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def test_repeat(setup):
ExitCode.CLUSTER_WAIT_TIMEOUT,
ExitCode.RAY_WHEELS_TIMEOUT,
)
== ExitCode.RAY_WHEELS_TIMEOUT.value
== 79 # BUILDKITE_RETRY_CODE
)
assert _read_state(state_file) == 3

Expand Down
10 changes: 8 additions & 2 deletions release/run_release_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@ RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-ray_release/scripts/run_release_test.py}
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
BUILDKITE_MAX_RETRIES=1
BUILDKITE_RETRY_CODE=79

# This is not a great idea if your OS is different to the one
# used in the product clusters. However, we need this in CI as reloading
# Ray within the python process does not work for protobuf changes.
INSTALL_MATCHING_RAY=${BUILDKITE-false}

export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR BUILDKITE_MAX_RETRIES BUILDKITE_RETRY_CODE

if [ -z "${NO_INSTALL}" ]; then
pip install --use-deprecated=legacy-resolver -q -r requirements.txt
Expand Down Expand Up @@ -173,4 +175,8 @@ if [ -z "${NO_CLONE}" ]; then
rm -rf "${TMPDIR}" || true
fi

exit $EXIT_CODE
if [ "$REASON" == "infra error" ] || [ "$REASON" == "infra timeout" ]; then
exit $BUILDKITE_RETRY_CODE
else
exit $EXIT_CODE
fi

0 comments on commit 20aa7b7

Please sign in to comment.