Skip to content

Commit

Permalink
[CI][Green-Ray][1] Automated retry of infra-error release tests (ray-…
Browse files Browse the repository at this point in the history
…project#34057)

This PR is a part of my effort to make OSS release test runs greener, starting with reducing infra error rates. Other work, such as [this from Lonnie](https://docs.google.com/document/d/1hF7h8F19qFWFxH9WVeT8fWwVuNyUyHLTx-7LP3uxD50/edit#heading=h.i0cvl0u8jbfu), fixes systematic issues such as an unstable Anyscale staging environment. This PR addresses transient issues with Anyscale that are hard to avoid in a distributed system. On a day when Anyscale behaves well, transient issues seem to account for around [2-3%](https://b534fd88.us1a.app.preset.io/superset/dashboard/43/?force=false&native_filters_key=MoYaGptJfGwbkF60A7RSzfoRLL_ypDf_JvNFxp2YGQ8Ls4CNgbAWEBh0WcOkOLsS) of runs, i.e. ~4 random failures for a test suite of 200 tests — annoying!

Concretely it will:

- First, classify an infra test run as a transient infra issue
- Instruct buildkite to automatically retry on transient issue
- If retries run out, classify the infra test run as an infra issue

Some other limitations that will be addressed in followup PRs:
- Move infra-failure retry configuration into LaunchDarkly?
- Limit auto-retry based on test cost or test runtime

Signed-off-by: Cuong Nguyen <[email protected]>
  • Loading branch information
can-anyscale committed Apr 17, 2023
1 parent dced575 commit 20aa7b7
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 50 deletions.
8 changes: 8 additions & 0 deletions release/ray_release/buildkite/step.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@
],
"artifact_paths": [f"{DEFAULT_ARTIFACTS_DIR_HOST}/**/*"],
"priority": 0,
"retry": {
"automatic": [
{
"exit_status": os.environ.get("BUILDKITE_RETRY_CODE", 79),
"limit": os.environ.get("BUILDKITE_MAX_RETRIES", 1),
}
]
},
}


Expand Down
4 changes: 2 additions & 2 deletions release/ray_release/command_runner/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,12 @@ def run_prepare_command(
max_retries=3,
)

def get_last_logs(self) -> str:
def get_last_logs(self) -> Optional[str]:
try:
return self.get_last_logs_ex()
except Exception as e:
logger.exception(f"Error fetching logs: {e}")
return "No logs could be retrieved."
return None

def get_last_logs_ex(self):
raise NotImplementedError
Expand Down
37 changes: 21 additions & 16 deletions release/ray_release/glue.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import time
import traceback
from typing import Optional, List, Tuple

from ray_release.alerts.handle import handle_result, require_result
Expand Down Expand Up @@ -453,21 +454,21 @@ def run_release_test(
) -> Result:
old_wd = os.getcwd()
start_time = time.monotonic()

buildkite_group(":spiral_note_pad: Loading test configuration")
cluster_manager, command_runner, artifact_path = _load_test_configuration(
test,
anyscale_project,
result,
ray_wheels_url,
smoke_test,
no_terminate,
)

command_runner = None
cluster_manager = None
pipeline_exception = None
# non critical for some tests. So separate it from the general one.
fetch_result_exception = None
try:
buildkite_group(":spiral_note_pad: Loading test configuration")
cluster_manager, command_runner, artifact_path = _load_test_configuration(
test,
anyscale_project,
result,
ray_wheels_url,
smoke_test,
no_terminate,
)
buildkite_group(":nut_and_bolt: Setting up cluster environment")
(
prepare_cmd,
Expand Down Expand Up @@ -525,7 +526,6 @@ def run_release_test(
smoke_test,
start_time_unix,
)

except Exception as e:
logger.exception(e)
buildkite_open_last()
Expand All @@ -540,9 +540,9 @@ def run_release_test(
result.job_url = command_runner.job_manager.job_url
result.job_id = command_runner.job_manager.job_id

result.last_logs = command_runner.get_last_logs()
result.last_logs = command_runner.get_last_logs() if command_runner else None

if not no_terminate:
if not no_terminate and cluster_manager:
buildkite_group(":earth_africa: Terminating cluster")
cluster_manager.terminate_cluster(wait=False)

Expand Down Expand Up @@ -570,12 +570,17 @@ def run_release_test(

if pipeline_exception:
buildkite_group(":rotating_light: Handling errors")
exit_code, error_type, runtime = handle_exception(pipeline_exception)
exit_code, result_status, runtime = handle_exception(pipeline_exception)

result.return_code = exit_code.value
result.status = error_type
result.status = result_status.value
if runtime is not None:
result.runtime = runtime
try:
raise pipeline_exception
except Exception:
if not result.last_logs:
result.last_logs = traceback.format_exc()

buildkite_group(":memo: Reporting results", open=True)
for reporter in reporters or []:
Expand Down
71 changes: 44 additions & 27 deletions release/ray_release/result.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,29 @@
import enum
import os
from dataclasses import dataclass
from typing import Optional, Dict, Tuple


class ResultStatus(enum.Enum):
    """
    Overall status of a release-test run.

    The string values are what get persisted into ``Result.status`` and
    consumed by downstream reporting/dashboards, so they must stay stable.
    """

    SUCCESS = "success"
    # Default / could not be classified (e.g. non-ReleaseTestError exceptions).
    UNKNOWN = "unknown"
    # Test command itself failed (exit codes 1-9).
    RUNTIME_ERROR = "runtime_error"
    # Infra error on a run that Buildkite will automatically retry;
    # reported separately so retried runs are not counted as hard failures.
    TRANSIENT_INFRA_ERROR = "transient_infra_error"
    # Infra error with no retries remaining (exit codes 10-19).
    INFRA_ERROR = "infra_error"
    # Infra-side timeout, e.g. cluster startup (exit codes 30-39).
    INFRA_TIMEOUT = "infra_timeout"
    # Generic test error (exit codes 40+ and any unclassified code).
    ERROR = "error"
    # The test command timed out (ExitCode.COMMAND_TIMEOUT).
    TIMEOUT = "timeout"


@dataclass
class Result:
results: Optional[Dict] = None

status: str = "invalid"
status: str = ResultStatus.UNKNOWN.value
return_code: int = 0
last_logs: Optional[str] = None

Expand Down Expand Up @@ -64,33 +80,34 @@ class ExitCode(enum.Enum):
PREPARE_ERROR = 43


def handle_exception(e: Exception) -> Tuple[ExitCode, ResultStatus, Optional[int]]:
    """Map a pipeline exception to an exit code, result status, and runtime.

    Args:
        e: The exception raised while running the release test pipeline.

    Returns:
        Tuple of (exit_code, result_status, runtime). ``runtime`` is ``None``
        when the failure happened before the test command ran, and ``0`` when
        the command ran but its runtime is unknown/not meaningful.
    """
    # Local import to avoid a circular dependency between result and exception.
    from ray_release.exception import ReleaseTestError

    if not isinstance(e, ReleaseTestError):
        return ExitCode.UNKNOWN, ResultStatus.UNKNOWN, 0

    exit_code = e.exit_code
    if 1 <= exit_code.value < 10:
        result_status = ResultStatus.RUNTIME_ERROR
        runtime = None
    elif 10 <= exit_code.value < 20:
        result_status = ResultStatus.INFRA_ERROR
        runtime = None
    elif 30 <= exit_code.value < 40:
        result_status = ResultStatus.INFRA_TIMEOUT
        runtime = None
    elif exit_code == ExitCode.COMMAND_TIMEOUT:
        result_status = ResultStatus.TIMEOUT
        runtime = 0
    else:
        # Fallback for exit codes 40+ AND any otherwise-unmatched code
        # (e.g. 20-29 values other than COMMAND_TIMEOUT). Without this
        # explicit else, result_status/runtime would be unbound for such
        # codes and the return below would raise UnboundLocalError.
        result_status = ResultStatus.ERROR
        runtime = 0

    # If this run is going to be retried by Buildkite, mark its status as
    # transient so dashboards don't count it as a hard infra failure.
    # NOTE: this logic must stay in sync with run_release_test.sh.
    if result_status in (ResultStatus.INFRA_ERROR, ResultStatus.INFRA_TIMEOUT):
        retry_count = int(os.environ.get("BUILDKITE_RETRY_COUNT", 0))
        max_retry = int(os.environ.get("BUILDKITE_MAX_RETRIES", 1))
        if retry_count < max_retry:
            result_status = ResultStatus.TRANSIENT_INFRA_ERROR

    return exit_code, result_status, runtime
3 changes: 1 addition & 2 deletions release/ray_release/tests/test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ def testFetchResultFailsReqNonEmptyResult(self):
self._run(result)
self.assertTrue(any("Could not fetch results" in o for o in cm.output))
self.assertEqual(result.return_code, ExitCode.FETCH_RESULT_ERROR.value)
self.assertEqual(result.status, "infra_error")
self.assertEqual(result.status, "transient_infra_error")

# Ensure cluster was terminated, no matter what
self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
Expand All @@ -638,7 +638,6 @@ def testLastLogsFails(self):
self.assertTrue(any("Error fetching logs" in o for o in cm.output))
self.assertEqual(result.return_code, ExitCode.SUCCESS.value)
self.assertEqual(result.status, "finished")
self.assertIn("No logs", result.last_logs)

# Ensure cluster was terminated
self.assertGreaterEqual(self.sdk.call_counter["terminate_cluster"], 1)
Expand Down
2 changes: 1 addition & 1 deletion release/ray_release/tests/test_run_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def test_repeat(setup):
ExitCode.CLUSTER_WAIT_TIMEOUT,
ExitCode.RAY_WHEELS_TIMEOUT,
)
== ExitCode.RAY_WHEELS_TIMEOUT.value
== 79 # BUILDKITE_RETRY_CODE
)
assert _read_state(state_file) == 3

Expand Down
10 changes: 8 additions & 2 deletions release/run_release_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@ RAY_TEST_SCRIPT=${RAY_TEST_SCRIPT-ray_release/scripts/run_release_test.py}
RAY_TEST_REPO=${RAY_TEST_REPO-https://github.com/ray-project/ray.git}
RAY_TEST_BRANCH=${RAY_TEST_BRANCH-master}
RELEASE_RESULTS_DIR=${RELEASE_RESULTS_DIR-/tmp/artifacts}
BUILDKITE_MAX_RETRIES=1
BUILDKITE_RETRY_CODE=79

# This is not a great idea if your OS is different to the one
# used in the product clusters. However, we need this in CI as reloading
# Ray within the python process does not work for protobuf changes.
INSTALL_MATCHING_RAY=${BUILDKITE-false}

export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR
export RAY_TEST_REPO RAY_TEST_BRANCH RELEASE_RESULTS_DIR BUILDKITE_MAX_RETRIES BUILDKITE_RETRY_CODE

if [ -z "${NO_INSTALL}" ]; then
pip install --use-deprecated=legacy-resolver -q -r requirements.txt
Expand Down Expand Up @@ -173,4 +175,8 @@ if [ -z "${NO_CLONE}" ]; then
rm -rf "${TMPDIR}" || true
fi

exit $EXIT_CODE
if [ "$REASON" == "infra error" ] || [ "$REASON" == "infra timeout" ]; then
exit $BUILDKITE_RETRY_CODE
else
exit $EXIT_CODE
fi

0 comments on commit 20aa7b7

Please sign in to comment.