ray-project · rkooo567 · Sep 5, 2023 · Sep 1, 2023 · Sep 1, 2023 · Sep 1, 2023
diff --git a/BUILD.bazel b/BUILD.bazel
@@ -2531,6 +2531,7 @@ pyx_library(
  deps = [
  "//:core_worker_lib",
  "//:global_state_accessor_lib",
+ "//:gcs_server_lib",
  "//:raylet_lib",
  "//:redis_client",
  "//:src/ray/ray_exported_symbols.lds",

diff --git a/python/ray/_private/node.py b/python/ray/_private/node.py
@@ -24,8 +24,9 @@
 import ray._private.services
 import ray._private.utils
 from ray._private import storage
-from ray._raylet import GcsClient
+from ray._raylet import GcsClient, get_key_from_storage
 from ray._private.resource_spec import ResourceSpec
+from ray._private.services import serialize_config
 from ray._private.utils import open_log, try_to_create_directory, try_to_symlink
 
 # Logger for this module. It should be configured at the entry point
@@ -177,9 +178,34 @@ def __init__(
 
  # Register the temp dir.
  if head:
- # date including microsecond
- date_str = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S_%f")
- self._session_name = f"session_{date_str}_{os.getpid()}"
+ maybe_key = None
+ if self._ray_params.external_addresses is not None:
+ self._redis_address = self._ray_params.external_addresses[0]
+ parts = self._redis_address.split("://", 1)
+ enable_redis_ssl = False
+ if len(parts) == 1:
+ redis_ip_address, redis_port = parts[0].rsplit(":", 1)
+ else:
+ if len(parts) != 2 or parts[0] not in ("redis", "rediss"):
+ raise ValueError(f"Invalid redis address {self._redis_address}")
+ redis_ip_address, redis_port = parts[1].rsplit(":", 1)
+ if parts[0] == "rediss":
+ enable_redis_ssl = True
+ maybe_key = get_key_from_storage(
+ redis_ip_address,
+ int(redis_port),
+ self._ray_params.redis_password,
+ enable_redis_ssl,
+ serialize_config(self._config),
+ b"session_name",
+ )
+
+ if maybe_key is None:
+ # date including microsecond
+ date_str = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S_%f")
+ self._session_name = f"session_{date_str}_{os.getpid()}"
+ else:
+ self._session_name = ray._private.utils.decode(maybe_key)
  else:
  if ray_params.session_name is None:
  assert not self._default_worker
@@ -1215,13 +1241,9 @@ def start_head_processes(self):
  logger.debug(
  f"Process STDOUT and STDERR is being " f"redirected to {self._logs_dir}."
  )
- assert self._redis_address is None
  assert self._gcs_address is None
  assert self._gcs_client is None
 
- if self._ray_params.external_addresses is not None:
- self._redis_address = self._ray_params.external_addresses[0]
-
  self.start_gcs_server()
  assert self.get_gcs_client() is not None
  self._write_cluster_info_to_kv()

diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx
@@ -156,7 +156,7 @@ from ray.includes.libcoreworker cimport (
 
 from ray.includes.ray_config cimport RayConfig
 from ray.includes.global_state_accessor cimport CGlobalStateAccessor
-from ray.includes.global_state_accessor cimport RedisDelKeySync
+from ray.includes.global_state_accessor cimport RedisDelKeySync, RedisGetKeySync
 from ray.includes.optional cimport (
  optional, nullopt
 )
@@ -4574,3 +4574,13 @@ cdef void async_callback(shared_ptr[CRayObject] obj,
 
 def del_key_from_storage(host, port, password, use_ssl, key):
  return RedisDelKeySync(host, port, password, use_ssl, key)
+
+
+def get_key_from_storage(host, port, password, use_ssl, config, key):
+    """Look up `key` through `RedisGetKeySync`.
+
+    Every argument is forwarded unchanged to `RedisGetKeySync`.
+
+    Returns:
+        The value filled in by `RedisGetKeySync` when it reports success,
+        otherwise None.
+    """
+    cdef c_string stored_value
+    if RedisGetKeySync(host, port, password, use_ssl, config, key, &stored_value):
+        return stored_value
+    return None
diff --git a/python/ray/includes/global_state_accessor.pxd b/python/ray/includes/global_state_accessor.pxd
@@ -46,6 +46,65 @@ cdef extern from "ray/gcs/gcs_client/global_state_accessor.h" nogil:
  const c_string &node_ip_address,
  c_string *node_to_connect)
 
+cdef extern from * namespace "ray::gcs" nogil:
+ """
+ #include <thread>
+ #include "ray/gcs/gcs_server/store_client_kv.h"
+ namespace ray {
+ namespace gcs {
+
+    // Synchronously looks up `key` under the "session" namespace in the Redis
+    // instance at host:port.
+    //
+    // `config` is a base64-encoded config string; it is decoded and passed to
+    // RayConfig::instance().initialize() before connecting.
+    //
+    // Returns true and stores the value in *data when the lookup yields a
+    // value. Returns false when the connection fails, when the lookup yields
+    // no value, or when the lookup callback has not run by the time
+    // io_service.run_for() returns.
+    bool RedisGetKeySync(const std::string& host,
+                         int32_t port,
+                         const std::string& password,
+                         bool use_ssl,
+                         const std::string& config,
+                         const std::string& key,
+                         std::string* data) {
+      RedisClientOptions options(host, port, password, false, use_ssl);
+
+      // Decoding of the base64 config is asserted with RAY_CHECK.
+      std::string config_list;
+      RAY_CHECK(absl::Base64Unescape(config, &config_list));
+      RayConfig::instance().initialize(config_list);
+
+      instrumented_io_context io_service;
+
+      auto redis_client = std::make_shared<RedisClient>(options);
+      auto status = redis_client->Connect(io_service);
+
+      if(!status.ok()) {
+        RAY_LOG(ERROR) << "Failed to connect to redis: " << status.ToString();
+        return false;
+      }
+      auto cli = std::make_unique<StoreClientInternalKV>(
+          std::make_unique<RedisStoreClient>(std::move(redis_client)));
+
+      // ret_val stays false unless the callback below runs and receives a value.
+      bool ret_val = false;
+      cli->Get("session", key, [&](std::optional<std::string> result) {
+        if (result.has_value()) {
+          *data = result.value();
+          ret_val = true;
+        } else {
+          RAY_LOG(ERROR) << "Failed to get a key, " << key << " from Redis storage.";
+          ret_val = false;
+        }
+      });
+      // Drive the event loop for at most 1000 ms so the Get callback can run.
+      io_service.run_for(std::chrono::milliseconds(1000));
+
+      return ret_val;
+    }
+
+ }
+ }
+ """
+ c_bool RedisGetKeySync(const c_string& host,
+ c_int32_t port,
+ const c_string& password,
+ c_bool use_ssl,
+ const c_string& config,
+ const c_string& key,
+ c_string* data)
+
+
 cdef extern from * namespace "ray::gcs" nogil:
  """
  #include <thread>

diff --git a/python/ray/tests/test_gcs_fault_tolerance.py b/python/ray/tests/test_gcs_fault_tolerance.py
@@ -868,6 +868,36 @@ def check_raylet_healthy():
  sleep(1)
 
 
+def test_session_name(ray_start_cluster):
+ # Restart the head node (and with it the GCS server) and compare the
+ # session directory path reported before and after the restart.
+ # With external Redis enabled the path is expected to be unchanged;
+ # without it, the restarted head node is expected to report a new one.
+ cluster = ray_start_cluster
+ cluster.add_node()
+ cluster.wait_for_nodes()
+
+ head_node = cluster.head_node
+ session_dir = head_node.get_session_dir_path()
+
+ gcs_server_process = head_node.all_processes["gcs_server"][0].process
+ gcs_server_pid = gcs_server_process.pid
+ cluster.remove_node(head_node, allow_graceful=False)
+ # Wait to prevent the gcs server process becoming zombie.
+ gcs_server_process.wait()
+ wait_for_pid_to_exit(gcs_server_pid, 1000)
+
+ # Add head node back
+ cluster.add_node()
+ head_node = cluster.head_node
+ new_session_dir = head_node.get_session_dir_path()
+
+ if not enable_external_redis():
+ assert session_dir != new_session_dir
+ else:
+ assert session_dir == new_session_dir
+
+
 @pytest.mark.parametrize(
  "ray_start_regular_with_external_redis",
  [