Skip to content

Commit

Permalink
[core] Make GCS python client retry number and timeout tunable (ray-p…
Browse files Browse the repository at this point in the history
…roject#39650)


## Why are these changes needed?
When the GCS is overloaded, it might not response in time. For example, we might get error like this:

```
[2023-09-13 16:00:17,030 C 110 110] gcs_client.cc:153:  Check failed: (_left_ != _right_)  0 vs 0
*** StackTrace Information ***
/home/ray/anaconda3/lib/python3.11/site-packages/ray/_raylet.so(+0xfde20a) [0x7f7e422f920a] ray::operator<<()
/home/ray/anaconda3/lib/python3.11/site-packages/ray/_raylet.so(+0xfdfcf2) [0x7f7e422facf2] ray::SpdLogMessage::Flush()
/home/ray/anaconda3/lib/python3.11/site-packages/ray/_raylet.so(_ZN3ray6RayLogD1Ev+0x37) [0x7f7e422fb007] ray::RayLog::~RayLog()
/home/ray/anaconda3/lib/python3.11/site-packages/ray/_raylet.so(+0x8afd16) [0x7f7e41bcad16] ray::gcs::(anonymous namespace)::HandleGcsError()
/home/ray/anaconda3/lib/python3.11/site-packages/ray/_raylet.so(_ZN3ray3gcs15PythonGcsClient7ConnectERKNS_9ClusterIDElm+0x3ca) [0x7f7e41bd0e1a] ray::gcs::PythonGcsClient::Connect()
/home/ray/anaconda3/lib/python3.11/site-packages/ray/_raylet.so(+0x622700) [0x7f7e4193d700] __pyx_pw_3ray_7_raylet_9GcsClient_3_connect()
/home/ray/anaconda3/bin/python() [0x525877] cfunction_call
/home/ray/anaconda3/lib/python3.11/site-packages/ray/_raylet.so(+0x5b6bbf) [0x7f7e418d1bbf] __Pyx__PyObject_CallOneArg()
/home/ray/anaconda3/lib/python3.11/site-packages/ray/_raylet.so(+0x656eb5) [0x7f7e41971eb5] __pyx_tp_new_3ray_7_raylet_GcsClient()
/home/ray/anaconda3/bin/python(_PyObject_MakeTpCall+0x182) [0x502662] _PyObject_MakeTpCall
/home/ray/anaconda3/bin/python(_PyEval_EvalFrameDefault+0x758) [0x50eaa8] _PyEval_EvalFrameDefault
/home/ray/anaconda3/bin/python(_PyFunction_Vectorcall+0x173) [0x535113] _PyFunction_Vectorcall
```

This PR makes these parameter tunable.
  • Loading branch information
fishbone committed Sep 18, 2023
1 parent 6aa4ad9 commit d2ec5a2
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 2 deletions.
7 changes: 5 additions & 2 deletions python/ray/_raylet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2388,7 +2388,10 @@ cdef class GcsClient:
object _nums_reconnect_retry
CClusterID cluster_id

def __cinit__(self, address, nums_reconnect_retry=5, cluster_id=None):
def __cinit__(self, address,
nums_reconnect_retry=RayConfig.instance().nums_py_gcs_reconnect_retry(
),
cluster_id=None):
cdef GcsClientOptions gcs_options = GcsClientOptions.from_gcs_address(address)
self.inner.reset(new CPythonGcsClient(dereference(gcs_options.native())))
self.address = address
Expand All @@ -2399,7 +2402,7 @@ cdef class GcsClient:
else:
c_cluster_id = cluster_id
self.cluster_id = CClusterID.FromHex(c_cluster_id)
self._connect(5)
self._connect(RayConfig.instance().py_gcs_connect_timeout_s())

def _connect(self, timeout_s=None):
cdef:
Expand Down
4 changes: 4 additions & 0 deletions python/ray/includes/ray_config.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,7 @@ cdef extern from "ray/common/ray_config.h" nogil:
int64_t grpc_client_keepalive_timeout_ms() const

c_bool enable_autoscaler_v2() const

int64_t nums_py_gcs_reconnect_retry() const

int64_t py_gcs_connect_timeout_s() const
8 changes: 8 additions & 0 deletions python/ray/includes/ray_config.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,11 @@ cdef class Config:
@staticmethod
def enable_autoscaler_v2():
return RayConfig.instance().enable_autoscaler_v2()

@staticmethod
def nums_py_gcs_reconnect_retry():
return RayConfig.instance().nums_py_gcs_reconnect_retry()

@staticmethod
def py_gcs_connect_timeout_s():
return RayConfig.instance().py_gcs_connect_timeout_s()
2 changes: 2 additions & 0 deletions src/ray/common/ray_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ RayConfig &RayConfig::instance() {
return config;
}

RayConfig::RayConfig() { initialize(""); }

void RayConfig::initialize(const std::string &config_list) {
#define RAY_CONFIG(type, name, default_value) \
name##_ = ReadEnv<type>("RAY_" #name, #type, default_value);
Expand Down
2 changes: 2 additions & 0 deletions src/ray/common/ray_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ class RayConfig {
void initialize(const std::string &config_list);

private:
RayConfig();

template <typename T>
T ReadEnv(const std::string &name, const std::string &type_string, T default_value) {
auto value = std::getenv(name.c_str());
Expand Down
4 changes: 4 additions & 0 deletions src/ray/common/ray_config_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -853,3 +853,7 @@ RAY_CONFIG(bool, kill_child_processes_on_worker_exit, true)

// If autoscaler v2 is enabled.
RAY_CONFIG(bool, enable_autoscaler_v2, false)

// Python GCS client number of reconnection retry and timeout.
RAY_CONFIG(int64_t, nums_py_gcs_reconnect_retry, 5)
RAY_CONFIG(int64_t, py_gcs_connect_timeout_s, 30)

0 comments on commit d2ec5a2

Please sign in to comment.