Skip to content

Commit

Permalink
[core][3/3] Use the new standalone runtime env http server. (ray-proj…
Browse files Browse the repository at this point in the history
…ect#37585)

Rewrites agent_manager.cc. Removed its ability to do agent registration (no longer needs registration) and proxying runtime env agent (moved to the runtime_env_agent_client.cc). It will only do agent starting but we will have 2 instances in node_manager starting a dashboard agent and a runtime env agent.
Deletes the runtime env agent python code from dashboard agent.
Deletes the agent registration grpc interface, and the runtime env agent interface.
Starts the standalone runtime env http server in services.py.
Adds the extra port for the server everywhere: in services.py, node.py and gcs.proto.
added 1 more port to Node info: runtime_env_agent_port. Intended to be used with raylet_address, but in some cases (1 test IIRC) we don't have one and it'll be used with node_address
updated all related tests. Most tests used to use dashboard agent's port, now they use runtime env agent's port.

Signed-off-by: NripeshN <[email protected]>
  • Loading branch information
rynewang authored and NripeshN committed Aug 15, 2023
1 parent 44396f0 commit e100c80
Show file tree
Hide file tree
Showing 47 changed files with 673 additions and 1,413 deletions.
59 changes: 2 additions & 57 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -255,58 +255,6 @@ cc_library(
],
)

# Agent manager.
cc_grpc_library(
name = "agent_manager_cc_grpc",
srcs = ["//src/ray/protobuf:agent_manager_proto"],
grpc_only = True,
deps = ["//src/ray/protobuf:agent_manager_cc_proto"],
)

cc_library(
name = "agent_manager_rpc",
hdrs = glob([
"src/ray/rpc/agent_manager/*.h",
]),
copts = COPTS,
strip_include_prefix = "src",
deps = [
":agent_manager_cc_grpc",
":grpc_common_lib",
":ray_common",
"@boost//:asio",
"@com_github_grpc_grpc//:grpc++",
],
)

# runtime env.
cc_grpc_library(
name = "runtime_env_cc_grpc",
srcs = ["//src/ray/protobuf:runtime_env_agent_proto"],
grpc_only = True,
deps = [
"//src/ray/protobuf:agent_manager_cc_proto",
"//src/ray/protobuf:common_cc_proto",
"//src/ray/protobuf:runtime_env_agent_cc_proto",
],
)

cc_library(
name = "runtime_env_rpc",
hdrs = glob([
"src/ray/rpc/runtime_env/*.h",
]),
copts = COPTS,
strip_include_prefix = "src",
deps = [
":grpc_common_lib",
":ray_common",
":runtime_env_cc_grpc",
"@boost//:asio",
"@com_github_grpc_grpc//:grpc++",
],
)

# pubsub.
cc_grpc_library(
name = "pubsub_cc_grpc",
Expand Down Expand Up @@ -581,7 +529,6 @@ cc_library(
copts = COPTS,
strip_include_prefix = "src",
deps = [
":agent_manager_rpc",
":autoscaler_rpc",
":gcs",
":gcs_pub_sub_lib",
Expand All @@ -595,6 +542,7 @@ cc_library(
":raylet_client_lib",
":scheduler",
":worker_rpc",
"//src/ray/protobuf:agent_manager_cc_proto",
"@boost//:bimap",
"@com_github_grpc_grpc//src/proto/grpc/health/v1:health_proto",
"@com_google_absl//absl/container:btree",
Expand Down Expand Up @@ -774,7 +722,6 @@ cc_library(
strip_include_prefix = "src",
visibility = ["//visibility:public"],
deps = [
":agent_manager_rpc",
":gcs",
":gcs_client_lib",
":node_manager_fbs",
Expand All @@ -783,7 +730,6 @@ cc_library(
":plasma_client",
":pubsub_lib",
":ray_common",
":runtime_env_rpc",
":scheduler",
":stats_lib",
":worker_rpc",
Expand Down Expand Up @@ -825,7 +771,6 @@ cc_library(
strip_include_prefix = "src",
visibility = ["//visibility:public"],
deps = [
":agent_manager_rpc",
":node_manager_fbs",
":node_manager_rpc",
":ray_common",
Expand Down Expand Up @@ -2480,12 +2425,12 @@ cc_library(
copts = COPTS,
strip_include_prefix = "src",
deps = [
":agent_manager_rpc",
":hiredis",
":node_manager_fbs",
":node_manager_rpc",
":ray_common",
":stats_lib",
"//src/ray/protobuf:agent_manager_cc_proto",
"//src/ray/protobuf:gcs_cc_proto",
"//src/ray/protobuf:gcs_service_cc_proto",
"//src/ray/util",
Expand Down
25 changes: 1 addition & 24 deletions dashboard/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
setup_component_logger,
configure_log_file,
)
from ray.core.generated import agent_manager_pb2, agent_manager_pb2_grpc
from ray.experimental.internal_kv import (
_initialize_internal_kv,
_internal_kv_initialized,
Expand Down Expand Up @@ -84,7 +83,6 @@ def __init__(
log_dir: str,
temp_dir: str,
session_dir: str,
runtime_env_dir: str,
logging_params: dict,
agent_id: int,
session_name: str,
Expand All @@ -99,7 +97,6 @@ def __init__(

self.temp_dir = temp_dir
self.session_dir = session_dir
self.runtime_env_dir = runtime_env_dir
self.log_dir = log_dir
self.dashboard_agent_port = dashboard_agent_port
self.metrics_export_port = metrics_export_port
Expand Down Expand Up @@ -321,19 +318,6 @@ async def _check_parent():
namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
)

# Register agent to agent manager.
raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
self.aiogrpc_raylet_channel
)

await raylet_stub.RegisterAgent(
agent_manager_pb2.RegisterAgentRequest(
agent_id=self.agent_id,
agent_port=self.grpc_port,
agent_ip_address=self.ip,
)
)

tasks = [m.run(self.server) for m in modules]
if sys.platform not in ["win32", "cygwin"]:
tasks.append(check_parent_task)
Expand Down Expand Up @@ -466,13 +450,7 @@ def open_capture_files(log_dir):
default=None,
help="Specify the path of this session.",
)
parser.add_argument(
"--runtime-env-dir",
required=True,
type=str,
default=None,
help="Specify the path of the resource directory used by runtime_env.",
)

parser.add_argument(
"--minimal",
action="store_true",
Expand Down Expand Up @@ -530,7 +508,6 @@ def open_capture_files(log_dir):
args.minimal,
temp_dir=args.temp_dir,
session_dir=args.session_dir,
runtime_env_dir=args.runtime_env_dir,
log_dir=args.log_dir,
metrics_export_port=args.metrics_export_port,
node_manager_port=args.node_manager_port,
Expand Down
5 changes: 0 additions & 5 deletions dashboard/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,6 @@
# Default value for datacenter (the default value in protobuf)
DEFAULT_LANGUAGE = "PYTHON"
DEFAULT_JOB_ID = "ffff"
# Cache TTL for bad runtime env. After this time, delete the cache and retry to create
# runtime env if needed.
BAD_RUNTIME_ENV_CACHE_TTL_SECONDS = env_integer(
"BAD_RUNTIME_ENV_CACHE_TTL_SECONDS", 60 * 10
)
# Hook that is invoked on the dashboard `/api/component_activities` endpoint.
# Environment variable stored here should be a callable that does not
# take any arguments and should return a dictionary mapping
Expand Down
2 changes: 1 addition & 1 deletion dashboard/modules/log/log_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ async def stream_logs(
yield streamed_log.data

def _verify_node_registered(self, node_id: str):
if node_id not in self.client.get_all_registered_agent_ids():
if node_id not in self.client.get_all_registered_log_agent_ids():
raise DataSourceUnavailable(
f"Given node id {node_id} is not available. "
"It's either the node is dead, or it is not registered. "
Expand Down
Empty file.
Loading

0 comments on commit e100c80

Please sign in to comment.