From 817ebe6ecd2093d8e54b5f14a4d047ac791fdcfa Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Mon, 3 Apr 2023 22:17:35 -0700 Subject: [PATCH 1/3] add telemetry for lightweight config updates Signed-off-by: Cindy Zhang --- python/ray/serve/_private/utils.py | 3 + python/ray/serve/controller.py | 28 +++--- python/ray/serve/tests/test_telemetry.py | 105 +++++++++++++++++++++++ src/ray/protobuf/usage.proto | 6 ++ 4 files changed, 132 insertions(+), 10 deletions(-) diff --git a/python/ray/serve/_private/utils.py b/python/ray/serve/_private/utils.py index 0dabf0a9a1582..3660bf1f1a8e5 100644 --- a/python/ray/serve/_private/utils.py +++ b/python/ray/serve/_private/utils.py @@ -511,6 +511,9 @@ def dict_keys_snake_to_camel_case(snake_dict: dict) -> dict: "SERVE_GRPC_INGRESS_USED": TagKey.SERVE_GRPC_INGRESS_USED, "SERVE_REST_API_VERSION": TagKey.SERVE_REST_API_VERSION, "SERVE_NUM_APPS": TagKey.SERVE_NUM_APPS, + "SERVE_NUM_REPLICAS_UPDATED": TagKey.SERVE_NUM_REPLICAS_UPDATED, + "SERVE_USER_CONFIG_UPDATED": TagKey.SERVE_USER_CONFIG_UPDATED, + "SERVE_AUTOSCALING_CONFIG_UPDATED": TagKey.SERVE_AUTOSCALING_CONFIG_UPDATED, } diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index 25316e5b39f44..9a2b994436363 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -56,6 +56,7 @@ get_random_letters, ) from ray.serve._private.application_state import ApplicationStateManager +from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag logger = logging.getLogger(SERVE_LOGGER_NAME) @@ -866,16 +867,17 @@ def _generate_deployment_config_versions( d["name"]: d for d in last_deployed_config.get("deployments", []) } + lightweight_update_options = { + "num_replicas": TagKey.SERVE_NUM_REPLICAS_UPDATED, + "user_config": TagKey.SERVE_USER_CONFIG_UPDATED, + "autoscaling_config": TagKey.SERVE_AUTOSCALING_CONFIG_UPDATED, + } + def exclude_lightweight_update_options(dict): # Exclude config options from dict that qualify for a lightweight config # update. Changes in any other config options are considered a code change, # and require a version change to trigger an update that tears # down existing replicas and replaces them with updated ones. - lightweight_update_options = [ - "num_replicas", - "user_config", - "autoscaling_config", - ] return { option: dict[option] for option in dict @@ -884,15 +886,21 @@ def exclude_lightweight_update_options(dict): updated_versions = {} for name in new_deployments: - new_deployment = exclude_lightweight_update_options(new_deployments[name]) - old_deployment = exclude_lightweight_update_options( - old_deployments.get(name, {}) - ) + old_deployment = old_deployments.get(name, {}) + new_deployment = new_deployments[name] + new_deployment_filtered = exclude_lightweight_update_options(new_deployment) + old_deployment_filtered = exclude_lightweight_update_options(old_deployment) # If config options haven't changed, version stays the same # otherwise, generate a new random version - if old_deployment == new_deployment: + if old_deployment_filtered == new_deployment_filtered: updated_versions[name] = last_deployed_versions[name] + + # If the rest of the options haven't changed, but a lightweight option has + # changed, then Serve will execute a lightweight update + for option, tagkey in lightweight_update_options.items(): + if old_deployment.get(option) != new_deployment.get(option): + record_extra_usage_tag(tagkey, "True") else: updated_versions[name] = get_random_letters() diff --git a/python/ray/serve/tests/test_telemetry.py b/python/ray/serve/tests/test_telemetry.py index ee8445e3c3802..e0468950f3791 100644 --- a/python/ray/serve/tests/test_telemetry.py +++ b/python/ray/serve/tests/test_telemetry.py @@ -18,6 +18,7 @@ from ray.serve.context import get_global_client from ray.serve._private.common import ApplicationStatus from ray._private.usage.usage_lib import get_extra_usage_tags_to_report +from ray.serve.schema import ServeDeploySchema TELEMETRY_ROUTE_PREFIX = "/telemetry" @@ -382,5 +383,109 @@ def test_rest_api(manage_ray, tmp_dir, version): assert int(report["extra_usage_tags"]["serve_num_deployments"]) == 1 +@serve.deployment(ray_actor_options={"num_cpus": 0}) +class Tester: + def __call__(self, *args): + pass + + def reconfigure(self, *args): + pass + + +tester = Tester.bind() + + +@pytest.mark.parametrize( + "lightweight_option,value", + [ + ("num_replicas", 2), + ("user_config", {"some_setting": 10}), + ("autoscaling_config", {"max_replicas": 5}), + ], +) +def test_lightweight_config_options(manage_ray, lightweight_option, value): + """ + Check that lightweight config options are detected by telemetry. + """ + + lightweight_tagkeys = [ + "serve_num_replicas_updated", + "serve_user_config_updated", + "serve_autoscaling_config_updated", + ] + + subprocess.check_output(["ray", "start", "--head"]) + wait_for_condition(check_ray_started, timeout=5) + storage = TelemetryStorage.remote() + + config = { + "applications": [ + { + "name": "receiver_app", + "import_path": "ray.serve.tests.test_telemetry.receiver_app", + "route_prefix": TELEMETRY_ROUTE_PREFIX, + }, + { + "name": "test_app", + "import_path": "ray.serve.tests.test_telemetry.tester", + "deployments": [{"name": "Tester"}], + }, + ] + } + + # Deploy first config + client = serve.start(detached=True) + client.deploy_apps(ServeDeploySchema(**config)) + wait_for_condition( + lambda: client.get_serve_status("receiver_app").app_status.status + == ApplicationStatus.RUNNING, + timeout=15, + ) + wait_for_condition( + lambda: client.get_serve_status("test_app").app_status.status + == ApplicationStatus.RUNNING, + timeout=15, + ) + wait_for_condition( + lambda: ray.get(storage.get_reports_received.remote()) > 0, timeout=5 + ) + report = ray.get(storage.get_report.remote()) + + # Check + assert int(report["extra_usage_tags"]["serve_num_apps"]) == 2 + assert report["extra_usage_tags"]["serve_api_version"] == "v2" + for tagkey in lightweight_tagkeys: + assert tagkey not in report["extra_usage_tags"] + + # Change config and deploy again + config["applications"][1]["deployments"][0][lightweight_option] = value + client.deploy_apps(ServeDeploySchema(**config)) + wait_for_condition( + lambda: client.get_serve_status("receiver_app").app_status.status + == ApplicationStatus.DEPLOYING, + timeout=15, + ) + wait_for_condition( + lambda: client.get_serve_status("test_app").app_status.status + == ApplicationStatus.RUNNING, + timeout=15, + ) + + # Check again + wait_for_condition( + lambda: ray.get(storage.get_report.remote())["extra_usage_tags"][ + f"serve_{lightweight_option}_updated" + ] + == "True", + timeout=5, + ) + report = ray.get(storage.get_report.remote()) + assert int(report["extra_usage_tags"]["serve_num_apps"]) == 2 + assert report["extra_usage_tags"]["serve_api_version"] == "v2" + for tagkey in lightweight_tagkeys: + if not tagkey == f"serve_{lightweight_option}_updated": + assert tagkey not in report["extra_usage_tags"] + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/src/ray/protobuf/usage.proto b/src/ray/protobuf/usage.proto index 14a188b4d80d9..8fab83c45db64 100644 --- a/src/ray/protobuf/usage.proto +++ b/src/ray/protobuf/usage.proto @@ -60,6 +60,12 @@ enum TagKey { SERVE_REST_API_VERSION = 13; // The number of serve apps running in the cluster as a string. SERVE_NUM_APPS = 14; + // Whether num replicas changed + SERVE_NUM_REPLICAS_UPDATED = 15; + // Whether user config changed + SERVE_USER_CONFIG_UPDATED = 16; + // Whether autoscaling config changed + SERVE_AUTOSCALING_CONFIG_UPDATED = 17; // Ray Core State API // NOTE(rickyxx): Currently only setting "1" for tracking existence of From ee490a0a8ad642258b69800dede920025ebe19ec Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Mon, 3 Apr 2023 22:23:13 -0700 Subject: [PATCH 2/3] improve comments Signed-off-by: Cindy Zhang --- src/ray/protobuf/usage.proto | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ray/protobuf/usage.proto b/src/ray/protobuf/usage.proto index 8fab83c45db64..6f971b37a089b 100644 --- a/src/ray/protobuf/usage.proto +++ b/src/ray/protobuf/usage.proto @@ -60,11 +60,11 @@ enum TagKey { SERVE_REST_API_VERSION = 13; // The number of serve apps running in the cluster as a string. SERVE_NUM_APPS = 14; - // Whether num replicas changed + // Whether num_replicas changed as a lightweight config update SERVE_NUM_REPLICAS_UPDATED = 15; - // Whether user config changed + // Whether user_config changed as a lightweight config update SERVE_USER_CONFIG_UPDATED = 16; - // Whether autoscaling config changed + // Whether autoscaling_config changed as a lightweight config update SERVE_AUTOSCALING_CONFIG_UPDATED = 17; // Ray Core State API From 83ba54deb268f516e11f29e39dc77990c3889d12 Mon Sep 17 00:00:00 2001 From: Cindy Zhang Date: Tue, 4 Apr 2023 10:02:38 -0700 Subject: [PATCH 3/3] address comments - rename tagkeys Signed-off-by: Cindy Zhang --- python/ray/serve/_private/utils.py | 12 +++++++++--- python/ray/serve/controller.py | 6 +++--- python/ray/serve/tests/test_telemetry.py | 10 +++++----- src/ray/protobuf/usage.proto | 6 +++--- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/python/ray/serve/_private/utils.py b/python/ray/serve/_private/utils.py index 3660bf1f1a8e5..3fb3e932f8a1b 100644 --- a/python/ray/serve/_private/utils.py +++ b/python/ray/serve/_private/utils.py @@ -511,9 +511,15 @@ def dict_keys_snake_to_camel_case(snake_dict: dict) -> dict: "SERVE_GRPC_INGRESS_USED": TagKey.SERVE_GRPC_INGRESS_USED, "SERVE_REST_API_VERSION": TagKey.SERVE_REST_API_VERSION, "SERVE_NUM_APPS": TagKey.SERVE_NUM_APPS, - "SERVE_NUM_REPLICAS_UPDATED": TagKey.SERVE_NUM_REPLICAS_UPDATED, - "SERVE_USER_CONFIG_UPDATED": TagKey.SERVE_USER_CONFIG_UPDATED, - "SERVE_AUTOSCALING_CONFIG_UPDATED": TagKey.SERVE_AUTOSCALING_CONFIG_UPDATED, + "SERVE_NUM_REPLICAS_LIGHTWEIGHT_UPDATED": ( + TagKey.SERVE_NUM_REPLICAS_LIGHTWEIGHT_UPDATED + ), + "SERVE_USER_CONFIG_LIGHTWEIGHT_UPDATED": ( + TagKey.SERVE_USER_CONFIG_LIGHTWEIGHT_UPDATED + ), + "SERVE_AUTOSCALING_CONFIG_LIGHTWEIGHT_UPDATED": ( + TagKey.SERVE_AUTOSCALING_CONFIG_LIGHTWEIGHT_UPDATED + ), } diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index 9a2b994436363..4bfe295270118 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -868,9 +868,9 @@ def _generate_deployment_config_versions( } lightweight_update_options = { - "num_replicas": TagKey.SERVE_NUM_REPLICAS_UPDATED, - "user_config": TagKey.SERVE_USER_CONFIG_UPDATED, - "autoscaling_config": TagKey.SERVE_AUTOSCALING_CONFIG_UPDATED, + "num_replicas": TagKey.SERVE_NUM_REPLICAS_LIGHTWEIGHT_UPDATED, + "user_config": TagKey.SERVE_USER_CONFIG_LIGHTWEIGHT_UPDATED, + "autoscaling_config": TagKey.SERVE_AUTOSCALING_CONFIG_LIGHTWEIGHT_UPDATED, } def exclude_lightweight_update_options(dict): diff --git a/python/ray/serve/tests/test_telemetry.py b/python/ray/serve/tests/test_telemetry.py index e0468950f3791..1c9575fb2d176 100644 --- a/python/ray/serve/tests/test_telemetry.py +++ b/python/ray/serve/tests/test_telemetry.py @@ -409,9 +409,9 @@ def test_lightweight_config_options(manage_ray, lightweight_option, value): """ lightweight_tagkeys = [ - "serve_num_replicas_updated", - "serve_user_config_updated", - "serve_autoscaling_config_updated", + "serve_num_replicas_lightweight_updated", + "serve_user_config_lightweight_updated", + "serve_autoscaling_config_lightweight_updated", ] subprocess.check_output(["ray", "start", "--head"]) @@ -474,7 +474,7 @@ def test_lightweight_config_options(manage_ray, lightweight_option, value): # Check again wait_for_condition( lambda: ray.get(storage.get_report.remote())["extra_usage_tags"][ - f"serve_{lightweight_option}_updated" + f"serve_{lightweight_option}_lightweight_updated" ] == "True", timeout=5, @@ -483,7 +483,7 @@ def test_lightweight_config_options(manage_ray, lightweight_option, value): assert int(report["extra_usage_tags"]["serve_num_apps"]) == 2 assert report["extra_usage_tags"]["serve_api_version"] == "v2" for tagkey in lightweight_tagkeys: - if not tagkey == f"serve_{lightweight_option}_updated": + if not tagkey == f"serve_{lightweight_option}_lightweight_updated": assert tagkey not in report["extra_usage_tags"] diff --git a/src/ray/protobuf/usage.proto b/src/ray/protobuf/usage.proto index 6f971b37a089b..0f5197cf1529d 100644 --- a/src/ray/protobuf/usage.proto +++ b/src/ray/protobuf/usage.proto @@ -61,11 +61,11 @@ enum TagKey { // The number of serve apps running in the cluster as a string. SERVE_NUM_APPS = 14; // Whether num_replicas changed as a lightweight config update - SERVE_NUM_REPLICAS_UPDATED = 15; + SERVE_NUM_REPLICAS_LIGHTWEIGHT_UPDATED = 15; // Whether user_config changed as a lightweight config update - SERVE_USER_CONFIG_UPDATED = 16; + SERVE_USER_CONFIG_LIGHTWEIGHT_UPDATED = 16; // Whether autoscaling_config changed as a lightweight config update - SERVE_AUTOSCALING_CONFIG_UPDATED = 17; + SERVE_AUTOSCALING_CONFIG_LIGHTWEIGHT_UPDATED = 17; // Ray Core State API // NOTE(rickyxx): Currently only setting "1" for tracking existence of