Skip to content

Commit

Permalink
[Metrics] Add easy-install script for Prometheus (#42359)
Browse files Browse the repository at this point in the history
This PR simplifies some of the manual steps currently required for first-time users setting up Prometheus with Ray.

This PR adds an easy install script that automatically downloads and runs Prometheus with the appropriate configuration to scrape from Ray. It's for first-time users to run a quick demo of metrics and is not intended for production use.

---------

Signed-off-by: Archit Kulkarni <[email protected]>
  • Loading branch information
architkulkarni committed Jan 25, 2024
1 parent 1926d15 commit 44db192
Show file tree
Hide file tree
Showing 7 changed files with 330 additions and 15 deletions.
8 changes: 8 additions & 0 deletions dashboard/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ py_test(
tags = ["exclusive", "team:serve", "minimal"],
)

# Bazel target that runs tests/test_metrics_integration.py as a medium-sized,
# exclusive test.
# NOTE(review): the test module added in this change appears under
# dashboard/modules/tests/ -- confirm this srcs path resolves from this BUILD file.
py_test(
    name = "test_metrics_integration",
    size = "medium",
    srcs = ["tests/test_metrics_integration.py"],
    deps = [":conftest"],
    tags = ["exclusive", "team:clusters"],
)

py_test(
name = "test_state_head",
size = "small",
Expand Down
7 changes: 7 additions & 0 deletions dashboard/consts.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from ray._private.ray_constants import env_integer, env_bool

DASHBOARD_LOG_FILENAME = "dashboard.log"
Expand Down Expand Up @@ -79,6 +80,12 @@
"dashboard",
"gcs",
}
# Directory "modules/metrics/export", resolved relative to the directory
# containing this file.
METRICS_INPUT_ROOT = os.path.join(
    os.path.dirname(__file__), "modules", "metrics", "export"
)
# Path of the prometheus.yml file located under METRICS_INPUT_ROOT/prometheus.
PROMETHEUS_CONFIG_INPUT_PATH = os.path.join(
    METRICS_INPUT_ROOT, "prometheus", "prometheus.yml"
)
# Read through env_bool from RAY_enable_pipe_based_agent_to_parent_health_check;
# False is passed as the default.
PARENT_HEALTH_CHECK_BY_PIPE = env_bool(
    "RAY_enable_pipe_based_agent_to_parent_health_check", False
)
198 changes: 198 additions & 0 deletions dashboard/modules/metrics/install_and_start_prometheus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import logging
import os
import platform
import subprocess
import sys
import tarfile
from pathlib import Path

import requests

from ray.dashboard.consts import PROMETHEUS_CONFIG_INPUT_PATH

# Configure basic logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

FALLBACK_PROMETHEUS_VERSION = "2.48.1"
DOWNLOAD_BLOCK_SIZE = 8192 # 8 KB
TEST_MODE_ENV_VAR = "RAY_PROMETHEUS_DOWNLOAD_TEST_MODE"


def get_system_info():
    """Return the ``(os_type, architecture)`` pair used in Prometheus file names.

    ``os_type`` is ``platform.system()`` lower-cased. ``architecture`` is
    ``platform.machine()`` translated to the spelling used in Prometheus
    release archives; values without a known translation are returned
    unchanged.

    Returns:
        tuple: ``(os_type, architecture)``, e.g. ``("linux", "amd64")``.
    """
    os_type = platform.system().lower()
    machine = platform.machine()

    # platform.machine() reports the platform's own spelling, which varies
    # ("x86_64" vs "AMD64", "aarch64" vs "arm64"); Prometheus release names
    # use "amd64" and "arm64". Match case-insensitively so every variant of
    # the same architecture maps to one name.
    release_names = {
        "x86_64": "amd64",
        "amd64": "amd64",
        "aarch64": "arm64",
        "arm64": "arm64",
    }
    architecture = release_names.get(machine.lower(), machine)
    return os_type, architecture


def download_file(url, filename, timeout=30):
    """Download ``url`` into ``filename``, printing progress to stdout.

    When the environment variable named by ``TEST_MODE_ENV_VAR`` is set to a
    non-empty value, a HEAD request is sent instead of a GET, so only the
    status of the URL is checked.

    Args:
        url: Address to fetch.
        filename: Local path to write; the file is created or truncated.
        timeout: Seconds handed to ``requests`` as the request timeout, so a
            stalled connection fails instead of hanging forever.

    Returns:
        bool: True if the request and write completed, False if ``requests``
        raised a ``RequestException`` (which is logged).

    Raises:
        OSError: If ``filename`` cannot be opened or written; file errors are
            not caught here.
    """
    logging.info(f"Downloading {url} to {Path(filename).absolute()}...")
    try:
        test_mode = os.environ.get(TEST_MODE_ENV_VAR, False)
        request_method = requests.head if test_mode else requests.get
        # The context manager closes the streamed response (and releases its
        # connection) even if writing the file fails part-way through.
        with request_method(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # A missing content-length header is reported as a total of 0 MB.
            total_size_in_bytes = int(response.headers.get("content-length", 0))
            total_size_in_mb = total_size_in_bytes / (1024 * 1024)

            downloaded_size_in_mb = 0
            block_size = DOWNLOAD_BLOCK_SIZE

            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=block_size):
                    file.write(chunk)
                    downloaded_size_in_mb += len(chunk) / (1024 * 1024)
                    # "\r" rewrites the same terminal line on every chunk.
                    print(
                        f"Downloaded: {downloaded_size_in_mb:.2f} MB / "
                        f"{total_size_in_mb:.2f} MB",
                        end="\r",
                    )

        print("\nDownload completed.")
        return True

    except requests.RequestException as e:
        logging.error(f"Error downloading file: {e}")
        return False


def install_prometheus(file_path):
    """Extract the archive at ``file_path`` into the current working directory.

    Args:
        file_path: Path of a tar archive (any compression ``tarfile.open``
            detects automatically).

    Returns:
        bool: True if extraction succeeded, False if any exception was raised
        (the error is logged).
    """
    try:
        with tarfile.open(file_path) as tar:
            # The archive comes from the network and is not checksum-verified,
            # so refuse members that would escape the destination directory
            # (absolute paths, "..", links pointing outside). The "data"
            # filter only exists on Python versions that ship extraction
            # filters; older interpreters keep the previous behavior.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(filter="data")
            else:
                tar.extractall()
        logging.info("Prometheus installed successfully.")
        return True
    except Exception as e:
        logging.error(f"Error installing Prometheus: {e}")
        return False


def start_prometheus(prometheus_dir):
    """Spawn the ``prometheus`` binary from ``prometheus_dir`` with Ray's config.

    Args:
        prometheus_dir: Directory that contains the ``prometheus`` executable.

    Returns:
        The ``subprocess.Popen`` handle of the started process, or None if
        spawning it raised an exception (which is logged).

    Raises:
        FileNotFoundError: If the config file at
            ``PROMETHEUS_CONFIG_INPUT_PATH`` does not exist.
    """
    # Ray currently never edits this config file; it only copies it to a more
    # user-friendly location (MetricsHead._create_default_prometheus_configs),
    # so pointing Prometheus at the packaged original is sufficient.
    # Should Ray start modifying the file at runtime, switch to the
    # user-friendly copy and reload the config after Ray updates it.
    config_path = Path(PROMETHEUS_CONFIG_INPUT_PATH)
    if not config_path.exists():
        raise FileNotFoundError(f"Prometheus config file not found: {config_path}")

    binary = f"{prometheus_dir}/prometheus"
    command = [binary, "--config.file", str(config_path)]

    try:
        child = subprocess.Popen(command)
        logging.info("Prometheus has started.")
        return child
    except Exception as e:
        logging.error(f"Failed to start Prometheus: {e}")
        return None


def print_shutdown_message(process_id):
    """Print instructions for stopping and locating the Prometheus process.

    Args:
        process_id: PID of the running Prometheus process; it is interpolated
            into the suggested ``kill`` commands.
    """
    pid = process_id
    stop_hint = (
        f"Prometheus is running with PID {pid}.\n"
        "To stop Prometheus, use the command: "
        f"'kill {pid}', or if you need to force stop, "
        f"use 'kill -9 {pid}'."
    )
    list_hint = (
        "To list all processes running Prometheus, use the command: "
        "'ps aux | grep prometheus'."
    )
    # One print per hint, in this order: stop instructions first.
    for hint in (stop_hint, list_hint):
        print(hint)


def get_latest_prometheus_version(timeout=10):
    """Look up the newest Prometheus release tag through the GitHub API.

    Args:
        timeout: Seconds handed to ``requests`` as the request timeout, so an
            unresponsive API cannot block the script indefinitely.

    Returns:
        str or None: The release tag with leading "v" characters removed
        (e.g. ``"2.48.1"``), or None if the request failed or the response
        did not have the expected shape. The failure is logged.
    """
    url = "https://api.github.com/repos/prometheus/prometheus/releases/latest"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        # Remove the leading 'v' from the version number
        return data["tag_name"].lstrip("v")
    except (
        requests.RequestException,
        # A body that is not JSON, or JSON without a string "tag_name", must
        # also yield None so the caller can fall back to a pinned version
        # instead of crashing.
        AttributeError,
        KeyError,
        TypeError,
        ValueError,
    ) as e:
        logging.error(f"Error fetching latest Prometheus version: {e}")
        return None


def get_prometheus_filename(os_type=None, architecture=None, prometheus_version=None):
    """Build the Prometheus release archive name for a platform and version.

    Args:
        os_type: OS part of the file name (e.g. ``"linux"``); detected with
            ``get_system_info`` when None.
        architecture: Architecture part of the file name (e.g. ``"amd64"``);
            detected with ``get_system_info`` when None.
        prometheus_version: Version without a leading "v"; when None the
            latest release is looked up, falling back to
            ``FALLBACK_PROMETHEUS_VERSION`` if the lookup fails.

    Returns:
        tuple: ``(file_name, prometheus_version)`` where ``file_name`` is
        ``prometheus-<version>.<os_type>-<architecture>.tar.gz``.
    """
    if os_type is None or architecture is None:
        detected_os, detected_architecture = get_system_info()
        # Fill in only what the caller left out; an explicitly supplied value
        # must not be replaced by the detected one.
        if os_type is None:
            os_type = detected_os
        if architecture is None:
            architecture = detected_architecture

    if prometheus_version is None:
        prometheus_version = get_latest_prometheus_version()
        if prometheus_version is None:
            logging.warning(
                "Failed to retrieve the latest Prometheus version. Falling "
                f"back to {FALLBACK_PROMETHEUS_VERSION}."
            )
            # Fall back to a hardcoded version
            prometheus_version = FALLBACK_PROMETHEUS_VERSION

    return (
        f"prometheus-{prometheus_version}.{os_type}-{architecture}.tar.gz",
        prometheus_version,
    )


def get_prometheus_download_url(
    os_type=None, architecture=None, prometheus_version=None
):
    """Return the GitHub release download URL of a Prometheus archive.

    The arguments are forwarded unchanged to ``get_prometheus_filename``,
    which supplies the archive name and the version used in the URL.

    Args:
        os_type: OS part of the archive name, or None.
        architecture: Architecture part of the archive name, or None.
        prometheus_version: Version without a leading "v", or None.

    Returns:
        str: The download URL for the resolved version and archive name.
    """
    resolved = get_prometheus_filename(os_type, architecture, prometheus_version)
    archive_name, version = resolved
    release_base = "https://github.com/prometheus/prometheus/releases/"
    return release_base + f"download/v{version}/{archive_name}"


def download_prometheus(os_type=None, architecture=None, prometheus_version=None):
    """Download the Prometheus release archive into the current directory.

    Args:
        os_type: OS part of the archive name, or None to detect it.
        architecture: Architecture part of the archive name, or None to
            detect it.
        prometheus_version: Version without a leading "v", or None to resolve
            the latest release (with a pinned fallback).

    Returns:
        tuple: ``(downloaded, file_name)`` where ``downloaded`` is the boolean
        result of ``download_file`` and ``file_name`` is the archive name the
        download was written to.
    """
    file_name, resolved_version = get_prometheus_filename(
        os_type, architecture, prometheus_version
    )
    # Reuse the version resolved above. Passing the original (possibly None)
    # value would trigger a second lookup whose result could differ from the
    # first, leaving the URL and the local file name out of sync.
    download_url = get_prometheus_download_url(
        os_type, architecture, resolved_version
    )

    return download_file(download_url, file_name), file_name


def main():
    """Download, unpack and start Prometheus with Ray's scrape configuration.

    Exits the interpreter with status 1 if the download, the extraction or
    the process start fails. On success, prints how to stop the started
    process.
    """
    logging.warning("This script is not intended for production use.")

    downloaded, file_name = download_prometheus()
    if not downloaded:
        logging.error("Failed to download Prometheus.")
        sys.exit(1)

    # TODO: Verify the checksum of the downloaded file

    if not install_prometheus(file_name):
        logging.error("Installation failed.")
        sys.exit(1)

    # TODO: Add a check to see if Prometheus is already running

    archive_suffix = ".tar.gz"
    # Checked explicitly rather than with assert, which is stripped under -O.
    if not file_name.endswith(archive_suffix):
        logging.error(f"Unexpected archive name: {file_name}")
        sys.exit(1)

    # The archive unpacks into a directory named after the file without its
    # extension. Slice the suffix off: str.rstrip(".tar.gz") would instead
    # strip any trailing run of the characters ".targz" and could eat into
    # the directory name itself.
    prometheus_dir = file_name[: -len(archive_suffix)]

    process = start_prometheus(prometheus_dir=prometheus_dir)
    if not process:
        # start_prometheus already logged the reason; report the failure
        # through the exit status like the other failure paths do.
        sys.exit(1)
    print_shutdown_message(process.pid)


if __name__ == "__main__":
    main()
13 changes: 8 additions & 5 deletions dashboard/modules/metrics/metrics_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
)
import ray.dashboard.optional_utils as dashboard_optional_utils
import ray.dashboard.utils as dashboard_utils
from ray.dashboard.consts import AVAILABLE_COMPONENT_NAMES_FOR_METRICS
from ray.dashboard.consts import (
AVAILABLE_COMPONENT_NAMES_FOR_METRICS,
METRICS_INPUT_ROOT,
PROMETHEUS_CONFIG_INPUT_PATH,
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand All @@ -33,16 +37,12 @@
routes = dashboard_optional_utils.DashboardHeadRouteTable

METRICS_OUTPUT_ROOT_ENV_VAR = "RAY_METRICS_OUTPUT_ROOT"
METRICS_INPUT_ROOT = os.path.join(os.path.dirname(__file__), "export")
METRICS_RECORD_INTERVAL_S = 5

DEFAULT_PROMETHEUS_HOST = "https://localhost:9090"
PROMETHEUS_HOST_ENV_VAR = "RAY_PROMETHEUS_HOST"
DEFAULT_PROMETHEUS_NAME = "Prometheus"
PROMETHEUS_NAME_ENV_VAR = "RAY_PROMETHEUS_NAME"
PROMETHEUS_CONFIG_INPUT_PATH = os.path.join(
METRICS_INPUT_ROOT, "prometheus", "prometheus.yml"
)
PROMETHEUS_HEALTHCHECK_PATH = "-/healthy"

DEFAULT_GRAFANA_HOST = "https://localhost:3000"
Expand Down Expand Up @@ -309,6 +309,9 @@ def _create_default_prometheus_configs(self):
if os.path.exists(prometheus_config_output_path):
os.remove(prometheus_config_output_path)
os.makedirs(os.path.dirname(prometheus_config_output_path), exist_ok=True)
# Currently Ray directly copies this file without modifying it at runtime.
# If Ray ever modifies this file at runtime, please ensure start_prometheus
# in install_and_start_prometheus.py is updated to reload the config file.
shutil.copy(PROMETHEUS_CONFIG_INPUT_PATH, prometheus_config_output_path)

@dashboard_utils.async_loop_forever(METRICS_RECORD_INTERVAL_S)
Expand Down
41 changes: 41 additions & 0 deletions dashboard/modules/tests/test_metrics_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import subprocess
import pytest
import sys

from ray.dashboard.modules.metrics import install_and_start_prometheus


@pytest.mark.parametrize(
    "os_type,architecture",
    [
        ("linux", "amd64"),
        ("linux", "arm64"),
        ("darwin", "amd64"),
        ("darwin", "arm64"),
        ("windows", "amd64"),
        ("windows", "arm64"),
    ],
)
def test_download_prometheus(os_type, architecture, monkeypatch):
    """Check that a release URL exists for each supported OS/architecture."""
    # With the test-mode variable set, the downloader sends requests.head
    # rather than requests.get, so the check stays fast: only the existence
    # of the URL matters here, not the archive contents.
    test_mode_var = install_and_start_prometheus.TEST_MODE_ENV_VAR
    monkeypatch.setenv(test_mode_var, "True")

    succeeded, _archive_name = install_and_start_prometheus.download_prometheus(
        os_type, architecture
    )
    assert succeeded


def test_e2e(capsys):
    """Run the install script end to end, then stop the started Prometheus."""
    install_and_start_prometheus.main()
    captured = capsys.readouterr()
    assert "Prometheus is running" in captured.out
    # Find the Prometheus process and kill it.
    # Find the PID from the output: "To stop Prometheus, use the command: 'kill 22790'"
    pid = int(captured.out.split("kill ")[1].split("'")[0])
    # check=True turns a failed kill (e.g. Prometheus already exited) into a
    # test failure instead of silently passing.
    subprocess.run(["kill", str(pid)], check=True)


# When executed directly, run this file's tests verbosely with pytest and use
# pytest's return code as the process exit status.
if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
Loading

0 comments on commit 44db192

Please sign in to comment.