Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Metrics] Add easy-install script for Prometheus #42359

Merged
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions dashboard/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ py_test(
tags = ["exclusive", "team:serve", "minimal"],
)

# Test target whose only source is tests/test_metrics_integration.py; it
# depends on the :conftest target.
# NOTE(review): the test file added alongside this target is listed under
# dashboard/modules/tests/ - confirm that the srcs path below resolves.
py_test(
name = "test_metrics_integration",
size = "medium",
srcs = ["tests/test_metrics_integration.py"],
deps = [":conftest"],
tags = ["exclusive", "team:clusters"],
)

py_test(
name = "test_state_head",
size = "small",
Expand Down
7 changes: 7 additions & 0 deletions dashboard/consts.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from ray._private.ray_constants import env_integer, env_bool

DASHBOARD_LOG_FILENAME = "dashboard.log"
Expand Down Expand Up @@ -79,6 +80,12 @@
"dashboard",
"gcs",
}
# Root of the metrics "export" directory, resolved relative to this file:
# <directory of this module>/modules/metrics/export.
METRICS_INPUT_ROOT = os.path.join(
os.path.dirname(__file__), "modules", "metrics", "export"
)
# Location of prometheus/prometheus.yml underneath METRICS_INPUT_ROOT.
PROMETHEUS_CONFIG_INPUT_PATH = os.path.join(
METRICS_INPUT_ROOT, "prometheus", "prometheus.yml"
)
# Value obtained through env_bool for the environment variable
# RAY_enable_pipe_based_agent_to_parent_health_check; False is passed as the
# default argument.
PARENT_HEALTH_CHECK_BY_PIPE = env_bool(
"RAY_enable_pipe_based_agent_to_parent_health_check", False
)
192 changes: 192 additions & 0 deletions dashboard/modules/metrics/install_and_start_prometheus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import logging
import os
import platform
import subprocess
import sys
import tarfile
from pathlib import Path

import requests

from ray.dashboard.consts import PROMETHEUS_CONFIG_INPUT_PATH

# Configure basic logging: INFO level, with every record rendered as
# "<time> - <level name> - <message>".
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Prometheus version used when the latest release cannot be determined
# (see get_prometheus_filename).
FALLBACK_PROMETHEUS_VERSION = "2.48.1"
# Chunk size handed to response.iter_content while writing the download.
DOWNLOAD_BLOCK_SIZE = 8192 # 8 KB
# Name of the environment variable that, when set to a non-empty value, makes
# download_file issue requests.head instead of requests.get.
TEST_MODE_ENV_VAR = "RAY_PROMETHEUS_DOWNLOAD_TEST_MODE"


def get_system_info():
    """Return the ``(os_type, architecture)`` pair used in Prometheus file names.

    ``os_type`` is ``platform.system()`` in lower case. ``architecture`` is
    ``platform.machine()`` translated to the spelling used in Prometheus
    release archive names; values without a known translation are returned
    unchanged.

    Returns:
        tuple: ``(os_type, architecture)``, both strings.
    """
    # platform.machine() spelling (lower-cased) -> name used in release files.
    release_arch_names = {
        "x86_64": "amd64",
        "amd64": "amd64",  # Windows reports "AMD64"
        "aarch64": "arm64",  # Linux on 64-bit ARM reports "aarch64"
        "arm64": "arm64",
    }
    os_type = platform.system().lower()
    machine = platform.machine()
    architecture = release_arch_names.get(machine.lower(), machine)
    return os_type, architecture


def download_file(url, filename):
logging.info(f"Downloading {url} to {Path(filename).absolute()}...")
try:
test_mode = os.environ.get(TEST_MODE_ENV_VAR, False)
request_method = requests.head if test_mode else requests.get
response = request_method(url, stream=True)
response.raise_for_status()

total_size_in_bytes = int(response.headers.get("content-length", 0))
total_size_in_mb = total_size_in_bytes / (1024 * 1024)

downloaded_size_in_mb = 0
block_size = DOWNLOAD_BLOCK_SIZE

with open(filename, "wb") as file:
for chunk in response.iter_content(chunk_size=block_size):
file.write(chunk)
downloaded_size_in_mb += len(chunk) / (1024 * 1024)
print(
f"Downloaded: {downloaded_size_in_mb:.2f} MB / "
f"{total_size_in_mb:.2f} MB",
end="\r",
)

print("\nDownload completed.")
Comment on lines +37 to +56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a higher level library for downloading byte data into files?
This feels more manual than necessary.

I would think something like library.download_file(url, out_path) would already exist in one of the libraries we already have in our environment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is one used by Ray Data, but it doesn't provide a percent loading indicator, and I'd like to avoid the Ray Data dependency. I think the percent indicator is useful; otherwise it looks like the script is hanging (a 50 MB download takes anywhere from a few seconds to 1-2 minutes, depending on the internet speed). I couldn't find a less manual way of getting a basic progress indicator.

return True

except requests.RequestException as e:
logging.error(f"Error downloading file: {e}")
return False


def install_prometheus(file_path):
    """Extract a Prometheus release archive into the current working directory.

    Args:
        file_path: Path of the tar archive to unpack.

    Returns:
        bool: True when the archive was extracted, False when any error
        occurred (the error is logged, not raised).
    """
    try:
        with tarfile.open(file_path) as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive was fetched from the network, so reject members
                # that would land outside the destination (absolute paths,
                # "..", unsafe links) instead of trusting the file blindly.
                tar.extractall(filter="data")
            else:
                # Interpreter predates tarfile extraction filters.
                tar.extractall()
        logging.info("Prometheus installed successfully.")
        return True
    except Exception as e:
        # Deliberately broad: callers only check the boolean result.
        logging.error(f"Error installing Prometheus: {e}")
        return False


def start_prometheus(prometheus_dir):

config_file = Path(PROMETHEUS_CONFIG_INPUT_PATH)

if not config_file.exists():
raise FileNotFoundError(f"Prometheus config file not found: {config_file}")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This configuration only gets created once ray starts. It might make sense to see if we can move the config generation logic to this file instead since this runs before Ray starts.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed offline:

  • Currently in Ray, "creating the configuration" just consists of copying a hardcoded config from the site-packages Ray directory into some location in /tmp/ray. In this PR, we pull the config file from the first location, so we don't actually have to run Ray first.
  • However, if in the future we make a change to Ray that makes updates to the hardcoded config upon ray start before copying it to /tmp/ray, then install_and_start_prometheus.py won't have the updated config. So we should at the very least add code comments in the install script and in the "Ray creating the config" step to make sure future maintainers take this into account. I'll update the PR with these comments.


prometheus_cmd = [
f"{prometheus_dir}/prometheus",
"--config.file",
str(config_file),
]
try:
process = subprocess.Popen(prometheus_cmd)
logging.info("Prometheus has started.")
return process
except Exception as e:
logging.error(f"Failed to start Prometheus: {e}")
return None


def print_shutdown_message(process_id):
    """Print how to stop the running Prometheus process and how to list others.

    Args:
        process_id: PID of the Prometheus process, interpolated into the
            suggested ``kill`` commands.
    """
    instructions = (
        # How to stop the process that was just started.
        f"Prometheus is running with PID {process_id}.\n"
        f"To stop Prometheus, use the command: 'kill {process_id}', "
        f"or if you need to force stop, use 'kill -9 {process_id}'.",
        # How to find any Prometheus process on the machine.
        "To list all processes running Prometheus, use the command: "
        "'ps aux | grep prometheus'.",
    )
    for text in instructions:
        print(text)


def get_latest_prometheus_version():
    """Look up the latest Prometheus release version through the GitHub API.

    Returns:
        The release tag without its leading "v" (for example "2.48.1"), or
        None when the request fails, times out, or the response does not
        contain a usable ``tag_name``. Failures are logged, not raised.
    """
    url = "https://api.github.com/repos/prometheus/prometheus/releases/latest"
    request_timeout_s = 10
    try:
        # Bound the wait so an unreachable API cannot hang the script; a
        # timeout is a RequestException and is handled below.
        response = requests.get(url, timeout=request_timeout_s)
        response.raise_for_status()
        tag_name = response.json()["tag_name"]
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        # KeyError/TypeError/ValueError cover a payload that is not the
        # expected JSON object with a "tag_name" entry.
        logging.error(f"Error fetching latest Prometheus version: {e}")
        return None
    # Remove the leading 'v' from the version number. Only a single prefix
    # character is dropped; str.lstrip would strip a whole run of "v"s.
    return tag_name[1:] if tag_name.startswith("v") else tag_name


def get_prometheus_filename(os_type=None, architecture=None, prometheus_version=None):
    """Build the Prometheus release archive name for a platform and version.

    Args:
        os_type: Operating system part of the name (e.g. "linux"). Detected
            with get_system_info() when None.
        architecture: Architecture part of the name (e.g. "amd64"). Detected
            with get_system_info() when None.
        prometheus_version: Version without a leading "v". When None, the
            latest release is looked up, falling back to
            FALLBACK_PROMETHEUS_VERSION if that lookup fails.

    Returns:
        tuple: ``(file_name, prometheus_version)`` where ``file_name`` is
        ``prometheus-<version>.<os>-<arch>.tar.gz`` and ``prometheus_version``
        is the version that was actually used.
    """
    if os_type is None or architecture is None:
        detected_os, detected_arch = get_system_info()
        # Fill in only what the caller left out, so an explicitly supplied
        # value is never replaced by the detected one.
        if os_type is None:
            os_type = detected_os
        if architecture is None:
            architecture = detected_arch

    if prometheus_version is None:
        prometheus_version = get_latest_prometheus_version()
        if prometheus_version is None:
            logging.warning(
                "Failed to retrieve the latest Prometheus version. Falling "
                f"back to {FALLBACK_PROMETHEUS_VERSION}."
            )
            # Fall back to a hardcoded version
            prometheus_version = FALLBACK_PROMETHEUS_VERSION

    return (
        f"prometheus-{prometheus_version}.{os_type}-{architecture}.tar.gz",
        prometheus_version,
    )


def get_prometheus_download_url(
    os_type=None, architecture=None, prometheus_version=None
):
    """Return the GitHub release URL of the matching Prometheus archive.

    The arguments are forwarded unchanged to get_prometheus_filename, which
    supplies both the archive name and the version placed in the URL.

    Returns:
        str: The download URL.
    """
    archive_name, resolved_version = get_prometheus_filename(
        os_type, architecture, prometheus_version
    )
    release_path = f"download/v{resolved_version}/{archive_name}"
    return "https://github.com/prometheus/prometheus/releases/" + release_path


def download_prometheus(os_type=None, architecture=None, prometheus_version=None):
    """Download the Prometheus release archive for a platform and version.

    Args:
        os_type: Operating system part of the archive name; detected when None.
        architecture: Architecture part of the archive name; detected when
            None.
        prometheus_version: Version to download; resolved by
            get_prometheus_filename when None.

    Returns:
        tuple: ``(downloaded, file_name)`` where ``downloaded`` is the result
        of download_file and ``file_name`` is the local archive name.
    """
    # Resolve the version once and reuse it for the URL. Looking it up
    # separately for the file name and the URL costs a second network request
    # and can leave the two disagreeing if only one lookup succeeds.
    file_name, resolved_version = get_prometheus_filename(
        os_type, architecture, prometheus_version
    )
    download_url = get_prometheus_download_url(
        os_type, architecture, resolved_version
    )

    return download_file(download_url, file_name), file_name


def main():
logging.warning("This script is not intended for production use.")

downloaded, file_name = download_prometheus()
if not downloaded:
logging.error("Failed to download Prometheus.")
sys.exit(1)

# TODO: Verify the checksum of the downloaded file

if not install_prometheus(file_name):
logging.error("Installation failed.")
sys.exit(1)

# TODO: Add a check to see if Prometheus is already running
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's do this TODO. Maybe just a simple check of the prometheus port?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will file a followup PR for this. I remember Prometheus prints a reasonable error if the port is already in use, so it should be okay.


assert file_name.endswith(".tar.gz")
process = start_prometheus(
# remove the .tar.gz extension
prometheus_dir=file_name.rstrip(".tar.gz")
)
if process:
print_shutdown_message(process.pid)


if __name__ == "__main__":
main()
10 changes: 5 additions & 5 deletions dashboard/modules/metrics/metrics_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
)
import ray.dashboard.optional_utils as dashboard_optional_utils
import ray.dashboard.utils as dashboard_utils
from ray.dashboard.consts import AVAILABLE_COMPONENT_NAMES_FOR_METRICS
from ray.dashboard.consts import (
AVAILABLE_COMPONENT_NAMES_FOR_METRICS,
METRICS_INPUT_ROOT,
PROMETHEUS_CONFIG_INPUT_PATH,
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand All @@ -33,16 +37,12 @@
routes = dashboard_optional_utils.DashboardHeadRouteTable

METRICS_OUTPUT_ROOT_ENV_VAR = "RAY_METRICS_OUTPUT_ROOT"
METRICS_INPUT_ROOT = os.path.join(os.path.dirname(__file__), "export")
METRICS_RECORD_INTERVAL_S = 5

DEFAULT_PROMETHEUS_HOST = "https://localhost:9090"
PROMETHEUS_HOST_ENV_VAR = "RAY_PROMETHEUS_HOST"
DEFAULT_PROMETHEUS_NAME = "Prometheus"
PROMETHEUS_NAME_ENV_VAR = "RAY_PROMETHEUS_NAME"
PROMETHEUS_CONFIG_INPUT_PATH = os.path.join(
METRICS_INPUT_ROOT, "prometheus", "prometheus.yml"
)
PROMETHEUS_HEALTHCHECK_PATH = "-/healthy"

DEFAULT_GRAFANA_HOST = "https://localhost:3000"
Expand Down
41 changes: 41 additions & 0 deletions dashboard/modules/tests/test_metrics_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import subprocess
import pytest
import sys

from ray.dashboard.modules.metrics import install_and_start_prometheus


@pytest.mark.parametrize(
    "os_type,architecture",
    [
        (os_name, cpu_arch)
        for os_name in ("linux", "darwin", "windows")
        for cpu_arch in ("amd64", "arm64")
    ],
)
def test_download_prometheus(os_type, architecture, monkeypatch):
    """Check that the release URL for each OS/architecture pair can be fetched."""
    prometheus_script = install_and_start_prometheus
    # Setting TEST_MODE_ENV_VAR makes the script use requests.head instead of
    # requests.get, which keeps the test fast: only the existence of the URL
    # matters here.
    monkeypatch.setenv(prometheus_script.TEST_MODE_ENV_VAR, "True")
    downloaded, _file_name = prometheus_script.download_prometheus(
        os_type, architecture
    )
    assert downloaded


def test_e2e(capsys):
    """Run the script end to end, then stop the Prometheus process it started."""
    install_and_start_prometheus.main()
    stdout_text = capsys.readouterr().out
    assert "Prometheus is running" in stdout_text
    # The PID is recovered from the printed hint, e.g.
    # "To stop Prometheus, use the command: 'kill 22790'".
    after_kill = stdout_text.split("kill ")[1]
    pid = int(after_kill.split("'")[0])
    # Stop the Prometheus process so it does not outlive the test.
    subprocess.run(["kill", str(pid)])


if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))
Loading
Loading