-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Metrics] Add easy-install script for Prometheus #42359
Changes from 15 commits
5205a26
9513b40
7e8ff75
be86fbd
8eb57c6
8365afd
fe84cfb
71813e9
278987f
8bcf6f5
eeddffb
e6be641
aea6c71
de64e44
824535b
c7311f7
1024b2a
243a8fb
551ddf7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
import logging | ||
import os | ||
import platform | ||
import subprocess | ||
import sys | ||
import tarfile | ||
from pathlib import Path | ||
|
||
import requests | ||
|
||
from ray.dashboard.consts import PROMETHEUS_CONFIG_INPUT_PATH | ||
|
||
# Emit timestamped records at INFO level and above from this script.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Version used when the latest release cannot be looked up.
FALLBACK_PROMETHEUS_VERSION = "2.48.1"
# Chunk size, in bytes, used when streaming the download (8 KB).
DOWNLOAD_BLOCK_SIZE = 8 * 1024
# Name of the environment variable that puts download_file in test mode.
TEST_MODE_ENV_VAR = "RAY_PROMETHEUS_DOWNLOAD_TEST_MODE"
|
||
|
||
def get_system_info():
    """Return the ``(os_type, architecture)`` pair used in release file names.

    ``os_type`` is ``platform.system()`` lower-cased. ``architecture`` is
    ``platform.machine()`` translated to the spelling used in Prometheus
    release file names: ``x86_64``/``amd64`` become ``amd64`` and
    ``aarch64``/``arm64`` become ``arm64`` (compared case-insensitively).
    Any other machine name is returned unchanged.

    Returns:
        A ``(os_type, architecture)`` tuple of strings.
    """
    os_type = platform.system().lower()
    machine = platform.machine()
    # The same CPU family can be reported under different spellings
    # (e.g. "x86_64" vs. "AMD64", "aarch64" vs. "arm64"), while the release
    # file names only use "amd64" and "arm64".
    release_spellings = {
        "x86_64": "amd64",
        "amd64": "amd64",
        "aarch64": "arm64",
        "arm64": "arm64",
    }
    architecture = release_spellings.get(machine.lower(), machine)
    return os_type, architecture
|
||
|
||
def download_file(url, filename, timeout=30):
    """Stream ``url`` into ``filename`` while printing a progress indicator.

    When the environment variable named by ``TEST_MODE_ENV_VAR`` is set to an
    enabling value, a HEAD request is sent instead of a GET, so no body is
    transferred.

    Args:
        url: Address of the file to fetch.
        filename: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the request succeeded and the body was written, False if the
        request or the write to disk failed (the error is logged).
    """
    logging.info(f"Downloading {url} to {Path(filename).absolute()}...")

    # Any non-empty string is truthy, so values such as "0" or "false" have
    # to be rejected explicitly.
    raw_test_mode = os.environ.get(TEST_MODE_ENV_VAR, "")
    test_mode = raw_test_mode.strip().lower() not in ("", "0", "false", "no", "off")
    request_method = requests.head if test_mode else requests.get
    bytes_per_mb = 1024 * 1024

    try:
        # Using the response as a context manager releases the streamed
        # connection even if writing fails part-way through.
        with request_method(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            total_size_in_bytes = int(response.headers.get("content-length", 0))
            total_size_in_mb = total_size_in_bytes / bytes_per_mb
            downloaded_size_in_bytes = 0

            with open(filename, "wb") as file:
                for chunk in response.iter_content(chunk_size=DOWNLOAD_BLOCK_SIZE):
                    file.write(chunk)
                    downloaded_size_in_bytes += len(chunk)
                    # "\r" rewrites the same terminal line on every chunk.
                    print(
                        f"Downloaded: {downloaded_size_in_bytes / bytes_per_mb:.2f} MB / "
                        f"{total_size_in_mb:.2f} MB",
                        end="\r",
                    )

        print("\nDownload completed.")
        return True

    except requests.RequestException as e:
        logging.error(f"Error downloading file: {e}")
        return False
    except OSError as e:
        # Failing to create or write the local file is reported the same way
        # as a network failure so callers only have to check the flag.
        logging.error(f"Error writing downloaded file: {e}")
        return False
|
||
|
||
def install_prometheus(file_path):
    """Unpack the Prometheus release archive into the current directory.

    Args:
        file_path: Path to the downloaded tar archive.

    Returns:
        True if the archive was extracted, False if opening or extracting it
        raised an exception (the error is logged).
    """
    try:
        with tarfile.open(file_path) as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive was fetched over the network: on Pythons with
                # extraction filters, reject members that would land outside
                # the destination directory.
                tar.extractall(filter="data")
            else:
                tar.extractall()
        logging.info("Prometheus installed successfully.")
        return True
    except Exception as e:
        logging.error(f"Error installing Prometheus: {e}")
        return False
|
||
|
||
def start_prometheus(prometheus_dir):
    """Launch the ``prometheus`` executable located in ``prometheus_dir``.

    The process is started with ``--config.file`` pointing at
    ``PROMETHEUS_CONFIG_INPUT_PATH`` and is not waited on.

    Args:
        prometheus_dir: Directory that contains the ``prometheus`` executable.

    Returns:
        The ``subprocess.Popen`` handle, or None if starting the process
        raised an exception (the error is logged).

    Raises:
        FileNotFoundError: If the config file does not exist.
    """
    config_path = Path(PROMETHEUS_CONFIG_INPUT_PATH)

    # NOTE(review): per the review discussion on this change, this config
    # file is only created once Ray starts, while this script runs before
    # Ray; moving the config generation into this file was suggested.
    if not config_path.exists():
        raise FileNotFoundError(f"Prometheus config file not found: {config_path}")

    executable = f"{prometheus_dir}/prometheus"
    launch_args = [executable, "--config.file", str(config_path)]
    try:
        prometheus_process = subprocess.Popen(launch_args)
        logging.info("Prometheus has started.")
    except Exception as e:
        logging.error(f"Failed to start Prometheus: {e}")
        return None
    return prometheus_process
|
||
|
||
def print_shutdown_message(process_id):
    """Print how to stop the Prometheus process and how to list such processes.

    Args:
        process_id: PID that is interpolated into the suggested ``kill``
            commands.
    """
    stop_instructions = [
        f"Prometheus is running with PID {process_id}.",
        (
            "To stop Prometheus, use the command: "
            f"'kill {process_id}', or if you need to force stop, "
            f"use 'kill -9 {process_id}'."
        ),
    ]
    print("\n".join(stop_instructions))

    print(
        "To list all processes running Prometheus, use the command: "
        "'ps aux | grep prometheus'."
    )
|
||
|
||
def get_latest_prometheus_version(timeout=30):
    """Look up the tag of the latest Prometheus release on GitHub.

    Args:
        timeout: Seconds to wait for the GitHub API before giving up.

    Returns:
        The release tag without its leading ``v`` (e.g. ``"2.48.1"``), or
        None if the request failed or the response did not contain a usable
        ``tag_name`` (the error is logged).
    """
    url = "https://api.github.com/repos/prometheus/prometheus/releases/latest"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        tag_name = response.json()["tag_name"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # A malformed payload must also end up as None so that callers can
        # fall back to a known version instead of crashing.
        logging.error(f"Error fetching latest Prometheus version: {e}")
        return None
    # Remove the leading 'v' from the version number
    return tag_name.lstrip("v")
|
||
|
||
def get_prometheus_filename(os_type=None, architecture=None, prometheus_version=None):
    """Build the name of the Prometheus release archive.

    Args:
        os_type: OS part of the file name; detected via ``get_system_info``
            when None.
        architecture: Architecture part of the file name; detected via
            ``get_system_info`` when None.
        prometheus_version: Version to use; when None the latest release is
            looked up, falling back to ``FALLBACK_PROMETHEUS_VERSION`` if the
            lookup fails.

    Returns:
        A ``(file_name, prometheus_version)`` tuple, where the version is the
        one actually used in the file name.
    """
    if os_type is None or architecture is None:
        detected_os_type, detected_architecture = get_system_info()
        # Only fill in what the caller left out; an explicitly passed value
        # must not be overwritten by the detected one.
        if os_type is None:
            os_type = detected_os_type
        if architecture is None:
            architecture = detected_architecture

    if prometheus_version is None:
        prometheus_version = get_latest_prometheus_version()
        if prometheus_version is None:
            logging.warning(
                "Failed to retrieve the latest Prometheus version. Falling "
                f"back to {FALLBACK_PROMETHEUS_VERSION}."
            )
            # Fall back to a hardcoded version
            prometheus_version = FALLBACK_PROMETHEUS_VERSION

    return (
        f"prometheus-{prometheus_version}.{os_type}-{architecture}.tar.gz",
        prometheus_version,
    )
|
||
|
||
def get_prometheus_download_url(
    os_type=None, architecture=None, prometheus_version=None
):
    """Return the GitHub release download URL of the Prometheus archive.

    The arguments are passed through to ``get_prometheus_filename``; the
    archive name and version it returns are combined into the URL.
    """
    archive_name, resolved_version = get_prometheus_filename(
        os_type, architecture, prometheus_version
    )
    releases_root = "https://github.com/prometheus/prometheus/releases/"
    return f"{releases_root}download/v{resolved_version}/{archive_name}"
|
||
|
||
def download_prometheus(os_type=None, architecture=None, prometheus_version=None):
    """Download the Prometheus release archive into the current directory.

    Args:
        os_type: OS part of the release name, or None to detect it.
        architecture: Architecture part of the release name, or None to
            detect it.
        prometheus_version: Version to download, or None for the latest.

    Returns:
        A ``(downloaded, file_name)`` tuple: the success flag returned by
        ``download_file`` and the name of the local archive file.
    """
    file_name, resolved_version = get_prometheus_filename(
        os_type, architecture, prometheus_version
    )
    # Pass on the version resolved above so that the URL names the same
    # release as the file name and the latest-version lookup is not repeated.
    download_url = get_prometheus_download_url(
        os_type, architecture, resolved_version
    )

    return download_file(download_url, file_name), file_name
|
||
|
||
def main():
    """Download, unpack and start Prometheus.

    Exits with status 1 if the download, the installation or the start of
    the process fails; otherwise prints how to stop the started process.
    """
    logging.warning("This script is not intended for production use.")

    downloaded, file_name = download_prometheus()
    if not downloaded:
        logging.error("Failed to download Prometheus.")
        sys.exit(1)

    # TODO: Verify the checksum of the downloaded file

    if not install_prometheus(file_name):
        logging.error("Installation failed.")
        sys.exit(1)

    # TODO: Add a check to see if Prometheus is already running
    # NOTE(review): a simple check of the Prometheus port was suggested in
    # review and deferred to a follow-up; Prometheus reportedly prints a
    # reasonable error itself if the port is already in use.

    archive_suffix = ".tar.gz"
    if not file_name.endswith(archive_suffix):
        logging.error(f"Unexpected archive name: {file_name}")
        sys.exit(1)

    # Slice the suffix off: str.rstrip() removes any trailing run of the
    # given characters, not the literal suffix, and could eat into the name.
    prometheus_dir = file_name[: -len(archive_suffix)]

    process = start_prometheus(prometheus_dir=prometheus_dir)
    if not process:
        # start_prometheus has already logged the reason.
        sys.exit(1)
    print_shutdown_message(process.pid)
|
||
|
||
if __name__ == "__main__": | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import subprocess | ||
import pytest | ||
import sys | ||
|
||
from ray.dashboard.modules.metrics import install_and_start_prometheus | ||
|
||
|
||
@pytest.mark.parametrize(
    "os_type,architecture",
    [
        (os_name, arch)
        for os_name in ("linux", "darwin", "windows")
        for arch in ("amd64", "arm64")
    ],
)
def test_download_prometheus(os_type, architecture, monkeypatch):
    """Check that the download succeeds for each OS/architecture pair."""
    # Test mode makes the script send a HEAD request instead of a GET, which
    # keeps the test fast; only the existence of the URL is being verified.
    monkeypatch.setenv(install_and_start_prometheus.TEST_MODE_ENV_VAR, "True")
    succeeded, _file_name = install_and_start_prometheus.download_prometheus(
        os_type, architecture
    )
    assert succeeded
|
||
|
||
def test_e2e(capsys):
    """Run the script's ``main`` and kill the Prometheus process it starts."""
    install_and_start_prometheus.main()
    stdout = capsys.readouterr().out
    assert "Prometheus is running" in stdout
    # The shutdown message reads "... use the command: 'kill 22790' ...";
    # the PID is the text between the first "kill " and the closing quote.
    text_after_kill = stdout.split("kill ")[1]
    pid_text = text_after_kill.split("'")[0]
    pid = int(pid_text)
    subprocess.run(["kill", str(pid)])
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(pytest.main(["-v", __file__])) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a higher-level library for downloading byte data into files?
This feels more manual than necessary.
I would think something like
library.download_file(url, out_path)
would already exist in one of the libraries we already have in our environment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is one used by Ray Data, but it doesn't provide a percent loading indicator, and I'd like to avoid the Ray Data dependency. I think the percent indicator is useful; otherwise it looks like the script is hanging (50 MB takes anywhere from a few seconds to 1-2 minutes depending on the internet speed). I couldn't find a less manual way of getting a basic progress indicator.