[autoscaler] Autoscaler metrics #16066

Merged: 32 commits, Jun 1, 2021
Changes from 1 commit

Commits (32)
46ea81b
initial
ckw017 May 24, 2021
7c7cd1a
sanity check
ckw017 May 24, 2021
f170974
lint and more
ckw017 May 25, 2021
4150bf0
remove extra file?
ckw017 May 25, 2021
771b304
format
ckw017 May 26, 2021
3f11bc6
store ip of machine running monitor process
ckw017 May 27, 2021
eb9edb1
autoscaler_ip -> monitor_ip
ckw017 May 27, 2021
b4130a8
lint
ckw017 May 27, 2021
09a8998
add worker startup time buckets
ckw017 May 27, 2021
849c1ba
better descriptions
ckw017 May 27, 2021
6f58b81
more lint
ckw017 May 27, 2021
9050bab
propagate exception when starting prom http
ckw017 May 27, 2021
7be69c1
lint
ckw017 May 27, 2021
8f682b4
fix redis set/get
ckw017 May 27, 2021
51f7a5c
move start_http to monitor.py
ckw017 May 27, 2021
4259ecd
break up exception types and add pending_nodes metric
ckw017 May 27, 2021
bb9896e
Adjust buckets, fix test_autoscaler failures
ckw017 May 27, 2021
091610a
Add metric_agent tests
ckw017 May 27, 2021
ee005b1
explain _AUTOSCALER_METRICS
ckw017 May 27, 2021
1d82167
add basic exception count checks
ckw017 May 27, 2021
a43b38e
more autoscaler metric tests
ckw017 May 28, 2021
25f55dc
less dangerous way to handle no prom_metrics
ckw017 May 28, 2021
f7013c2
more mock checks
ckw017 May 28, 2021
b2bd1e5
better docs
ckw017 May 28, 2021
a0c10f0
nits
ckw017 May 28, 2021
c069b8c
cases for started_nodes and worker_startup_time histogram
ckw017 May 28, 2021
2c18da5
add node_launch_exceptions case
ckw017 May 28, 2021
216060c
use waitFor
ckw017 May 28, 2021
377d2c8
don't start http server if monitor_ip isn't provided
ckw017 May 31, 2021
0f1a6b2
drop worker_startup_time
ckw017 May 31, 2021
a6cc229
lint
ckw017 May 31, 2021
2edcb1a
Hotfix [nodes -> workers] + [count failed nodes as stopped]
ijrsvt May 31, 2021
lint and more
ckw017 committed May 27, 2021
commit f170974bdada3b7c18c9a35998ebb734c43dfd06
2 changes: 2 additions & 0 deletions python/ray/_private/metrics_agent.py
@@ -197,6 +197,8 @@ def get_file_discovery_content(self):
node["MetricsExportPort"]) for node in nodes
if node["alive"] is True
]
# TODO(ckw): how to get autoscaler ip?
# autoscaler_export_addr = "{}:{}".format("????", AUTOSCALER_METRIC_PORT)
return json.dumps([{
"labels": {
"job": "ray"
69 changes: 23 additions & 46 deletions python/ray/autoscaler/_private/autoscaler.py
@@ -31,10 +31,13 @@
from ray.autoscaler._private.util import ConcurrentCounter, validate_config, \
with_head_node_ip, hash_launch_conf, hash_runtime_conf, \
format_info_string
from ray.autoscaler._private.constants import \
AUTOSCALER_MAX_NUM_FAILURES, AUTOSCALER_MAX_LAUNCH_BATCH, \
AUTOSCALER_MAX_CONCURRENT_LAUNCHES, AUTOSCALER_METRIC_PORT, \
AUTOSCALER_UPDATE_INTERVAL_S, AUTOSCALER_HEARTBEAT_TIMEOUT_S
from ray.autoscaler._private.constants import AUTOSCALER_MAX_NUM_FAILURES, \
AUTOSCALER_MAX_LAUNCH_BATCH, AUTOSCALER_MAX_CONCURRENT_LAUNCHES, \
AUTOSCALER_METRIC_PORT, AUTOSCALER_UPDATE_INTERVAL_S, \
AUTOSCALER_HEARTBEAT_TIMEOUT_S
from ray.autoscaler._private.prom_metrics import \
AUTOSCALER_EXCEPTIONS_COUNTER, AUTOSCALER_STOPPED_NODES_COUNTER, \
AUTOSCALER_METRIC_REGISTRY, AUTOSCALER_RUNNING_NODES_GAUGE
from six.moves import queue

logger = logging.getLogger(__name__)
@@ -49,40 +52,9 @@
"AutoscalerSummary",
["active_nodes", "pending_nodes", "pending_launches", "failed_nodes"])

# Metrics
AUTOSCALER_METRIC_REGISTRY = prometheus_client.CollectorRegistry()
prometheus_client.start_http_server(
AUTOSCALER_METRIC_PORT, registry=AUTOSCALER_METRIC_REGISTRY)
WORKER_STARTUP_TIME_HISTOGRAM = prometheus_client.Histogram(
"worker_startup_time_seconds",
"Worker startup time",
unit="seconds",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
NODES_STARTED_COUNTER = prometheus_client.Counter(
"started_nodes",
"Number of nodes started",
unit="nodes",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
NODES_STOPPED_COUNTER = prometheus_client.Counter(
"stopped_nodes",
"Number of nodes stopped",
unit="nodes",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
NUM_EXCEPTIONS_COUNTER = prometheus_client.Counter(
"exceptions",
"Number of exceptions",
unit="exceptions",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
NODES_RUNNING_GAUGE = prometheus_client.Gauge(
"running_nodes",
"Number of nodes running",
unit="nodes",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
# Prevent multiple servers from starting if multiple autoscalers
# are instantiated for some reason
_metric_server_started = False


class StandardAutoscaler:
@@ -148,7 +120,6 @@ def __init__(self,
queue=self.launch_queue,
index=i,
pending=self.pending_launches,
startup_histogram=WORKER_STARTUP_TIME_HISTOGRAM,
node_types=self.available_node_types,
)
node_launcher.daemon = True
@@ -169,13 +140,20 @@ def __init__(self,
for local_path in self.config["file_mounts"].values():
assert os.path.exists(local_path)

global _metric_server_started
if not _metric_server_started:
_metric_server_started = True
prometheus_client.start_http_server(
AUTOSCALER_METRIC_PORT, registry=AUTOSCALER_METRIC_REGISTRY)

logger.info("StandardAutoscaler: {}".format(self.config))

def update(self):
try:
self.reset(errors_fatal=False)
self._update()
except Exception as e:
AUTOSCALER_EXCEPTIONS_COUNTER.inc()
logger.exception("StandardAutoscaler: "
"Error during autoscaling.")
# Don't abort the autoscaler if the K8s API server is down.
@@ -199,7 +177,6 @@ def _update(self):

self.last_update_time = now
nodes = self.workers()
NODES_RUNNING_GAUGE.set(len(nodes))

self.load_metrics.prune_active_ips([
self.provider.internal_ip(node_id)
@@ -255,9 +232,8 @@ def _update(self):
self.provider.terminate_nodes(nodes_to_terminate)
for node in nodes_to_terminate:
self.node_tracker.untrack(node)
NODES_STOPPED_COUNTER.inc()
AUTOSCALER_STOPPED_NODES_COUNTER.inc()
nodes = self.workers()
NODES_RUNNING_GAUGE.set(len(nodes))

# Terminate nodes if there are too many
nodes_to_terminate = []
@@ -277,9 +253,8 @@ def _update(self):
self.provider.terminate_nodes(nodes_to_terminate)
for node in nodes_to_terminate:
self.node_tracker.untrack(node)
NODES_STOPPED_COUNTER.inc()
AUTOSCALER_STOPPED_NODES_COUNTER.inc()
nodes = self.workers()
NODES_RUNNING_GAUGE.set(len(nodes))

to_launch = self.resource_demand_scheduler.get_nodes_to_launch(
Contributor

We should also increment self.prom_metrics.stopped_nodes.inc() here:
[Screenshot from May 31, 2021 showing the code location referenced above]
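
For reference, a minimal sketch of the consistency the comment is asking for: every code path that terminates workers should bump the same counter. This is illustrative only; the helper name and the duck-typed node_tracker argument are assumptions, not Ray APIs.

from prometheus_client import CollectorRegistry, Counter

registry = CollectorRegistry()
stopped_nodes = Counter(
    "stopped_nodes",
    "Number of nodes stopped",
    unit="nodes",
    namespace="autoscaler",
    registry=registry)

def untrack_and_count_stopped(node_tracker, nodes_to_terminate):
    # Shared helper so the "stopped nodes" metric stays consistent across
    # every termination path, including the one shown in the screenshot.
    for node in nodes_to_terminate:
        node_tracker.untrack(node)  # same bookkeeping as the existing paths
        stopped_nodes.inc()         # one increment per terminated node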

self.provider.non_terminated_nodes(tag_filters={}),
@@ -291,11 +266,9 @@ def _update(self):
ensure_min_cluster_size=self.load_metrics.get_resource_requests())
for node_type, count in to_launch.items():
self.launch_new_node(count, node_type=node_type)
NODES_STARTED_COUNTER.inc()

if to_launch:
nodes = self.workers()
NODES_RUNNING_GAUGE.set(len(nodes))

# Process any completed updates
completed_nodes = []
@@ -514,6 +487,7 @@ def reset(self, errors_fatal=False):
try:
validate_config(new_config)
except Exception as e:
AUTOSCALER_EXCEPTIONS_COUNTER.inc()
logger.debug(
"Cluster config validation failed. The version of "
"the ray CLI you launched this cluster with may "
@@ -577,6 +551,7 @@ def reset(self, errors_fatal=False):
upscaling_speed)

except Exception as e:
AUTOSCALER_EXCEPTIONS_COUNTER.inc()
if errors_fatal:
raise e
else:
@@ -782,6 +757,8 @@ def all_workers(self):
def workers(self):
nodes = self.provider.non_terminated_nodes(
tag_filters={TAG_RAY_NODE_KIND: NODE_KIND_WORKER})
# Update running nodes gauge whenever we check workers
AUTOSCALER_RUNNING_NODES_GAUGE.set(len(nodes))
return nodes

def unmanaged_workers(self):
1 change: 1 addition & 0 deletions python/ray/autoscaler/_private/constants.py
@@ -61,6 +61,7 @@ def env_integer(key, default):
# to run.
AUTOSCALER_MAX_RESOURCE_DEMAND_VECTOR_SIZE = 1000

# Port that autoscaler prometheus metrics will be exported to
AUTOSCALER_METRIC_PORT = env_integer("AUTOSCALER_METRIC_PORT", 44217)

# Max number of retries to AWS (default is 5, time increases exponentially)
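With the exporter listening on this port on the machine that runs the monitor, the endpoint can be spot-checked with a plain HTTP GET. A hedged example, assuming a local head node and that the metrics server was actually started:

import urllib.request

AUTOSCALER_METRIC_PORT = 44217  # default defined in constants.py above

def fetch_autoscaler_metrics(host="localhost", port=AUTOSCALER_METRIC_PORT):
    # Return the raw Prometheus text exposition from the autoscaler endpoint.
    url = "http://{}:{}/metrics".format(host, port)
    with urllib.request.urlopen(url, timeout=5) as response:
        return response.read().decode("utf-8")

if __name__ == "__main__":
    for line in fetch_autoscaler_metrics().splitlines():
        # Metrics added by this PR are namespaced under "autoscaler_".
        if line.startswith("autoscaler_"):
            print(line)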
12 changes: 7 additions & 5 deletions python/ray/autoscaler/_private/node_launcher.py
@@ -9,6 +9,9 @@
TAG_RAY_USER_NODE_TYPE, STATUS_UNINITIALIZED,
NODE_KIND_WORKER)
from ray.autoscaler._private.util import hash_launch_conf
from ray.autoscaler._private.prom_metrics import (
AUTOSCALER_EXCEPTIONS_COUNTER, AUTOSCALER_WORKER_STARTUP_TIME_HISTOGRAM,
AUTOSCALER_STARTED_NODES_COUNTER)

logger = logging.getLogger(__name__)

@@ -20,15 +23,13 @@ def __init__(self,
provider,
queue,
pending,
startup_histogram=None,
node_types=None,
index=None,
*args,
**kwargs):
self.queue = queue
self.pending = pending
self.provider = provider
self.startup_histogram = startup_histogram
self.node_types = node_types
self.index = str(index) if index is not None else ""
super(NodeLauncher, self).__init__(*args, **kwargs)
@@ -63,9 +64,9 @@ def _launch_node(self, config: Dict[str, Any], count: int,
launch_start_time = time.time()
self.provider.create_node(node_config, node_tags, count)
startup_time = time.time() - launch_start_time
Contributor

I don't think this reflects startup time. For most (all?) providers, create_node sends a non-blocking API call to provision compute. Startup time is the time from create_node to completion of the ray start commands on the node, which in theory one could slightly overestimate as the time between the autoscaler sticking the node into the launch queue and the node's first NodeUpdater thread completing.

Not sure how easy that is to measure.

if self.startup_histogram:
for _ in range(count):
self.startup_histogram.observe(startup_time)
for _ in range(count):
AUTOSCALER_WORKER_STARTUP_TIME_HISTOGRAM.observe(startup_time)
AUTOSCALER_STARTED_NODES_COUNTER.inc(count)
after = self.provider.non_terminated_nodes(tag_filters=worker_filter)
if set(after).issubset(before):
self.log("No new nodes reported after node creation.")
@@ -77,6 +78,7 @@ def run(self):
try:
self._launch_node(config, count, node_type)
except Exception:
AUTOSCALER_EXCEPTIONS_COUNTER.inc()
logger.exception("Launch failed")
finally:
self.pending.dec(node_type, count)
33 changes: 33 additions & 0 deletions python/ray/autoscaler/_private/prom_metrics.py
@@ -0,0 +1,33 @@
import prometheus_client

AUTOSCALER_METRIC_REGISTRY = prometheus_client.CollectorRegistry()
AUTOSCALER_WORKER_STARTUP_TIME_HISTOGRAM = prometheus_client.Histogram(
"worker_startup_time_seconds",
"Worker startup time",
unit="seconds",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
AUTOSCALER_STARTED_NODES_COUNTER = prometheus_client.Counter(
"started_nodes",
"Number of nodes started",
unit="nodes",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
AUTOSCALER_STOPPED_NODES_COUNTER = prometheus_client.Counter(
"stopped_nodes",
"Number of nodes stopped",
unit="nodes",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
AUTOSCALER_RUNNING_NODES_GAUGE = prometheus_client.Gauge(
"running_nodes",
"Number of nodes running",
unit="nodes",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
AUTOSCALER_EXCEPTIONS_COUNTER = prometheus_client.Counter(
"exceptions",
"Number of exceptions",
unit="exceptions",
namespace="autoscaler",
registry=AUTOSCALER_METRIC_REGISTRY)
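
At this commit the registry above is exposed directly from autoscaler.py, guarded so the HTTP server starts at most once; later commits in this PR move the start_http_server call into monitor.py and skip it when no monitor_ip is provided. A condensed sketch of the exposure as it stands here (the helper function is only a restatement of the inline code in the diff above):

import prometheus_client

from ray.autoscaler._private.constants import AUTOSCALER_METRIC_PORT
from ray.autoscaler._private.prom_metrics import AUTOSCALER_METRIC_REGISTRY

_metric_server_started = False

def maybe_start_metric_server():
    # Start the Prometheus exporter at most once, even if several
    # StandardAutoscaler instances are created in the same process.
    global _metric_server_started
    if not _metric_server_started:
        _metric_server_started = True
        prometheus_client.start_http_server(
            AUTOSCALER_METRIC_PORT, registry=AUTOSCALER_METRIC_REGISTRY)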