Skip to content

Commit

Permalink
feat: use the public dashboard for mainnet health statuses (
Browse files Browse the repository at this point in the history
#362)

* use the public dashboard for mainnet health statuses

* fixing imports and renaming

* clippy fix
  • Loading branch information
NikolaMilosa committed May 9, 2024
1 parent f6bda6e commit 70b4c5f
Show file tree
Hide file tree
Showing 8 changed files with 240 additions and 6 deletions.
2 changes: 1 addition & 1 deletion rs/cli/src/operations/hostos_rollout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use anyhow::anyhow;
use async_recursion::async_recursion;
use futures_util::future::try_join;
use ic_base_types::{NodeId, PrincipalId};
use ic_management_backend::health;
use ic_management_backend::health::{self, HealthStatusQuerier};
use ic_management_backend::proposal::ProposalAgent;
use ic_management_types::{Network, Node, Status, Subnet, UpdateNodesHostosVersionsProposal};
use log::{debug, info};
Expand Down
1 change: 1 addition & 0 deletions rs/ic-management-backend/src/endpoints/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod query_decentralization;
pub mod release;
pub mod subnet;

use crate::health::HealthStatusQuerier;
use crate::{
gitlab_dfinity, health, prometheus, proposal, registry, registry::RegistryState,
release::list_subnets_release_statuses, release::RolloutBuilder,
Expand Down
2 changes: 1 addition & 1 deletion rs/ic-management-backend/src/endpoints/nodes_ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use ic_management_types::requests::{NodeRemoval, NodeRemovalReason, NodesRemoveR
use itertools::Itertools;

use super::*;
use crate::health;
use crate::health::{self, HealthStatusQuerier};
use decentralization::network::Node as DecentralizationNode;

/// Finds all nodes that need to be removed from the network either because
Expand Down
1 change: 1 addition & 0 deletions rs/ic-management-backend/src/endpoints/subnet.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use super::*;
use crate::health::HealthStatusQuerier;
use crate::{health, subnets::get_proposed_subnet_changes};
use decentralization::network::{SubnetQueryBy, TopologyManager};
use ic_base_types::PrincipalId;
Expand Down
221 changes: 218 additions & 3 deletions rs/ic-management-backend/src/health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,239 @@ use std::{

use ic_base_types::PrincipalId;
use ic_management_types::{Network, Status};
use log::warn;
use prometheus_http_query::{Client, Selector};
use reqwest::{Client as ReqwestClient, Method};
use serde_json::Value;
use url::Url;

use crate::prometheus;

/// Health-status client whose backend is chosen from the target network.
///
/// It wraps one `HealthStatusQuerierImplementations` variant and forwards
/// every `HealthStatusQuerier` call to it.
pub struct HealthClient {
// Concrete backend selected in `HealthClient::new` via `From<Network>`.
implementation: HealthStatusQuerierImplementations,
}

impl HealthClient {
pub fn new(network: Network) -> Self {
Self {
implementation: network.into(),
}
}
}

impl HealthStatusQuerier for HealthClient {
async fn subnet(&self, subnet: PrincipalId) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
match &self.implementation {
HealthStatusQuerierImplementations::Dashboard(c) => c.subnet(subnet).await,
HealthStatusQuerierImplementations::Prometheus(c) => c.subnet(subnet).await,
}
}

async fn nodes(&self) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
match &self.implementation {
HealthStatusQuerierImplementations::Dashboard(c) => c.nodes().await,
HealthStatusQuerierImplementations::Prometheus(c) => c.nodes().await,
}
}
}

/// Backends that `HealthClient` can delegate health queries to.
enum HealthStatusQuerierImplementations {
/// Public dashboard HTTP API; selected when the network name is "mainnet".
Dashboard(PublicDashboardHealthClient),
/// Prometheus; selected for every other network name.
Prometheus(PrometheusHealthClient),
}

impl From<Network> for HealthStatusQuerierImplementations {
    /// Chooses the health backend for a network.
    ///
    /// A network named exactly `"mainnet"` uses the public dashboard with its
    /// default base URL; any other network uses Prometheus.
    fn from(value: Network) -> Self {
        if value.name.as_str() == "mainnet" {
            Self::Dashboard(PublicDashboardHealthClient::new(None))
        } else {
            // `value` is owned and no longer borrowed here, so it is moved
            // into the client instead of being cloned.
            Self::Prometheus(PrometheusHealthClient::new(value))
        }
    }
}

/// Source of node health statuses.
///
/// The methods are declared as returning `impl Future + Send` so that the
/// futures produced by implementors are required to be `Send`.
pub trait HealthStatusQuerier {
/// Returns a map from node principal to `Status` for the given `subnet`.
fn subnet(
&self,
subnet: PrincipalId,
) -> impl std::future::Future<Output = anyhow::Result<BTreeMap<PrincipalId, Status>>> + Send;
/// Returns a map from node principal to `Status` for all nodes the backend reports.
fn nodes(&self) -> impl std::future::Future<Output = anyhow::Result<BTreeMap<PrincipalId, Status>>> + Send;
}

/// Health backend that reads node statuses from the public dashboard HTTP API.
pub struct PublicDashboardHealthClient {
// HTTP client used to build and execute the requests.
client: ReqwestClient,
// Base URL that API paths such as `/api/node-list` are joined onto.
base_url: Url,
}

impl PublicDashboardHealthClient {
pub fn new(base_url: Option<Url>) -> Self {
Self {
client: ReqwestClient::new(),
base_url: match base_url {
Some(u) => u,
None => Url::from_str("https://ic-api.internetcomputer.org/").expect("Should be a valid url"),
},
}
}

fn api_node_list(&self) -> anyhow::Result<Url> {
self.base_url
.join("/api/node-list")
.map_err(|e| anyhow::anyhow!("Error joining url: {:?}", e))
}

async fn get_all_nodes(&self) -> anyhow::Result<Vec<ShortNodeInfo>> {
let request = self
.client
.request(Method::GET, self.api_node_list()?)
.header("accept", "application/json")
.build()
.map_err(|e| anyhow::anyhow!("Error building a request: {:?}", e))?;
let response = self
.client
.execute(request)
.await
.map_err(|e| anyhow::anyhow!("Error while fetching data from public dashboard: {:?}", e))?;

let response = response
.json::<Value>()
.await
.map_err(|e| anyhow::anyhow!("Error unmarshaling json: {:?}", e))?;

let nodes = match response.get("nodes") {
None => return Err(anyhow::anyhow!("Unexpected data contract. Missing 'nodes' key.")),
Some(v) => v,
};

let mut response = vec![];

let nodes = match nodes.as_array() {
None => {
return Err(anyhow::anyhow!(
"Unexpected data contract. Couldn't parse response as array"
))
}
Some(n) => n,
};

for node in nodes {
let node_id = match node.get("node_id") {
None => {
warn!("Didn't find pricipal while checking node health which shouldn't happen!");
continue;
}
Some(p) => {
// Serde to_string() returns quoted strings which means we have to skip first and last char.
let p = p.to_string();
let p = get_unquoted(&p);
match PrincipalId::from_str(p) {
Ok(p) => p,
Err(e) => {
warn!(
"Couldn't parse principal from string {} which shouldn't happen! Error: {:?}",
p, e
);
continue;
}
}
}
};

let status = match node.get("status") {
None => {
warn!("Didn't find node while checking node health which shouldn't happen!");
continue;
}
Some(s) => {
let s = s.to_string();
let s = get_unquoted(&s);
Status::from_str_from_dashboard(s)
}
};

let maybe_subnet = match node.get("subnet_id") {
None => None,
Some(pr) => {
let p = pr.to_string();
let p = get_unquoted(&p);
match PrincipalId::from_str(p) {
Ok(p) => Some(p),
// Serde returns quoted strings but if the value is null it doesn't quote it, meaning we get (after skipping) 'ul'
Err(_) if p == "ul" => None,
Err(e) => {
warn!(
"Couldn't parse principal from string '{}' which shouldn't happen! Error: {:?}",
p, e
);
None
}
}
}
};

response.push(ShortNodeInfo {
node_id,
subnet_id: maybe_subnet,
status,
})
}

Ok(response)
}
}

/// Returns `s` without its first and last character.
///
/// No check is made that those characters are quotes: any leading and
/// trailing character is dropped, and inputs of fewer than two characters
/// yield an empty string.
fn get_unquoted(s: &str) -> &str {
    // Drop the leading character, if there is one.
    let without_first = match s.chars().next() {
        Some(first) => &s[first.len_utf8()..],
        None => s,
    };
    // Drop the trailing character of what is left, if there is one.
    match without_first.chars().next_back() {
        Some(last) => &without_first[..without_first.len() - last.len_utf8()],
        None => without_first,
    }
}

/// The subset of a dashboard node entry that the health client needs.
struct ShortNodeInfo {
// Parsed from the entry's `node_id` field.
node_id: PrincipalId,
// Parsed from the entry's `subnet_id` field; `None` when absent, null or unparsable.
subnet_id: Option<PrincipalId>,
// Converted from the entry's `status` field via `Status::from_str_from_dashboard`.
status: Status,
}

impl HealthStatusQuerier for PublicDashboardHealthClient {
    /// Fetches the full node list and keeps only the nodes assigned to `subnet`.
    async fn subnet(&self, subnet: PrincipalId) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
        let all_nodes = self.get_all_nodes().await?;

        let mut statuses = BTreeMap::new();
        for info in all_nodes {
            // Nodes without a subnet never match.
            if info.subnet_id == Some(subnet) {
                statuses.insert(info.node_id, info.status);
            }
        }
        Ok(statuses)
    }

    /// Fetches the full node list and returns the status of every node in it.
    async fn nodes(&self) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
        let all_nodes = self.get_all_nodes().await?;

        let mut statuses = BTreeMap::new();
        for info in all_nodes {
            statuses.insert(info.node_id, info.status);
        }
        Ok(statuses)
    }
}

/// Health backend that reads node statuses from Prometheus.
pub struct PrometheusHealthClient {
// Prometheus query client, created by `prometheus::client` for the network.
client: Client,
// Network whose `legacy_name()` is used when building the queries.
network: Network,
}

impl HealthClient {
impl PrometheusHealthClient {
    /// Creates a Prometheus-backed health client for `network`.
    pub fn new(network: Network) -> Self {
        // The query client is built from a borrow of `network` before the
        // network itself is moved into the struct.
        let client = prometheus::client(&network);
        Self { client, network }
    }
}

pub async fn subnet(&self, subnet: PrincipalId) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
impl HealthStatusQuerier for PrometheusHealthClient {
async fn subnet(&self, subnet: PrincipalId) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
let ic_name = self.network.legacy_name();
let subnet_name = subnet.to_string();
let query_up = Selector::new()
Expand Down Expand Up @@ -70,7 +285,7 @@ impl HealthClient {
.collect())
}

pub async fn nodes(&self) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
async fn nodes(&self) -> anyhow::Result<BTreeMap<PrincipalId, Status>> {
let query = format!(
r#"ic_replica_orchestrator:health_state:bottomk_1{{ic="{network}"}}"#,
network = self.network.legacy_name(),
Expand Down
1 change: 1 addition & 0 deletions rs/ic-management-backend/src/registry.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::factsdb;
use crate::git_ic_repo::IcRepo;
use crate::health::HealthStatusQuerier;
use crate::proposal::{self, SubnetUpdateProposal, UpdateUnassignedNodesProposal};
use crate::public_dashboard::query_ic_dashboard_list;
use async_trait::async_trait;
Expand Down
12 changes: 12 additions & 0 deletions rs/ic-management-types/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,18 @@ pub enum Status {
Unknown,
}

/// Conversion from the status strings used by the public dashboard API.
///
/// `from_str` is already implemented by the `EnumString` derive, but the public
/// API returns the statuses capitalized; this is the conversion for that form.
impl Status {
    /// Maps `"UP"`, `"DEGRADED"` and `"DOWN"` to `Healthy`, `Degraded` and
    /// `Dead`; any other input yields `Unknown`.
    pub fn from_str_from_dashboard(s: &str) -> Self {
        if s == "UP" {
            return Self::Healthy;
        }
        if s == "DEGRADED" {
            return Self::Degraded;
        }
        if s == "DOWN" {
            return Self::Dead;
        }
        Self::Unknown
    }
}

impl From<i64> for Health {
fn from(value: i64) -> Self {
match value {
Expand Down
6 changes: 5 additions & 1 deletion rs/np-notifications/src/health_check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ use core::time;
use ic_management_types::NodeProvidersResponse;
use std::sync::mpsc::Sender;

use ic_management_backend::{health::HealthClient, public_dashboard::query_ic_dashboard_list, registry::RegistryState};
use ic_management_backend::{
health::{HealthClient, HealthStatusQuerier},
public_dashboard::query_ic_dashboard_list,
registry::RegistryState,
};
use tokio_util::sync::CancellationToken;
use tracing::{error, info};

Expand Down

0 comments on commit 70b4c5f

Please sign in to comment.