Skip to content

Commit

Permalink
Merge "Fix incorrect prometheus alert names in nagios"
Browse the repository at this point in the history
  • Loading branch information
Zuul authored and openstack-gerrit committed Jan 15, 2020
2 parents 042ac52 + 4fdcff5 commit cc399a0
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 11 deletions.
20 changes: 10 additions & 10 deletions nagios/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -524,23 +524,23 @@ conf:
}
define service {
check_command check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas
check_command check_prom_alert_with_labels!kube_statefulset_replicas_unavailable!statefulset="prometheus"!statefulset {statefulset} has lesser than configured replicas
check_interval 60
hostgroup_name prometheus-hosts
service_description Prometheus_replica-count
use notifying_service
}
define service {
check_command check_prom_alert_with_labels!replicas_unavailable_statefulset!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas
check_command check_prom_alert_with_labels!kube_statefulset_replicas_unavailable!statefulset="alertmanager"!statefulset {statefulset} has lesser than configured replicas
check_interval 60
hostgroup_name prometheus-hosts
service_description PrometheusAlertmanager_replica-count
use notifying_service
}
define service {
check_command check_prom_alert!replicas_unavailable_statefulset!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
check_command check_prom_alert!kube_statefulset_replicas_unavailable!CRITICAL- statefulset {statefulset} has lesser than configured replicas!OK- All statefulsets have configured amount of replicas
check_interval 60
hostgroup_name prometheus-hosts
service_description Statefulset_replica-count
Expand Down Expand Up @@ -752,7 +752,7 @@ conf:
}
define command {
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_memory_load' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Node memory usage is more than 85%' --ok_message 'OK- Node memory usage is less than 85%'
command_name check_memory_usage
}
Expand Down Expand Up @@ -782,22 +782,22 @@ conf:
}
define command {
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_drop_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network reception.' --ok_message 'OK- network packet receive drops not high.'
command_name check_network_receive_drop_high
}
define command {
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.'
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high drop in network transmission.' --ok_message 'OK- network packet tramsmit drops not high.'
command_name check_network_transmit_drop_high
}
define command {
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_errs_rcv' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network reception.' --ok_message 'OK- network reception errors not high.'
command_name check_network_receive_errors_high
}
define command {
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'high_network_drop_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
command_line $USER1$/query_prometheus_alerts.py --prometheus_api $USER2$ --alertname 'node_high_network_errs_send' --labels_csv 'instance=~"$HOSTADDRESS$.*"' --msg_format 'CRITICAL- Host system has an unusally high error rate in network transmission.' --ok_message 'OK- network transmission errors not high.'
command_name check_network_transmit_errors_high
}
Expand Down Expand Up @@ -990,7 +990,7 @@ conf:
}
define service {
check_command check_prom_alert!ceph_monitor_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
check_command check_prom_alert!ceph_mon_quorum_low!CRITICAL- ceph monitor quorum does not exist!OK- ceph monitor quorum exists
check_interval 60
hostgroup_name prometheus-hosts
service_description CEPH_quorum
Expand Down Expand Up @@ -1022,7 +1022,7 @@ conf:
}
define service {
check_command check_prom_alert!ceph_monitor_clock_skew_high!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
check_command check_prom_alert_with_labels!node_ntp_clock_skew_high!ceph-mon="enabled"!CRITICAL- CEPH clock skew is more than 2 seconds!OK- CEPH clock skew is less than 2 seconds
check_interval 60
hostgroup_name prometheus-hosts
service_description CEPH_Clock-skew
Expand Down
2 changes: 1 addition & 1 deletion prometheus/values_overrides/kubernetes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ conf:
annotations:
description: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} has been in pending status for more than 10 minutes'
summary: 'Pod {{$labels.pod}} in namespace {{$labels.namespace}} in pending status'
- alert: pod_error_image_pull
- alert: pod_status_error_image_pull
expr: kube_pod_container_status_waiting_reason {reason="ErrImagePull"} == 1
for: 10m
labels:
Expand Down

0 comments on commit cc399a0

Please sign in to comment.