Skip to content

Commit

Permalink
feat(prometheus): add node-problem-detector to prometheus addon
Browse files Browse the repository at this point in the history
Signed-off-by: Feng Kun <[email protected]>
  • Loading branch information
kevinfeng authored and tke-robot committed Jul 20, 2020
1 parent aac8bdf commit af967de
Show file tree
Hide file tree
Showing 19 changed files with 1,111 additions and 470 deletions.
2 changes: 2 additions & 0 deletions api/monitor/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ type PrometheusSpec struct {
RunOnMaster bool
// +optional
AlertRepeatInterval string
// +optional
WithNPD bool
}

// PrometheusStatus is information about the current status of a Prometheus.
Expand Down
282 changes: 157 additions & 125 deletions api/monitor/v1/generated.pb.go

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions api/monitor/v1/generated.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions api/monitor/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ type PrometheusSpec struct {
// +optional
// AlertRepeatInterval indicates repeat interval of alerts
AlertRepeatInterval string `json:"alertRepeatInterval,omitempty" protobuf:"bytes,9,opt,name=alertRepeatInterval"`
// +optional
// WithNPD indicates whether to deploy node-problem-detector or not
WithNPD bool `json:"withNPD,omitempty" protobuf:"bytes,10,opt,name=withNPD"`
}

// PrometheusStatus is information about the current status of a Prometheus.
Expand Down
1 change: 1 addition & 0 deletions api/monitor/v1/types_swagger_doc_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions api/monitor/v1/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions api/openapi/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions api/platform/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,8 @@ type PrometheusSpec struct {
RunOnMaster bool
// +optional
AlertRepeatInterval string
// +optional
WithNPD bool
}

// PrometheusStatus is information about the current status of a Prometheus.
Expand Down
717 changes: 374 additions & 343 deletions api/platform/v1/generated.pb.go

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions api/platform/v1/generated.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions api/platform/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,9 @@ type PrometheusSpec struct {
// +optional
// AlertRepeatInterval indicates repeat interval of alerts
AlertRepeatInterval string `json:"alertRepeatInterval,omitempty" protobuf:"bytes,9,opt,name=alertRepeatInterval"`
// +optional
// WithNPD indicates whether to deploy node-problem-detector or not
WithNPD bool `json:"withNPD,omitempty" protobuf:"bytes,10,opt,name=withNPD"`
}

// PrometheusStatus is information about the current status of a Prometheus.
Expand Down
1 change: 1 addition & 0 deletions api/platform/v1/types_swagger_doc_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions api/platform/v1/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

229 changes: 229 additions & 0 deletions pkg/monitor/controller/prometheus/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ const (
kubeStateRole = "kube-state-metrics-resizer"
kubeStateWorkLoad = "kube-state-metrics"

nodeProblemDetectorWorkload = "node-problem-detector"
nodeProblemDetectorServiceAccount = "node-problem-detector"
nodeProblemDetectorClusterRoleBinding = "node-problem-detector"
nodeProblemDetectorClusterRole = "node-problem-detector"

specialLabelName = "k8s-submitter"
specialLabelValue = "controller"
)
Expand Down Expand Up @@ -801,6 +806,25 @@ func (c *Controller) installPrometheus(ctx context.Context, prometheus *v1.Prome
}
prometheus.Status.SubVersion[kubeStateService] = components.KubeStateService.Tag

if prometheus.Spec.WithNPD {
// ServiceAccount for node-problem-detector
if _, err := kubeClient.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Create(ctx, createServiceAccountForNPD(), metav1.CreateOptions{}); err != nil {
return err
}
// ClusterRole for node-problem-detector
if _, err := kubeClient.RbacV1().ClusterRoles().Create(ctx, createClusterRoleForNPD(), metav1.CreateOptions{}); err != nil {
return err
}
// ClusterRoleBinding for node-problem-detector
if _, err := kubeClient.RbacV1().ClusterRoleBindings().Create(ctx, createClusterRoleBindingForNPD(), metav1.CreateOptions{}); err != nil {
return err
}
// DaemonSet for node-problem-detector
if _, err := kubeClient.AppsV1().DaemonSets(metav1.NamespaceSystem).Create(ctx, createDaemonSetForNPD(components), metav1.CreateOptions{}); err != nil {
return err
}
prometheus.Status.SubVersion[nodeProblemDetectorWorkload] = components.NodeProblemDetector.Tag
}
return nil
}

Expand Down Expand Up @@ -1944,6 +1968,188 @@ func createAppsDeploymentForMetrics(components images.Components, prometheus *v1
return deploy
}

func createServiceAccountForNPD() *corev1.ServiceAccount {
return &corev1.ServiceAccount{
TypeMeta: metav1.TypeMeta{
APIVersion: "v1",
Kind: "ServiceAccount",
},
ObjectMeta: metav1.ObjectMeta{
Name: nodeProblemDetectorServiceAccount,
Namespace: metav1.NamespaceSystem,
Labels: map[string]string{"kubernetes.io/cluster-service": "true", "addonmanager.kubernetes.io/mode": "Reconcile"},
},
}
}

func createClusterRoleForNPD() *rbacv1.ClusterRole {
return &rbacv1.ClusterRole{
TypeMeta: metav1.TypeMeta{
APIVersion: "rbac.authorization.k8s.io/v1beta1",
Kind: "ClusterRole",
},
ObjectMeta: metav1.ObjectMeta{
Name: nodeProblemDetectorClusterRole,
Labels: map[string]string{"kubernetes.io/cluster-service": "true", "addonmanager.kubernetes.io/mode": "Reconcile"},
},
Rules: []rbacv1.PolicyRule{
{
APIGroups: []string{""},
Resources: []string{"nodes", "pods"},
Verbs: []string{"list", "get"},
},
{
APIGroups: []string{""},
Resources: []string{"nodes/status"},
Verbs: []string{"patch"},
},
{
APIGroups: []string{""},
Resources: []string{"events"},
Verbs: []string{"create", "patch", "update"},
},
},
}
}

func createClusterRoleBindingForNPD() *rbacv1.ClusterRoleBinding {
return &rbacv1.ClusterRoleBinding{
TypeMeta: metav1.TypeMeta{
Kind: "ClusterRoleBinding",
APIVersion: "rbac.authorization.k8s.io/v1",
},
ObjectMeta: metav1.ObjectMeta{
Name: nodeProblemDetectorClusterRoleBinding,
Labels: map[string]string{"kubernetes.io/cluster-service": "true", "addonmanager.kubernetes.io/mode": "Reconcile"},
},
RoleRef: rbacv1.RoleRef{
APIGroup: "rbac.authorization.k8s.io",
Kind: "ClusterRole",
Name: nodeProblemDetectorClusterRole,
},
Subjects: []rbacv1.Subject{
{
Kind: "ServiceAccount",
Name: nodeProblemDetectorServiceAccount,
Namespace: metav1.NamespaceSystem,
},
},
}
}

var selectorForNPD = metav1.LabelSelector{
MatchLabels: map[string]string{specialLabelName: specialLabelValue, "k8s-app": "node-problem-detector"},
}

func createDaemonSetForNPD(components images.Components) *appsv1.DaemonSet {
return &appsv1.DaemonSet{
TypeMeta: metav1.TypeMeta{
APIVersion: "apps/v1",
Kind: "DaemonSet",
},
ObjectMeta: metav1.ObjectMeta{
Name: nodeProblemDetectorWorkload,
Namespace: metav1.NamespaceSystem,
Labels: map[string]string{"kubernetes.io/cluster-service": "true", "addonmanager.kubernetes.io/mode": "Reconcile", specialLabelName: specialLabelValue, "k8s-app": "node-problem-detector"},
},
Spec: appsv1.DaemonSetSpec{
Selector: &selectorForNPD,
UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
Type: appsv1.RollingUpdateDaemonSetStrategyType,
},
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{specialLabelName: specialLabelValue, "k8s-app": "node-problem-detector"},
Annotations: map[string]string{"scheduler.alpha.kubernetes.io/critical-pod": "", "tke.prometheus.io/scrape": "true", "prometheus.io/scheme": "http", "prometheus.io/port": "20257"},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: nodeProblemDetectorWorkload,
Image: components.NodeProblemDetector.FullName(),
Command: []string{
"/node-problem-detector",
"--logtostderr",
"--prometheus-address=0.0.0.0",
"--config.system-log-monitor=/config/kernel-monitor.json,/config/docker-monitor.json",
},
SecurityContext: &corev1.SecurityContext{
Privileged: boolPointer(true),
},
Env: []corev1.EnvVar{
{
Name: "NODE_NAME",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "spec.nodeName",
},
},
},
},
VolumeMounts: []corev1.VolumeMount{
{
MountPath: "/var/log",
Name: "log",
ReadOnly: true,
},
{
MountPath: "/dev/kmsg",
Name: "kmsg",
ReadOnly: true,
},
{
MountPath: "/etc/localtime",
Name: "localtime",
ReadOnly: true,
},
},
},
},
HostNetwork: true,
HostPID: true,
Volumes: []corev1.Volume{
{
Name: "log",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/var/log",
},
},
},
{
Name: "kmsg",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/dev/kmsg",
},
},
},
{
Name: "localtime",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/etc/localtime",
},
},
},
},
ServiceAccountName: nodeProblemDetectorServiceAccount,
Tolerations: []corev1.Toleration{
{
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoSchedule,
},
{
Operator: corev1.TolerationOpExists,
Key: "CriticalAddonsOnly",
},
},
},
},
},
}
}

func (c *Controller) uninstallPrometheus(ctx context.Context, prometheus *v1.Prometheus, dropData bool) error {
var err error
var errs []error
Expand Down Expand Up @@ -2087,6 +2293,25 @@ func (c *Controller) uninstallPrometheus(ctx context.Context, prometheus *v1.Pro
errs = append(errs, err)
}

if prometheus.Spec.WithNPD {
err = kubeClient.AppsV1().DaemonSets(metav1.NamespaceSystem).Delete(ctx, nodeProblemDetectorWorkload, metav1.DeleteOptions{})
if err != nil && !errors.IsNotFound(err) {
errs = append(errs, err)
}
err = kubeClient.RbacV1().ClusterRoleBindings().Delete(ctx, nodeProblemDetectorClusterRoleBinding, metav1.DeleteOptions{})
if err != nil && !errors.IsNotFound(err) {
errs = append(errs, err)
}
err = kubeClient.RbacV1().ClusterRoles().Delete(ctx, nodeProblemDetectorClusterRole, metav1.DeleteOptions{})
if err != nil && !errors.IsNotFound(err) {
errs = append(errs, err)
}
err = kubeClient.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Delete(ctx, nodeProblemDetectorServiceAccount, metav1.DeleteOptions{})
if err != nil && !errors.IsNotFound(err) {
errs = append(errs, err)
}
}

// delete prometheus-operator
err = kubeClient.AppsV1().Deployments(metav1.NamespaceSystem).Delete(ctx, prometheusOperatorWorkLoad, metav1.DeleteOptions{})
if err != nil && !errors.IsNotFound(err) {
Expand Down Expand Up @@ -2592,3 +2817,7 @@ func (c *Controller) initThanos() ([]string, error) {

return remoteWrites, nil
}

func boolPointer(value bool) *bool {
return &value
}
Loading

0 comments on commit af967de

Please sign in to comment.