From de8edaa8aafc0401cd80cca929ea200efcd0b0e6 Mon Sep 17 00:00:00 2001 From: listai Date: Wed, 20 Jul 2022 22:46:31 +0800 Subject: [PATCH] feat(monitor): support monitor qgpu resources --- pkg/monitor/controller/prometheus/yamls.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pkg/monitor/controller/prometheus/yamls.go b/pkg/monitor/controller/prometheus/yamls.go index cdf7a5663..c9e6cea1f 100755 --- a/pkg/monitor/controller/prometheus/yamls.go +++ b/pkg/monitor/controller/prometheus/yamls.go @@ -156,6 +156,7 @@ func scrapeConfigForPrometheus() string { - job_name: 'tke-service-endpoints' scrape_timeout: 60s + honor_labels: true tls_config: insecure_skip_verify: true kubernetes_sd_configs: @@ -204,6 +205,7 @@ func scrapeConfigForPrometheus() string { - job_name: 'kubernetes-service-endpoints' scrape_timeout: 60s + honor_labels: true kubernetes_sd_configs: - role: endpoints relabel_configs: @@ -456,16 +458,16 @@ groups: expr: '{__name__=~"kubelet_running_pod_count|kubelet_running_pods"}*0 + 1' - record: kube_node_status_capacity_gpu - expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_core"}) + expr: sum by(node) (kube_node_status_capacity{resource=~"tencent_com_vcuda_core|tke_cloud_tencent_com_qgpu_core"}) - record: kube_node_status_capacity_gpu_memory - expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_memory"}) + expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_memory"}) or sum by(node) (kube_node_status_capacity{resource="tke_cloud_tencent_com_qgpu_memory"}) * 1024 - record: kube_node_status_allocatable_gpu - expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_core"}) + expr: sum by(node) (kube_node_status_allocatable{resource=~"tencent_com_vcuda_core|tke_cloud_tencent_com_qgpu_core"}) - record: kube_node_status_allocatable_gpu_memory - expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_memory"}) + expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_memory"}) or sum by(node) (kube_node_status_allocatable{resource="tke_cloud_tencent_com_qgpu_memory"}) * 1024 - record: __pod_info1 expr: kube_pod_info* on(node) group_left(node_role) kube_node_labels @@ -531,10 +533,10 @@ groups: expr: k8s_container_gpu_used * 100 / on(node) group_left kube_node_status_capacity_gpu - record: k8s_container_gpu_memory_used - expr: container_gpu_memory_total{gpu_memory="total"} / 256 * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2 + expr: container_gpu_memory_total{gpu_memory="total"} * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2 - record: k8s_container_rate_gpu_memory_used_request - expr: k8s_container_gpu_memory_used * 100 / on (pod_name,namespace,container_name) group_left() (container_request_gpu_memory / 256) + expr: k8s_container_gpu_memory_used * 100 / on (pod_name,namespace,container_name) group_left() (container_request_gpu_memory) - record: k8s_container_rate_gpu_memory_used_node expr: k8s_container_gpu_memory_used * 100 / on(node) group_left() kube_node_status_capacity_gpu_memory @@ -627,7 +629,7 @@ groups: expr: sum(k8s_container_gpu_memory_used) without (container_name,container_id) - record: k8s_pod_gpu_memory_request - expr: sum(container_request_gpu_memory / 256) without(container_name) + expr: sum(container_request_gpu_memory) without(container_name) - record: k8s_pod_rate_gpu_memory_used_request expr: sum(k8s_container_gpu_memory_used + on (container_name, pod_name, namespace) group_left container_request_gpu_memory * 0) without(container_name) * 100 / on (pod_name,namespace) group_left k8s_pod_gpu_memory_request