Skip to content

Commit

Permalink
feat(monitor): support monitor qgpu resources (#2025)
Browse files Browse the repository at this point in the history
  • Loading branch information
Huimintai committed Jul 25, 2022
1 parent 037d0a1 commit bde64a6
Showing 1 changed file with 9 additions and 7 deletions.
16 changes: 9 additions & 7 deletions pkg/monitor/controller/prometheus/yamls.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ func scrapeConfigForPrometheus() string {

 - job_name: 'tke-service-endpoints'
 scrape_timeout: 60s
+honor_labels: true
 tls_config:
 insecure_skip_verify: true
 kubernetes_sd_configs:
Expand Down Expand Up @@ -204,6 +205,7 @@ func scrapeConfigForPrometheus() string {

 - job_name: 'kubernetes-service-endpoints'
 scrape_timeout: 60s
+honor_labels: true
 kubernetes_sd_configs:
 - role: endpoints
 relabel_configs:
Expand Down Expand Up @@ -456,16 +458,16 @@ groups:
 expr: '{__name__=~"kubelet_running_pod_count|kubelet_running_pods"}*0 + 1'

 - record: kube_node_status_capacity_gpu
-expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_core"})
+expr: sum by(node) (kube_node_status_capacity{resource=~"tencent_com_vcuda_core|tke_cloud_tencent_com_qgpu_core"})

 - record: kube_node_status_capacity_gpu_memory
-expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_memory"})
+expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_memory"}) or sum by(node) (kube_node_status_capacity{resource="tke_cloud_tencent_com_qgpu_memory"}) * 1024

 - record: kube_node_status_allocatable_gpu
-expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_core"})
+expr: sum by(node) (kube_node_status_allocatable{resource=~"tencent_com_vcuda_core|tke_cloud_tencent_com_qgpu_core"})

 - record: kube_node_status_allocatable_gpu_memory
-expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_memory"})
+expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_memory"}) or sum by(node) (kube_node_status_allocatable{resource="tke_cloud_tencent_com_qgpu_memory"}) * 1024

 - record: __pod_info1
 expr: kube_pod_info* on(node) group_left(node_role) kube_node_labels
Expand Down Expand Up @@ -531,10 +533,10 @@ groups:
 expr: k8s_container_gpu_used * 100 / on(node) group_left kube_node_status_capacity_gpu

 - record: k8s_container_gpu_memory_used
-expr: container_gpu_memory_total{gpu_memory="total"} / 256 * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2
+expr: container_gpu_memory_total{gpu_memory="total"} * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2

 - record: k8s_container_rate_gpu_memory_used_request
-expr: k8s_container_gpu_memory_used * 100 / on (pod_name,namespace,container_name) group_left() (container_request_gpu_memory / 256)
+expr: k8s_container_gpu_memory_used * 100 / on (pod_name,namespace,container_name) group_left() (container_request_gpu_memory)

 - record: k8s_container_rate_gpu_memory_used_node
 expr: k8s_container_gpu_memory_used * 100 / on(node) group_left() kube_node_status_capacity_gpu_memory
Expand Down Expand Up @@ -627,7 +629,7 @@ groups:
 expr: sum(k8s_container_gpu_memory_used) without (container_name,container_id)

 - record: k8s_pod_gpu_memory_request
-expr: sum(container_request_gpu_memory / 256) without(container_name)
+expr: sum(container_request_gpu_memory) without(container_name)

 - record: k8s_pod_rate_gpu_memory_used_request
 expr: sum(k8s_container_gpu_memory_used + on (container_name, pod_name, namespace) group_left container_request_gpu_memory * 0) without(container_name) * 100 / on (pod_name,namespace) group_left k8s_pod_gpu_memory_request
Expand Down

0 comments on commit bde64a6

Please sign in to comment.