feat(monitor): support monitor qgpu resources #2025

Merged · 1 commit · Jul 25, 2022
feat(monitor): support monitor qgpu resources
Huimintai committed Jul 25, 2022
commit de8edaa8aafc0401cd80cca929ea200efcd0b0e6
16 changes: 9 additions & 7 deletions pkg/monitor/controller/prometheus/yamls.go
@@ -156,6 +156,7 @@ func scrapeConfigForPrometheus() string {

 - job_name: 'tke-service-endpoints'
   scrape_timeout: 60s
+  honor_labels: true
   tls_config:
     insecure_skip_verify: true
   kubernetes_sd_configs:
@@ -204,6 +205,7 @@ func scrapeConfigForPrometheus() string {

 - job_name: 'kubernetes-service-endpoints'
   scrape_timeout: 60s
+  honor_labels: true
   kubernetes_sd_configs:
   - role: endpoints
   relabel_configs:
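Note (added for context, not part of the diff): `honor_labels: true` tells Prometheus to keep label values that already exist on the scraped series instead of renaming them to `exported_*` when they collide with target labels, which matters when an exporter stamps its samples with labels such as `node` or `pod_name`. A minimal sketch of the effect; the metric name, labels and values below are illustrative only:

```yaml
# Sample exposed by a (hypothetical) qgpu exporter:
#   container_gpu_memory_total{node="node-1", gpu_memory="total"}  2048
#
# honor_labels: false (default) keeps the target label and renames the clash:
#   container_gpu_memory_total{node="<target node>", exported_node="node-1", ...}
#
# honor_labels: true (added in this PR) lets the scraped label win:
#   container_gpu_memory_total{node="node-1", ...}
- job_name: 'tke-service-endpoints'
  scrape_timeout: 60s
  honor_labels: true
```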
@@ -456,16 +458,16 @@ groups:
     expr: '{__name__=~"kubelet_running_pod_count|kubelet_running_pods"}*0 + 1'

   - record: kube_node_status_capacity_gpu
-    expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_core"})
+    expr: sum by(node) (kube_node_status_capacity{resource=~"tencent_com_vcuda_core|tke_cloud_tencent_com_qgpu_core"})

   - record: kube_node_status_capacity_gpu_memory
-    expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_memory"})
+    expr: sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_memory"}) or sum by(node) (kube_node_status_capacity{resource="tke_cloud_tencent_com_qgpu_memory"}) * 1024

   - record: kube_node_status_allocatable_gpu
-    expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_core"})
+    expr: sum by(node) (kube_node_status_allocatable{resource=~"tencent_com_vcuda_core|tke_cloud_tencent_com_qgpu_core"})

   - record: kube_node_status_allocatable_gpu_memory
-    expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_memory"})
+    expr: sum by(node) (kube_node_status_allocatable{resource="tencent_com_vcuda_memory"}) or sum by(node) (kube_node_status_allocatable{resource="tke_cloud_tencent_com_qgpu_memory"}) * 1024

   - record: __pod_info1
     expr: kube_pod_info* on(node) group_left(node_role) kube_node_labels
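Note (added for context, not part of the diff): the core-count rules fold qgpu in with a regex matcher, while the memory rules fall back to the qgpu resource via `or`. Because `*` binds tighter than `or` in PromQL, the `* 1024` is applied only to the qgpu branch, presumably to scale the qgpu memory unit to match what the vcuda-based rules report. A sketch of how the capacity rule evaluates, with invented sample values:

```promql
# Hypothetical node capacities:
#   kube_node_status_capacity{node="n1", resource="tencent_com_vcuda_memory"}           64
#   kube_node_status_capacity{node="n2", resource="tke_cloud_tencent_com_qgpu_memory"}   8
#
# Evaluated as:  sum by(node)(vcuda)  or  ( sum by(node)(qgpu) * 1024 )
# kube_node_status_capacity_gpu_memory then yields:
#   {node="n1"} 64     -- a vcuda series exists, so the left-hand side wins
#   {node="n2"} 8192   -- no vcuda series on n2, the scaled qgpu branch fills in
sum by(node) (kube_node_status_capacity{resource="tencent_com_vcuda_memory"})
  or
sum by(node) (kube_node_status_capacity{resource="tke_cloud_tencent_com_qgpu_memory"}) * 1024
```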
@@ -531,10 +533,10 @@ groups:
     expr: k8s_container_gpu_used * 100 / on(node) group_left kube_node_status_capacity_gpu

   - record: k8s_container_gpu_memory_used
-    expr: container_gpu_memory_total{gpu_memory="total"} / 256 * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2
+    expr: container_gpu_memory_total{gpu_memory="total"} * on(namespace, pod_name) group_left(workload_kind,workload_name,node, node_role) __pod_info2

   - record: k8s_container_rate_gpu_memory_used_request
-    expr: k8s_container_gpu_memory_used * 100 / on (pod_name,namespace,container_name) group_left() (container_request_gpu_memory / 256)
+    expr: k8s_container_gpu_memory_used * 100 / on (pod_name,namespace,container_name) group_left() (container_request_gpu_memory)

   - record: k8s_container_rate_gpu_memory_used_node
     expr: k8s_container_gpu_memory_used * 100 / on(node) group_left() kube_node_status_capacity_gpu_memory
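Note (added for context, not part of the diff): the `/ 256` scaling is dropped from both the used-memory source and the request metric (and from the pod-level request rule further down), so the used/request percentage stays consistent as long as both series are exposed in the same unit; the diff itself does not state that unit. A sketch with invented values:

```promql
# Hypothetical per-container samples:
#   container_gpu_memory_total{gpu_memory="total", namespace="ns", pod_name="p", container_name="c"}   512
#   container_request_gpu_memory{namespace="ns", pod_name="p", container_name="c"}                    1024
#
# k8s_container_gpu_memory_used               -> 512  (no longer divided by 256)
# k8s_container_rate_gpu_memory_used_request  -> 512 * 100 / 1024 = 50 (% of the request in use)
k8s_container_gpu_memory_used * 100
  / on (pod_name, namespace, container_name) group_left()
(container_request_gpu_memory)
```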
@@ -627,7 +629,7 @@ groups:
     expr: sum(k8s_container_gpu_memory_used) without (container_name,container_id)

   - record: k8s_pod_gpu_memory_request
-    expr: sum(container_request_gpu_memory / 256) without(container_name)
+    expr: sum(container_request_gpu_memory) without(container_name)

   - record: k8s_pod_rate_gpu_memory_used_request
     expr: sum(k8s_container_gpu_memory_used + on (container_name, pod_name, namespace) group_left container_request_gpu_memory * 0) without(container_name) * 100 / on (pod_name,namespace) group_left k8s_pod_gpu_memory_request