A simple Go HTTP server that serves per-pod GPU metrics at localhost:9400/gpu/metrics. The exporter connects to the kubelet's pod-resources gRPC endpoint (/var/lib/kubelet/pod-resources) to identify the GPUs assigned to each pod, using the Kubernetes device-assignment feature, and appends each GPU device's pod information to the metrics collected by dcgm-exporter.
The HTTP server lets Prometheus scrape GPU metrics directly from a separate endpoint, without relying on node-exporter. If you still want to scrape GPU metrics via node-exporter, follow these instructions instead.
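The per-pod mapping comes from the kubelet pod-resources API. Below is a rough, hypothetical sketch (not the exporter's actual code) of how a client could list GPU device assignments over that unix socket; the generated API import path and the socket filename are assumptions based on Kubernetes 1.13:

```go
package main

import (
	"context"
	"fmt"
	"net"
	"time"

	"google.golang.org/grpc"
	// Assumption: import path of the generated pod-resources API for ~1.13.
	podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1"
)

const socket = "/var/lib/kubelet/pod-resources/kubelet.sock"

func main() {
	// Dial the kubelet's pod-resources unix socket.
	conn, err := grpc.Dial(socket,
		grpc.WithInsecure(),
		grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
			return net.DialTimeout("unix", addr, timeout)
		}))
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	client := podresourcesapi.NewPodResourcesListerClient(conn)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// List() returns every pod on the node together with the devices
	// the kubelet assigned to its containers.
	resp, err := client.List(ctx, &podresourcesapi.ListPodResourcesRequest{})
	if err != nil {
		panic(err)
	}
	for _, pod := range resp.GetPodResources() {
		for _, ctr := range pod.GetContainers() {
			for _, dev := range ctr.GetDevices() {
				if dev.GetResourceName() != "nvidia.com/gpu" {
					continue
				}
				// Device IDs are GPU UUIDs, which can be matched against
				// the uuid label on dcgm-exporter metrics.
				fmt.Printf("%s/%s %s -> %v\n",
					pod.GetNamespace(), pod.GetName(), ctr.GetName(), dev.GetDeviceIds())
			}
		}
	}
}
```

Prerequisites: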
- NVIDIA Tesla drivers R384 or newer (download from the NVIDIA Driver Downloads page)
- nvidia-docker version > 2.0 (see how to install it and its prerequisites)
- Set the default runtime to nvidia (see the daemon.json sketch after this list)
- Kubernetes version = 1.13
- Enable the KubeletPodResources feature gate in /etc/default/kubelet: KUBELET_EXTRA_ARGS=--feature-gates=KubeletPodResources=true
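Setting the default runtime to nvidia typically means editing /etc/docker/daemon.json along the lines below and restarting Docker; the nvidia-container-runtime path is an assumption and may differ on your distribution:

```json
{
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "/usr/bin/nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}
```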
# Deploy nvidia-k8s-device-plugin
# Deploy GPU Pods (an example pod spec is sketched after these steps)
# Create the monitoring namespace
$ kubectl create namespace monitoring
# Add gpu metrics endpoint to prometheus
$ kubectl create -f prometheus/prometheus-configmap.yaml
# Deploy prometheus
$ kubectl create -f prometheus/prometheus-deployment.yaml
$ kubectl create -f pod-gpu-metrics-exporter-daemonset.yaml
# Open in browser: localhost:9090
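For the "Deploy GPU Pods" step, a minimal pod spec could look like the sketch below; the image and names are placeholders, and the two-GPU request matches the pod1 sample output further down:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: pod1
spec:
  containers:
    - name: pod1-ctr
      image: nvidia/cuda:9.0-base   # placeholder: any CUDA-capable image works
      command: ["sleep", "infinity"]
      resources:
        limits:
          nvidia.com/gpu: 2         # the device plugin assigns two GPUs to this container
```

To build and run the exporter image directly with Docker: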
$ docker build -t pod-gpu-metrics-exporter .
# Make sure to run dcgm-exporter
$ docker run -d --runtime=nvidia --rm --name=nvidia-dcgm-exporter nvidia/dcgm-exporter
$ docker run -d --privileged --rm -p 9400:9400 -v /var/lib/kubelet/pod-resources:/var/lib/kubelet/pod-resources --volumes-from nvidia-dcgm-exporter:ro nvidia/pod-gpu-metrics-exporter:v1.0.0-alpha
# Check GPU metrics
$ curl -s localhost:9400/gpu/metrics
# Sample output
# HELP dcgm_gpu_temp GPU temperature (in C).
# TYPE dcgm_gpu_temp gauge
dcgm_gpu_temp{container_name="pod1-ctr",gpu="0",pod_name="pod1",pod_namespace="default",uuid="GPU-2b399198-c670-a848-173b-d3400051a200"} 33
dcgm_gpu_temp{container_name="pod1-ctr",gpu="1",pod_name="pod1",pod_namespace="default",uuid="GPU-9567a9e7-341e-bb7e-fcf5-788d8caa50f9"} 34
# HELP dcgm_gpu_utilization GPU utilization (in %).
# TYPE dcgm_gpu_utilization gauge
dcgm_gpu_utilization{container_name="pod1-ctr",gpu="0",pod_name="pod1",pod_namespace="default",uuid="GPU-2b399198-c670-a848-173b-d3400051a200"} 0
dcgm_gpu_utilization{container_name="pod1-ctr",gpu="1",pod_name="pod1",pod_namespace="default",uuid="GPU-9567a9e7-341e-bb7e-fcf5-788d8caa50f9"} 0
# HELP dcgm_low_util_violation Throttling duration due to low utilization (in us).
# TYPE dcgm_low_util_violation counter
dcgm_low_util_violation{container_name="pod1-ctr",gpu="0",pod_name="pod1",pod_namespace="default",uuid="GPU-2b399198-c670-a848-173b-d3400051a200"} 0
dcgm_low_util_violation{container_name="pod1-ctr",gpu="1",pod_name="pod1",pod_namespace="default",uuid="GPU-9567a9e7-341e-bb7e-fcf5-788d8caa50f9"} 0
# HELP dcgm_mem_copy_utilization Memory utilization (in %).
# TYPE dcgm_mem_copy_utilization gauge
dcgm_mem_copy_utilization{container_name="pod1-ctr",gpu="0",pod_name="pod1",pod_namespace="default",uuid="GPU-2b399198-c670-a848-173b-d3400051a200"} 0
dcgm_mem_copy_utilization{container_name="pod1-ctr",gpu="1",pod_name="pod1",pod_namespace="default",uuid="GPU-9567a9e7-341e-bb7e-fcf5-788d8caa50f9"} 0
# HELP dcgm_memory_clock Memory clock frequency (in MHz).
# TYPE dcgm_memory_clock gauge
dcgm_memory_clock{container_name="pod1-ctr",gpu="0",pod_name="pod1",pod_namespace="default",uuid="GPU-2b399198-c670-a848-173b-d3400051a200"} 810
dcgm_memory_clock{container_name="pod1-ctr",gpu="1",pod_name="pod1",pod_namespace="default",uuid="GPU-9567a9e7-341e-bb7e-fcf5-788d8caa50f9"} 810
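Once Prometheus is scraping the endpoint, the pod labels make per-pod queries straightforward; for example, the (illustrative) PromQL query `avg by (pod_name, pod_namespace) (dcgm_gpu_utilization)` averages GPU utilization per pod.

To build and run the exporter from source: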
$ git clone
$ cd src && go build
$ sudo ./src
To collect the per-process and per-GPU metrics shown below, the exporter additionally needs to (a DaemonSet excerpt is sketched after this list):
- Mount /var/run/docker.sock # used to look up the pod container's PID
- Set hostPID: true # used to check whether the parent process of a GPU process is the pod's container process
- Use NVML # used to read the memory used by each GPU process
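A hypothetical excerpt of the exporter DaemonSet spec with those changes applied (names and paths are illustrative):

```yaml
spec:
  template:
    spec:
      hostPID: true                            # see host PIDs, so GPU processes can be matched to containers
      containers:
        - name: pod-gpu-metrics-exporter
          securityContext:
            privileged: true
          volumeMounts:
            - name: pod-resources
              mountPath: /var/lib/kubelet/pod-resources
            - name: docker-sock
              mountPath: /var/run/docker.sock  # query the Docker API for container PIDs
      volumes:
        - name: pod-resources
          hostPath:
            path: /var/lib/kubelet/pod-resources
        - name: docker-sock
          hostPath:
            path: /var/run/docker.sock
```

Sample output with these changes: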
# HELP dcgm_process_mem_used Process memory used (in MiB).
# TYPE dcgm_process_mem_used gauge
dcgm_process_mem_used{gpu="0",uuid="GPU-ad365448-e6c2-68f2-24e4-517b1e56e937",pod_name="test-pod-01",pod_namespace="default",container_name="nvidia-test",process_name="python",process_pid="617",process_type="C"} 847
dcgm_process_mem_used{gpu="0",uuid="GPU-ad365448-e6c2-68f2-24e4-517b1e56e937",pod_name="test-pod-01",pod_namespace="default",container_name="nvidia-test",process_name="python",process_pid="16187",process_type="C"} 587
# HELP dcgm_gpu_logic_used GPU used (0 = unused, 1 = used).
# TYPE dcgm_gpu_logic_used gauge
dcgm_gpu_logic_used{hostname="amax-pcl1",count="1",used="1"} 1
dcgm_gpu_logic_used{hostname="amax-pcl2",count="4",used="1 1 0 0"} 12
$ kubectl apply -f ./prometheus
$ kubectl apply -f ./grafana