Collect container-level GPU metrics using NVML.
When cAdvisor starts up, it reads the `vendor` files under `/sys/bus/pci/devices/*` to see whether any NVIDIA devices (vendor ID: 0x10de) are attached to the node. If no NVIDIA devices are found, this code path stays dormant for the rest of cAdvisor's lifetime.

If NVIDIA devices are found, we start a goroutine that checks for the presence of NVML at regular intervals by trying to load it dynamically. The check has to be repeated rather than done only once because cAdvisor may be started before the NVIDIA drivers and NVML are installed. Once NVML loads successfully, we use its query methods to find out how many devices exist on the node, build a map from their minor numbers to their device handles, and cache that map. The goroutine then exits.

If NVML was detected in the previous step, then whenever cAdvisor discovers a new container it reads the `devices.list` file from the container's devices cgroup. That file lists the major:minor numbers of all devices the container is allowed to access. If any device has major number 195 (the major number assigned to NVIDIA devices), we cache the corresponding minor numbers for that container.

During every housekeeping operation, in addition to collecting all the existing metrics, we use the cached NVIDIA device minor numbers and the minor-number-to-handle map to collect metrics for the GPU devices attached to the container.
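For illustration, the two files involved might look like the following; the paths and entries here are hypothetical and only show the format. A vendor file such as /sys/bus/pci/devices/0000:00:1e.0/vendor would contain the PCI vendor ID:

0x10de

and a container's `devices.list` (in its devices cgroup directory) might read:

c 1:5 rwm
c 136:* rwm
c 195:0 rw
c 195:255 rw

In this example only minor number 0 would be cached as a GPU for the container: the major-195 entry with minor 255 is nvidiactl rather than a GPU, and the other entries have different major numbers.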
1 parent 318f28b · commit 4a35130
Showing 7 changed files with 491 additions and 2 deletions.
@@ -0,0 +1,239 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package accelerators

import (
	"bufio"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	info "github.com/google/cadvisor/info/v1"

	"github.com/golang/glog"
	"github.com/mindprince/gonvml"
)

type NvidiaManager struct {
	// true if the NVML library (libnvidia-ml.so.1) was loaded successfully
	nvmlInitialized bool

	// nvidiaDevices is a map from device minor number to a handle that can be
	// used to get metrics about the device.
	nvidiaDevices map[int]gonvml.Device
}

var sysFsPCIDevicesPath = "/sys/bus/pci/devices/"

const nvidiaVendorId = "0x10de"

// Setup initializes NVML if NVIDIA devices are present on the node.
func (nm *NvidiaManager) Setup() {
	if !detectDevices(nvidiaVendorId) {
		glog.Info("No NVIDIA devices found.")
		return
	}

	go func() {
		glog.Info("Starting goroutine to initialize NVML")
		nm.initializeNVML()
		if nm.nvmlInitialized {
			return
		}
		// TODO: use globalHousekeepingInterval
		for range time.Tick(time.Minute) {
			nm.initializeNVML()
			if nm.nvmlInitialized {
				return
			}
		}
	}()
}

// detectDevices returns true if a device with the given PCI vendor id is present on the node.
func detectDevices(vendorId string) bool {
	devices, err := ioutil.ReadDir(sysFsPCIDevicesPath)
	if err != nil {
		glog.Warningf("error reading %q: %v", sysFsPCIDevicesPath, err)
		return false
	}

	for _, device := range devices {
		vendorPath := filepath.Join(sysFsPCIDevicesPath, device.Name(), "vendor")
		content, err := ioutil.ReadFile(vendorPath)
		if err != nil {
			glog.Infof("Error while reading %q: %v", vendorPath, err)
			continue
		}
		if strings.EqualFold(strings.TrimSpace(string(content)), vendorId) {
			glog.Infof("Found device with vendorId %q", vendorId)
			return true
		}
	}
	return false
}

// initializeNVML initializes the NVML library and sets up the nvidiaDevices map.
func (nm *NvidiaManager) initializeNVML() {
	if err := gonvml.Initialize(); err != nil {
		// This is under a logging level because otherwise we may cause
		// log spam if the NVIDIA drivers or NVML are not installed on the system.
		glog.V(3).Infof("Could not initialize NVML: %v", err)
		return
	}
	nm.nvmlInitialized = true
	numDevices, err := gonvml.DeviceCount()
	if err != nil {
		glog.Warningf("GPU metrics would not be available. Failed to get the number of nvidia devices: %v", err)
		return
	}
	glog.Infof("NVML initialized. Number of nvidia devices: %v", numDevices)
	nm.nvidiaDevices = make(map[int]gonvml.Device, numDevices)
	for i := 0; i < int(numDevices); i++ {
		device, err := gonvml.DeviceHandleByIndex(uint(i))
		if err != nil {
			glog.Warningf("Failed to get nvidia device handle %d: %v", i, err)
			continue
		}
		minorNumber, err := device.MinorNumber()
		if err != nil {
			glog.Warningf("Failed to get nvidia device minor number: %v", err)
			continue
		}
		nm.nvidiaDevices[int(minorNumber)] = device
	}
}

// Destroy shuts down NVML.
func (nm *NvidiaManager) Destroy() {
	if nm.nvmlInitialized {
		gonvml.Shutdown()
	}
}

// GetCollector returns a collector that can fetch nvidia gpu metrics for nvidia devices
// present in the devices.list file in the given devicesCgroupPath.
func (nm *NvidiaManager) GetCollector(devicesCgroupPath string) (AcceleratorCollector, error) {
	nc := &NvidiaCollector{}
	if !nm.nvmlInitialized || len(nm.nvidiaDevices) == 0 {
		return nc, nil
	}
	nvidiaMinorNumbers, err := parseDevicesCgroup(devicesCgroupPath)
	if err != nil {
		return nc, err
	}
	for _, minor := range nvidiaMinorNumbers {
		device, ok := nm.nvidiaDevices[minor]
		if !ok {
			return nc, fmt.Errorf("nvidia device minor number %d not found in cached devices", minor)
		}
		nc.Devices = append(nc.Devices, device)
	}
	return nc, nil
}

// parseDevicesCgroup parses the devices cgroup devices.list file for the container
// and returns a list of minor numbers corresponding to NVIDIA GPU devices that the
// container is allowed to access. In cases where the container has access to all
// devices or all NVIDIA devices but the devices are not enumerated separately in
// the devices.list file, we return an empty list.
// This is defined as a variable to help in testing.
var parseDevicesCgroup = func(devicesCgroupPath string) ([]int, error) {
	// Always return a non-nil slice
	nvidiaMinorNumbers := []int{}

	devicesList := filepath.Join(devicesCgroupPath, "devices.list")
	f, err := os.Open(devicesList)
	if err != nil {
		return nvidiaMinorNumbers, fmt.Errorf("error while opening devices cgroup file %q: %v", devicesList, err)
	}
	defer f.Close()

	s := bufio.NewScanner(f)

	// See https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt for the file format
	for s.Scan() {
		text := s.Text()

		fields := strings.Fields(text)
		if len(fields) != 3 {
			return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: must contain three whitespace-separated fields", text)
		}

		// Split the second field to find out major:minor numbers
		majorMinor := strings.Split(fields[1], ":")
		if len(majorMinor) != 2 {
			return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: second field should have one colon", text)
		}

		// NVIDIA graphics devices are character devices with major number 195.
		// https://github.com/torvalds/linux/blob/v4.13/Documentation/admin-guide/devices.txt#L2583
		if fields[0] == "c" && majorMinor[0] == "195" {
			minorNumber, err := strconv.Atoi(majorMinor[1])
			if err != nil {
				return nvidiaMinorNumbers, fmt.Errorf("invalid devices cgroup entry %q: minor number is not integer", text)
			}
			// We don't want devices like nvidiactl (195:255) and nvidia-modeset (195:254)
			if minorNumber < 128 {
				nvidiaMinorNumbers = append(nvidiaMinorNumbers, minorNumber)
			}
			// We are ignoring the "195:*" case
			// where the container has access to all NVIDIA devices on the machine.
		}
		// We are ignoring the "*:*" case
		// where the container has access to all devices on the machine.
	}
	return nvidiaMinorNumbers, nil
}
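Because parseDevicesCgroup is defined as a package-level variable, it is easy to exercise in isolation. The test below is a minimal sketch and not part of this commit; it assumes it sits in the same accelerators package, and the devices.list contents and the expected [0 1] result are illustrative values.

package accelerators

import (
	"io/ioutil"
	"os"
	"path/filepath"
	"reflect"
	"testing"
)

// TestParseDevicesCgroupSketch feeds parseDevicesCgroup a synthetic
// devices.list and checks that only GPU minor numbers (major 195,
// minor below 128) are returned.
func TestParseDevicesCgroupSketch(t *testing.T) {
	dir, err := ioutil.TempDir("", "devices-cgroup")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(dir)

	// Two GPUs (195:0, 195:1), nvidiactl (195:255, filtered out),
	// and one non-NVIDIA device (1:5, ignored).
	contents := "c 1:5 rwm\nc 195:0 rw\nc 195:1 rw\nc 195:255 rw\n"
	if err := ioutil.WriteFile(filepath.Join(dir, "devices.list"), []byte(contents), 0644); err != nil {
		t.Fatal(err)
	}

	minors, err := parseDevicesCgroup(dir)
	if err != nil {
		t.Fatal(err)
	}
	if !reflect.DeepEqual(minors, []int{0, 1}) {
		t.Errorf("parseDevicesCgroup() = %v, want [0 1]", minors)
	}
}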

type NvidiaCollector struct {
	// Exposed for testing
	Devices []gonvml.Device
}

// UpdateStats updates the stats for NVIDIA GPUs (if any) attached to the container.
func (nc *NvidiaCollector) UpdateStats(stats *info.ContainerStats) error {
	for _, device := range nc.Devices {
		model, err := device.Name()
		if err != nil {
			return fmt.Errorf("error while getting gpu name: %v", err)
		}
		uuid, err := device.UUID()
		if err != nil {
			return fmt.Errorf("error while getting gpu uuid: %v", err)
		}
		memoryTotal, memoryUsed, err := device.MemoryInfo()
		if err != nil {
			return fmt.Errorf("error while getting gpu memory info: %v", err)
		}
		// TODO: Use housekeepingInterval
		utilizationGPU, err := device.AverageGPUUtilization(10 * time.Second)
		if err != nil {
			return fmt.Errorf("error while getting gpu utilization: %v", err)
		}

		stats.Accelerators = append(stats.Accelerators, info.AcceleratorStats{
			Make:        "nvidia",
			Model:       model,
			ID:          uuid,
			MemoryTotal: memoryTotal,
			MemoryUsed:  memoryUsed,
			DutyCycle:   uint64(utilizationGPU),
		})
	}
	return nil
}
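To tie the pieces together, a caller could wire the manager into its housekeeping loop roughly as follows. This is a hedged sketch rather than cAdvisor's actual integration: the import path github.com/google/cadvisor/accelerators, the method set of the AcceleratorCollector interface, and the devices cgroup path are assumptions based on the code above.

package main

import (
	"fmt"

	"github.com/google/cadvisor/accelerators" // assumed import path for this package
	info "github.com/google/cadvisor/info/v1"
)

func main() {
	nm := &accelerators.NvidiaManager{}
	nm.Setup() // kicks off device detection and background NVML initialization
	defer nm.Destroy()

	// In cAdvisor this would happen when a new container is detected;
	// the cgroup path below is only a placeholder.
	collector, err := nm.GetCollector("/sys/fs/cgroup/devices/docker/<container-id>")
	if err != nil {
		fmt.Println("could not build GPU collector:", err)
		return
	}

	// During each housekeeping pass, UpdateStats appends one
	// AcceleratorStats entry per GPU attached to the container.
	stats := &info.ContainerStats{}
	if err := collector.UpdateStats(stats); err != nil {
		fmt.Println("failed to update GPU stats:", err)
		return
	}
	for _, acc := range stats.Accelerators {
		fmt.Printf("%s %s (%s): %d/%d bytes used, duty cycle %d%%\n",
			acc.Make, acc.Model, acc.ID, acc.MemoryUsed, acc.MemoryTotal, acc.DutyCycle)
	}
}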