Skip to content

Commit

Permalink
Merge branch 'driver-status-file' into 'master'
Browse files Browse the repository at this point in the history
Improve driver validation by relying on a status file that is managed by driver container

See merge request nvidia/kubernetes/gpu-operator!580
  • Loading branch information
cdesiniotis committed Jan 4, 2023
2 parents 1aa8f38 + 6e40ab8 commit 8f91d38
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 8 deletions.
6 changes: 5 additions & 1 deletion assets/state-driver/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,15 @@ spec:
startupProbe:
exec:
command:
[sh, -c, 'lsmod | grep nvidia']
[sh, -c, 'nvidia-smi && touch /run/nvidia/validations/.driver-ctr-ready']
initialDelaySeconds: 30
failureThreshold: 60
successThreshold: 1
periodSeconds: 10
lifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/.driver-ctr-ready"]
- image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
name: nvidia-peermem-ctr
Expand Down
39 changes: 32 additions & 7 deletions validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,17 +567,42 @@ func cleanupStatusFiles() error {
return nil
}

func (d *Driver) runValidation(silent bool) (hostDriver bool, err error) {
// invoke validation command
command := "chroot"
args := []string{"/run/nvidia/driver", "nvidia-smi"}

func getDriverRoot() (string, bool) {
// check if driver is pre-installed on the host and use host path for validation
if _, err := os.Lstat("/host/usr/bin/nvidia-smi"); err == nil {
args = []string{"/host", "nvidia-smi"}
hostDriver = true
log.Infof("Detected pre-installed driver on the host")
return "/host", true
}

return "/run/nvidia/driver", false
}

// For driver container installs, check existence of .driver-ctr-ready to confirm running driver
// container has completed and is in Ready state.
func assertDriverContainerReady(silent, withWaitFlag bool) error {
command := "bash"
args := []string{"-c", "stat /run/nvidia/validations/.driver-ctr-ready"}

if withWaitFlag {
return runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent)
}

return runCommand(command, args, silent)
}

func (d *Driver) runValidation(silent bool) (hostDriver bool, err error) {
driverRoot, hostDriver := getDriverRoot()
if !hostDriver {
log.Infof("Driver is not pre-installed on the host. Checking driver container status.")
if err := assertDriverContainerReady(silent, withWaitFlag); err != nil {
return hostDriver, fmt.Errorf("error checking driver container status: %v", err)
}
}

// invoke validation command
command := "chroot"
args := []string{driverRoot, "nvidia-smi"}

if withWaitFlag {
return hostDriver, runCommandWithWait(command, args, sleepIntervalSecondsFlag, silent)
}
Expand Down

0 comments on commit 8f91d38

Please sign in to comment.