Skip to content

Commit

Permalink
[GPU Driver Upgrade] use common label to select DaemonSets for stateMap
Browse files Browse the repository at this point in the history
  • Loading branch information
tariq1890 committed Sep 26, 2023
1 parent edcaae9 commit bd806dd
Show file tree
Hide file tree
Showing 10 changed files with 24 additions and 9 deletions.
2 changes: 2 additions & 0 deletions assets/state-driver/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ metadata:
labels:
app: nvidia-driver-daemonset
nvidia.com/precompiled: "false"
app.kubernetes.io/component: nvidia-driver
name: nvidia-driver-daemonset
namespace: "FILLED BY THE OPERATOR"
annotations:
Expand All @@ -21,6 +22,7 @@ spec:
labels:
app: nvidia-driver-daemonset
nvidia.com/precompiled: "false"
app.kubernetes.io/component: nvidia-driver
spec:
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
Expand Down
15 changes: 6 additions & 9 deletions controllers/upgrade_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ const (
DriverLabelValue = "nvidia-driver-daemonset"
// UpgradeSkipDrainLabelSelector indicates the pod selector label to skip with drain
UpgradeSkipDrainLabelSelector = "nvidia.com/gpu-driver-upgrade-drain.skip!=true"
// AppComponentLabelKey indicates the label key of the component
AppComponentLabelKey = "app.kubernetes.io/component"
// AppComponentLabelValue indicates the label value of the nvidia-gpu-driver component
AppComponentLabelValue = "nvidia-driver"
)

//nolint
Expand Down Expand Up @@ -119,15 +123,8 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
clusterPolicyCtrl.operatorMetrics.driverAutoUpgradeEnabled.Set(driverAutoUpgradeEnabled)
}

driverLabelKey := DriverLabelKey
driverLabelValue := DriverLabelValue
if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled {
// For OCP, when DTK is enabled, the app=nvidia-driver-daemonset label is not constant and changes
// based on the RHCOS version. Hence, use the DTK label instead.
driverLabelKey = ocpDriverToolkitIdentificationLabel
driverLabelValue = ocpDriverToolkitIdentificationValue
}
state, err := r.StateManager.BuildState(ctx, clusterPolicyCtrl.operatorNamespace, map[string]string{driverLabelKey: driverLabelValue})
state, err := r.StateManager.BuildState(ctx, clusterPolicyCtrl.operatorNamespace,
map[string]string{AppComponentLabelKey: AppComponentLabelValue})
if err != nil {
r.Log.Error(err, "Failed to build cluster upgrade state")
return ctrl.Result{}, err
Expand Down
2 changes: 2 additions & 0 deletions internal/state/testdata/golden/driver-additional-configs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ metadata:
openshift.io/scc: nvidia-gpu-driver-ubuntu22.04
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
name: nvidia-gpu-driver-ubuntu22.04
namespace: test-operator
Expand All @@ -135,6 +136,7 @@ spec:
kubectl.kubernetes.io/default-container: nvidia-driver-ctr
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
spec:
containers:
Expand Down
2 changes: 2 additions & 0 deletions internal/state/testdata/golden/driver-gds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ metadata:
openshift.io/scc: nvidia-gpu-driver-ubuntu22.04
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
name: nvidia-gpu-driver-ubuntu22.04
namespace: test-operator
Expand All @@ -135,6 +136,7 @@ spec:
kubectl.kubernetes.io/default-container: nvidia-driver-ctr
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
spec:
containers:
Expand Down
2 changes: 2 additions & 0 deletions internal/state/testdata/golden/driver-minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ metadata:
openshift.io/scc: nvidia-gpu-driver-ubuntu22.04
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
name: nvidia-gpu-driver-ubuntu22.04
namespace: test-operator
Expand All @@ -135,6 +136,7 @@ spec:
kubectl.kubernetes.io/default-container: nvidia-driver-ctr
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
spec:
containers:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ metadata:
openshift.io/scc: nvidia-gpu-driver-openshift
labels:
app: nvidia-driver-daemonset-413.92.202304252344-0
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
openshift.driver-toolkit: "true"
openshift.driver-toolkit.rhcos: 413.92.202304252344-0
Expand All @@ -190,6 +191,7 @@ spec:
kubectl.kubernetes.io/default-container: nvidia-driver-ctr
labels:
app: nvidia-driver-daemonset-413.92.202304252344-0
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
openshift.driver-toolkit: "true"
spec:
Expand Down
2 changes: 2 additions & 0 deletions internal/state/testdata/golden/driver-precompiled.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ metadata:
openshift.io/scc: nvidia-gpu-driver-ubuntu22.04
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "true"
name: nvidia-gpu-driver-ubuntu22.04-5.4.0-150-generic
namespace: test-operator
Expand All @@ -135,6 +136,7 @@ spec:
kubectl.kubernetes.io/default-container: nvidia-driver-ctr
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "true"
spec:
containers:
Expand Down
2 changes: 2 additions & 0 deletions internal/state/testdata/golden/driver-rdma-hostmofed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ metadata:
openshift.io/scc: nvidia-gpu-driver-ubuntu22.04
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
name: nvidia-gpu-driver-ubuntu22.04
namespace: test-operator
Expand All @@ -135,6 +136,7 @@ spec:
kubectl.kubernetes.io/default-container: nvidia-driver-ctr
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
spec:
containers:
Expand Down
2 changes: 2 additions & 0 deletions internal/state/testdata/golden/driver-rdma.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ metadata:
openshift.io/scc: nvidia-gpu-driver-ubuntu22.04
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
name: nvidia-gpu-driver-ubuntu22.04
namespace: test-operator
Expand All @@ -135,6 +136,7 @@ spec:
kubectl.kubernetes.io/default-container: nvidia-driver-ctr
labels:
app: nvidia-driver-daemonset
app.kubernetes.io/component: nvidia-driver
nvidia.com/precompiled: "false"
spec:
containers:
Expand Down
2 changes: 2 additions & 0 deletions manifests/state-driver/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ metadata:
app: nvidia-driver-daemonset
{{- end }}
nvidia.com/precompiled: {{ toString (deref .Driver.Spec.UsePrecompiled) | quote }}
app.kubernetes.io/component: "nvidia-driver"
{{- if and (.Openshift) (.Runtime.OpenshiftDriverToolkitEnabled) }}
openshift.driver-toolkit.rhcos: {{ .Openshift.RHCOSVersion | quote }}
openshift.driver-toolkit: "true"
Expand Down Expand Up @@ -50,6 +51,7 @@ spec:
app: nvidia-driver-daemonset
{{- end }}
nvidia.com/precompiled: {{ toString (deref .Driver.Spec.UsePrecompiled) | quote }}
app.kubernetes.io/component: "nvidia-driver"
spec:
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
Expand Down

0 comments on commit bd806dd

Please sign in to comment.