Skip to content

Commit

Permalink
Merge branch 'add_node_watcher' into 'heterogeneous_drivers'
Browse files Browse the repository at this point in the history
Add watcher for node labels to reconcile NVIDIADriver instances

See merge request nvidia/kubernetes/gpu-operator!860
  • Loading branch information
shivamerla committed Oct 4, 2023
2 parents 77bff8b + 024b894 commit cc33ceb
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 10 deletions.
5 changes: 2 additions & 3 deletions controllers/clusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,11 +185,11 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques

errStr := fmt.Sprintf("ClusterPolicy is not ready, states not ready: %v", statesNotReady)
r.Log.Error(nil, errStr)
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, errStr)
if condErr != nil {
r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error())
}
updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady)
return ctrl.Result{RequeueAfter: time.Second * 5}, nil
}

Expand Down Expand Up @@ -322,7 +322,6 @@ func addWatchNewGPUNode(ctx context.Context, r *ClusterPolicyReconciler, c contr
"osTreeLabelChanged", osTreeLabelChanged,
)
}

return needsUpdate
},
DeleteFunc: func(e event.DeleteEvent) bool {
Expand Down Expand Up @@ -361,7 +360,7 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl
r.conditionUpdater = conditions.NewClusterPolicyUpdater(mgr.GetClient())

// Watch for changes to primary resource ClusterPolicy
err = c.Watch(source.Kind(mgr.GetCache(), &gpuv1.ClusterPolicy{}), &handler.EnqueueRequestForObject{})
err = c.Watch(source.Kind(mgr.GetCache(), &gpuv1.ClusterPolicy{}), &handler.EnqueueRequestForObject{}, predicate.GenerationChangedPredicate{})
if err != nil {
return err
}
Expand Down
44 changes: 41 additions & 3 deletions controllers/nvidiadriver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,19 @@ package controllers
import (
"context"
"fmt"
"maps"
"time"

"github.com/NVIDIA/k8s-operator-libs/pkg/consts"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/workqueue"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
Expand Down Expand Up @@ -179,7 +182,6 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
if condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, "Reconciled", "All resources have been successfully reconciled"); condErr != nil {
return ctrl.Result{}, condErr
}

return reconcile.Result{}, nil
}

Expand Down Expand Up @@ -212,7 +214,7 @@ func (r *NVIDIADriverReconciler) updateCrStatus(
}

// SetupWithManager sets up the controller with the Manager.
func (r *NVIDIADriverReconciler) SetupWithManager(mgr ctrl.Manager) error {
func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
// Create state manager
stateManager, err := state.NewManager(
nvidiav1alpha1.NVIDIADriverCRDName,
Expand Down Expand Up @@ -240,7 +242,7 @@ func (r *NVIDIADriverReconciler) SetupWithManager(mgr ctrl.Manager) error {
}

// Watch for changes to the primary resource NVIDIaDriver
err = c.Watch(source.Kind(mgr.GetCache(), &nvidiav1alpha1.NVIDIADriver{}), &handler.EnqueueRequestForObject{})
err = c.Watch(source.Kind(mgr.GetCache(), &nvidiav1alpha1.NVIDIADriver{}), &handler.EnqueueRequestForObject{}, predicate.GenerationChangedPredicate{})
if err != nil {
return err
}
Expand Down Expand Up @@ -281,6 +283,42 @@ func (r *NVIDIADriverReconciler) SetupWithManager(mgr ctrl.Manager) error {
return err
}

nodePredicate := predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
labels := e.Object.GetLabels()
return hasGPULabels(labels)
},
UpdateFunc: func(e event.UpdateEvent) bool {
logger := log.FromContext(ctx)
newLabels := e.ObjectNew.GetLabels()
oldLabels := e.ObjectOld.GetLabels()
nodeName := e.ObjectNew.GetName()

needsUpdate := hasGPULabels(newLabels) && !maps.Equal(newLabels, oldLabels)

if needsUpdate {
logger.Info("Node labels have been changed",
"name", nodeName,
)
}
return needsUpdate
},
DeleteFunc: func(e event.DeleteEvent) bool {
labels := e.Object.GetLabels()
return hasGPULabels(labels)
},
}

// Watch for changes to node labels
err = c.Watch(
source.Kind(mgr.GetCache(), &corev1.Node{}),
handler.EnqueueRequestsFromMapFunc(mapFn),
nodePredicate,
)
if err != nil {
return err
}

// Watch for changes to secondary resources which each state manager manages
watchSources := stateManager.GetWatchSources(mgr)
for _, watchSource := range watchSources {
Expand Down
4 changes: 2 additions & 2 deletions controllers/upgrade_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,15 +209,15 @@ func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) er
// SetupWithManager sets up the controller with the Manager.
//
//nolint:dupl
func (r *UpgradeReconciler) SetupWithManager(mgr ctrl.Manager) error {
func (r *UpgradeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
// Create a new controller
c, err := controller.New("upgrade-controller", mgr, controller.Options{Reconciler: r, MaxConcurrentReconciles: 1, RateLimiter: workqueue.NewItemExponentialFailureRateLimiter(minDelayCR, maxDelayCR)})
if err != nil {
return err
}

// Watch for changes to primary resource ClusterPolicy
err = c.Watch(source.Kind(mgr.GetCache(), &gpuv1.ClusterPolicy{}), &handler.EnqueueRequestForObject{})
err = c.Watch(source.Kind(mgr.GetCache(), &gpuv1.ClusterPolicy{}), &handler.EnqueueRequestForObject{}, predicate.GenerationChangedPredicate{})
if err != nil {
return err
}
Expand Down
4 changes: 2 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func main() {
Log: upgradeLogger,
Scheme: mgr.GetScheme(),
StateManager: clusterUpgradeStateManager,
}).SetupWithManager(mgr); err != nil {
}).SetupWithManager(ctx, mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Upgrade")
os.Exit(1)
}
Expand All @@ -158,7 +158,7 @@ func main() {
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
ClusterInfo: clusterInfo,
}).SetupWithManager(mgr); err != nil {
}).SetupWithManager(ctx, mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NVIDIADriver")
os.Exit(1)
}
Expand Down

0 comments on commit cc33ceb

Please sign in to comment.