diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index 1ac447b4c..30b7dec0c 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -185,11 +185,11 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques errStr := fmt.Sprintf("ClusterPolicy is not ready, states not ready: %v", statesNotReady) r.Log.Error(nil, errStr) + updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady) condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, errStr) if condErr != nil { r.Log.V(consts.LogLevelDebug).Error(nil, condErr.Error()) } - updateCRState(ctx, r, req.NamespacedName, gpuv1.NotReady) return ctrl.Result{RequeueAfter: time.Second * 5}, nil } @@ -322,7 +322,6 @@ func addWatchNewGPUNode(ctx context.Context, r *ClusterPolicyReconciler, c contr "osTreeLabelChanged", osTreeLabelChanged, ) } - return needsUpdate }, DeleteFunc: func(e event.DeleteEvent) bool { @@ -361,7 +360,7 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl r.conditionUpdater = conditions.NewClusterPolicyUpdater(mgr.GetClient()) // Watch for changes to primary resource ClusterPolicy - err = c.Watch(source.Kind(mgr.GetCache(), &gpuv1.ClusterPolicy{}), &handler.EnqueueRequestForObject{}) + err = c.Watch(source.Kind(mgr.GetCache(), &gpuv1.ClusterPolicy{}), &handler.EnqueueRequestForObject{}, predicate.GenerationChangedPredicate{}) if err != nil { return err } diff --git a/controllers/nvidiadriver_controller.go b/controllers/nvidiadriver_controller.go index f2975abb3..0707d69a0 100644 --- a/controllers/nvidiadriver_controller.go +++ b/controllers/nvidiadriver_controller.go @@ -19,9 +19,11 @@ package controllers import ( "context" "fmt" + "maps" "time" "github.com/NVIDIA/k8s-operator-libs/pkg/consts" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -29,6 +31,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -179,7 +182,6 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request if condErr = r.conditionUpdater.SetConditionsReady(ctx, instance, "Reconciled", "All resources have been successfully reconciled"); condErr != nil { return ctrl.Result{}, condErr } - return reconcile.Result{}, nil } @@ -212,7 +214,7 @@ func (r *NVIDIADriverReconciler) updateCrStatus( } // SetupWithManager sets up the controller with the Manager. -func (r *NVIDIADriverReconciler) SetupWithManager(mgr ctrl.Manager) error { +func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { // Create state manager stateManager, err := state.NewManager( nvidiav1alpha1.NVIDIADriverCRDName, @@ -240,7 +242,7 @@ func (r *NVIDIADriverReconciler) SetupWithManager(mgr ctrl.Manager) error { } // Watch for changes to the primary resource NVIDIaDriver - err = c.Watch(source.Kind(mgr.GetCache(), &nvidiav1alpha1.NVIDIADriver{}), &handler.EnqueueRequestForObject{}) + err = c.Watch(source.Kind(mgr.GetCache(), &nvidiav1alpha1.NVIDIADriver{}), &handler.EnqueueRequestForObject{}, predicate.GenerationChangedPredicate{}) if err != nil { return err } @@ -281,6 +283,42 @@ func (r *NVIDIADriverReconciler) SetupWithManager(mgr ctrl.Manager) error { return err } + nodePredicate := predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + labels := e.Object.GetLabels() + return hasGPULabels(labels) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + logger := log.FromContext(ctx) + newLabels := e.ObjectNew.GetLabels() + oldLabels := e.ObjectOld.GetLabels() + nodeName := e.ObjectNew.GetName() + + needsUpdate := hasGPULabels(newLabels) && !maps.Equal(newLabels, oldLabels) + + if needsUpdate { + logger.Info("Node labels have been changed", + "name", nodeName, + ) + } + return needsUpdate + }, + DeleteFunc: func(e event.DeleteEvent) bool { + labels := e.Object.GetLabels() + return hasGPULabels(labels) + }, + } + + // Watch for changes to node labels + err = c.Watch( + source.Kind(mgr.GetCache(), &corev1.Node{}), + handler.EnqueueRequestsFromMapFunc(mapFn), + nodePredicate, + ) + if err != nil { + return err + } + // Watch for changes to secondary resources which each state manager manages watchSources := stateManager.GetWatchSources(mgr) for _, watchSource := range watchSources { diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index 58e7d16a8..ea413990f 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -209,7 +209,7 @@ func (r *UpgradeReconciler) removeNodeUpgradeStateLabels(ctx context.Context) er // SetupWithManager sets up the controller with the Manager. // //nolint:dupl -func (r *UpgradeReconciler) SetupWithManager(mgr ctrl.Manager) error { +func (r *UpgradeReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { // Create a new controller c, err := controller.New("upgrade-controller", mgr, controller.Options{Reconciler: r, MaxConcurrentReconciles: 1, RateLimiter: workqueue.NewItemExponentialFailureRateLimiter(minDelayCR, maxDelayCR)}) if err != nil { @@ -217,7 +217,7 @@ func (r *UpgradeReconciler) SetupWithManager(mgr ctrl.Manager) error { } // Watch for changes to primary resource ClusterPolicy - err = c.Watch(source.Kind(mgr.GetCache(), &gpuv1.ClusterPolicy{}), &handler.EnqueueRequestForObject{}) + err = c.Watch(source.Kind(mgr.GetCache(), &gpuv1.ClusterPolicy{}), &handler.EnqueueRequestForObject{}, predicate.GenerationChangedPredicate{}) if err != nil { return err } diff --git a/main.go b/main.go index 93c8bf6fe..eaf8a1853 100644 --- a/main.go +++ b/main.go @@ -139,7 +139,7 @@ func main() { Log: upgradeLogger, Scheme: mgr.GetScheme(), StateManager: clusterUpgradeStateManager, - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Upgrade") os.Exit(1) } @@ -158,7 +158,7 @@ func main() { Client: mgr.GetClient(), Scheme: mgr.GetScheme(), ClusterInfo: clusterInfo, - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "NVIDIADriver") os.Exit(1) }