From dd325c15d2902e1951b61280c0d664becdf5d35b Mon Sep 17 00:00:00 2001 From: Artur Zych <5843875+azych@users.noreply.github.com> Date: Fri, 14 Feb 2025 13:53:37 +0100 Subject: [PATCH 1/2] Handle interrupted helm releases in applier Introduces a workaround for 'interrupted' helm releases which enter into a 'pending' (-install/uninstall/rollback) state. If that happens, for example because of immediate application exit with one of those operations being in flight, helm is not able to resolve it automatically which means we end up in a permanent reconcile error state. One of the workarounds for this that has been repeated in the community is to remove metadata of the pending release, which is the solution chosen here. For full context see: https://github.com/operator-framework/operator-controller/pull/1776 https://github.com/helm/helm/issues/5595 --- go.mod | 2 +- go.sum | 4 +- .../operator-controller/action/helm_test.go | 6 ++ internal/operator-controller/applier/helm.go | 26 +++++- .../operator-controller/applier/helm_test.go | 81 +++++++++++++++++++ .../clusterextension_controller_test.go | 5 ++ 6 files changed, 119 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 7190953db..0f33333ab 100644 --- a/go.mod +++ b/go.mod @@ -19,7 +19,7 @@ require ( github.com/opencontainers/image-spec v1.1.1 github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 github.com/operator-framework/api v0.31.0 - github.com/operator-framework/helm-operator-plugins v0.8.0 + github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b github.com/operator-framework/operator-registry v1.55.0 github.com/prometheus/client_golang v1.22.0 github.com/spf13/cobra v1.9.1 diff --git a/go.sum b/go.sum index 812840e25..946953a2a 100644 --- a/go.sum +++ b/go.sum @@ -402,8 +402,8 @@ github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11 h1:eT github.com/openshift/crd-schema-checker v0.0.0-20240404194209-35a9033b1d11/go.mod h1:EmVJt97N+pfWFsli/ipXTBZqSG5F5KGQhm3c3IsGq1o= github.com/operator-framework/api v0.31.0 h1:tRsFTuZ51xD8U5QgiPo3+mZgVipHZVgRXYrI6RRXOh8= github.com/operator-framework/api v0.31.0/go.mod h1:57oCiHNeWcxmzu1Se8qlnwEKr/GGXnuHvspIYFCcXmY= -github.com/operator-framework/helm-operator-plugins v0.8.0 h1:0f6HOQC5likkf0b/OvGvw7nhDb6h8Cj5twdCNjwNzMc= -github.com/operator-framework/helm-operator-plugins v0.8.0/go.mod h1:Sc+8bE38xTCgCChBUvtq/PxatEg9fAypr7S5iAw8nlA= +github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b h1:andoOL7lpEafTXdjPGDuNLJlJQh0UmRzgj0+D31K8HU= +github.com/operator-framework/helm-operator-plugins v0.8.1-0.20250222123620-6261f2574b3b/go.mod h1:qioyxwOSI89RAtTWhccc+RyaPQdKTTJRc6sFkT6kqys= github.com/operator-framework/operator-lib v0.17.0 h1:cbz51wZ9+GpWR1ZYP4CSKSSBxDlWxmmnseaHVZZjZt4= github.com/operator-framework/operator-lib v0.17.0/go.mod h1:TGopBxIE8L6E/Cojzo26R3NFp1eNlqhQNmzqhOblaLw= github.com/operator-framework/operator-registry v1.55.0 h1:iXlv53fYyg2VtLqSDEalXD72/5Uzc7Rfx17j35+8plA= diff --git a/internal/operator-controller/action/helm_test.go b/internal/operator-controller/action/helm_test.go index 3f1f02f9d..b301bcd4c 100644 --- a/internal/operator-controller/action/helm_test.go +++ b/internal/operator-controller/action/helm_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + "helm.sh/helm/v3/pkg/action" "helm.sh/helm/v3/pkg/chart" "helm.sh/helm/v3/pkg/release" "sigs.k8s.io/controller-runtime/pkg/client" @@ -70,6 +71,11 @@ func (m *mockActionClient) Reconcile(rel *release.Release) error { return args.Error(0) } +func (m *mockActionClient) Config() *action.Configuration { + args := m.Called() + return args.Get(0).(*action.Configuration) +} + var _ actionclient.ActionClientGetter = &mockActionClientGetter{} type mockActionClientGetter struct { diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index cc47cc5a3..dba3ebc25 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -143,7 +143,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte return nil, "", err } - rel, desiredRel, state, err := h.getReleaseState(ac, ext, chrt, values, post) + rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post) if err != nil { return nil, "", fmt.Errorf("failed to get release state using server-side dry-run: %w", err) } @@ -240,9 +240,30 @@ func (h *Helm) renderClientOnlyRelease(ctx context.Context, ext *ocv1.ClusterExt }, helmclient.AppendInstallPostRenderer(post)) } -func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) { +func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterface, ext *ocv1.ClusterExtension, chrt *chart.Chart, values chartutil.Values, post postrender.PostRenderer) (*release.Release, *release.Release, string, error) { + logger := log.FromContext(ctx) currentRelease, err := cl.Get(ext.GetName()) + + // if a release is pending at this point, that means that a helm action + // (installation/upgrade) we were attempting was likely interrupted in-flight. + // Pending release would leave us in reconciliation error loop because helm + // wouldn't be able to progress automatically from it. + // + // one of the workarounds is to try and remove helm metadata relating to + // that pending release which should 'reset' its state communicated to helm + // and the next reconciliation should be able to successfully pick up from here + // for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476 + // and the discussion in https://github.com/operator-framework/operator-controller/pull/1776 + if err == nil && currentRelease.Info.Status.IsPending() { + if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil { + return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err) + } + // return error to try to detect proper state (installation/upgrade) at next reconciliation + return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version) + } + if errors.Is(err, driver.ErrReleaseNotFound) { + logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName()) desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error { i.DryRun = true i.DryRunOption = "server" @@ -258,6 +279,7 @@ func (h *Helm) getReleaseState(cl helmclient.ActionInterface, ext *ocv1.ClusterE } desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error { + logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName()) upgrade.MaxHistory = maxHelmReleaseHistory upgrade.DryRun = true upgrade.DryRunOption = "server" diff --git a/internal/operator-controller/applier/helm_test.go b/internal/operator-controller/applier/helm_test.go index 66017eafa..2dbdaf124 100644 --- a/internal/operator-controller/applier/helm_test.go +++ b/internal/operator-controller/applier/helm_test.go @@ -13,6 +13,7 @@ import ( "helm.sh/helm/v3/pkg/action" "helm.sh/helm/v3/pkg/chart" "helm.sh/helm/v3/pkg/release" + "helm.sh/helm/v3/pkg/storage" "helm.sh/helm/v3/pkg/storage/driver" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -66,6 +67,7 @@ type mockActionGetter struct { reconcileErr error desiredRel *release.Release currentRel *release.Release + config *action.Configuration } func (mag *mockActionGetter) ActionClientFor(ctx context.Context, obj client.Object) (helmclient.ActionInterface, error) { @@ -114,6 +116,10 @@ func (mag *mockActionGetter) Reconcile(rel *release.Release) error { return mag.reconcileErr } +func (mag *mockActionGetter) Config() *action.Configuration { + return mag.config +} + var ( // required for unmockable call to convert.RegistryV1ToHelmChart validFS = fstest.MapFS{ @@ -223,6 +229,80 @@ func TestApply_Base(t *testing.T) { }) } +func TestApply_InterruptedRelease(t *testing.T) { + t.Run("fails removing an interrupted install release", func(t *testing.T) { + testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}} + testStorage := storage.Init(driver.NewMemory()) + + mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}} + helmApplier := applier.Helm{ + ActionClientGetter: mockAcg, + BundleToHelmChartConverter: &convert.BundleToHelmChartConverter{}, + } + + objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) + require.Error(t, err) + require.ErrorContains(t, err, "removing interrupted release") + require.Nil(t, objs) + require.Empty(t, state) + }) + + t.Run("fails removing an interrupted upgrade release", func(t *testing.T) { + testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}} + testStorage := storage.Init(driver.NewMemory()) + + mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}} + helmApplier := applier.Helm{ + ActionClientGetter: mockAcg, + BundleToHelmChartConverter: &convert.BundleToHelmChartConverter{}, + } + + objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) + require.Error(t, err) + require.ErrorContains(t, err, "removing interrupted release") + require.Nil(t, objs) + require.Empty(t, state) + }) + + t.Run("successfully removes an interrupted install release", func(t *testing.T) { + testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}} + testStorage := storage.Init(driver.NewMemory()) + err := testStorage.Create(testRel) + require.NoError(t, err) + + mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}} + helmApplier := applier.Helm{ + ActionClientGetter: mockAcg, + BundleToHelmChartConverter: &convert.BundleToHelmChartConverter{}, + } + + objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) + require.Error(t, err) + require.ErrorContains(t, err, "removed interrupted release") + require.Nil(t, objs) + require.Empty(t, state) + }) + + t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) { + testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}} + testStorage := storage.Init(driver.NewMemory()) + err := testStorage.Create(testRel) + require.NoError(t, err) + + mockAcg := &mockActionGetter{currentRel: testRel, config: &action.Configuration{Releases: testStorage}} + helmApplier := applier.Helm{ + ActionClientGetter: mockAcg, + BundleToHelmChartConverter: &convert.BundleToHelmChartConverter{}, + } + + objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) + require.Error(t, err) + require.ErrorContains(t, err, "removed interrupted release") + require.Nil(t, objs) + require.Empty(t, state) + }) +} + func TestApply_Installation(t *testing.T) { t.Run("fails during dry-run installation", func(t *testing.T) { mockAcg := &mockActionGetter{ @@ -439,6 +519,7 @@ func TestApply_Upgrade(t *testing.T) { t.Run("fails during dry-run upgrade", func(t *testing.T) { mockAcg := &mockActionGetter{ + currentRel: testCurrentRelease, dryRunUpgradeErr: errors.New("failed attempting to dry-run upgrade chart"), } helmApplier := applier.Helm{ diff --git a/internal/operator-controller/controllers/clusterextension_controller_test.go b/internal/operator-controller/controllers/clusterextension_controller_test.go index be61891a0..1531c9318 100644 --- a/internal/operator-controller/controllers/clusterextension_controller_test.go +++ b/internal/operator-controller/controllers/clusterextension_controller_test.go @@ -12,6 +12,7 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "helm.sh/helm/v3/pkg/action" "helm.sh/helm/v3/pkg/chart" "helm.sh/helm/v3/pkg/release" "helm.sh/helm/v3/pkg/storage/driver" @@ -1411,6 +1412,10 @@ func (mag *MockActionGetter) Reconcile(rel *release.Release) error { return nil } +func (mag *MockActionGetter) Config() *action.Configuration { + return nil +} + func TestGetInstalledBundleHistory(t *testing.T) { getter := controllers.DefaultInstalledBundleGetter{} From 6e652064cb6d48d6bf0f2e53caab2008d725e467 Mon Sep 17 00:00:00 2001 From: Artur Zych <5843875+azych@users.noreply.github.com> Date: Mon, 10 Mar 2025 15:52:02 +0100 Subject: [PATCH 2/2] move pending release handling out of getReleaseState --- internal/operator-controller/applier/helm.go | 45 +++++++++++-------- .../operator-controller/applier/helm_test.go | 18 ++++---- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index dba3ebc25..96e84a887 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -39,6 +39,11 @@ const ( maxHelmReleaseHistory = 10 ) +var ( + errPendingRelease = errors.New("release is in a pending state") + errBundleToHelmChartConverterNil = errors.New("BundleToHelmChartConverter is nil") +) + // Preflight is a check that should be run before making any changes to the cluster type Preflight interface { // Install runs checks that should be successful prior @@ -144,6 +149,23 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte } rel, desiredRel, state, err := h.getReleaseState(ctx, ac, ext, chrt, values, post) + // if a release is pending, that means that a helm action + // (installation/upgrade) we were attempting was likely interrupted in-flight. + // Pending release would leave us in reconciliation error loop because helm + // wouldn't be able to progress automatically from it. + // + // one of the workarounds is to try and remove helm metadata relating to + // that pending release which should 'reset' its state communicated to helm + // and the next reconciliation should be able to successfully pick up from here + // for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476 + // and the discussion in https://github.com/operator-framework/operator-controller/pull/1776 + if errors.Is(err, errPendingRelease) { + if _, err := ac.Config().Releases.Delete(rel.Name, rel.Version); err != nil { + return nil, "", fmt.Errorf("failed removing pending release %q version %d metadata: %w", rel.Name, rel.Version, err) + } + // return an error to try to detect proper state (installation/upgrade) at next reconciliation + return nil, "", fmt.Errorf("removed pending release %q version %d metadata", rel.Name, rel.Version) + } if err != nil { return nil, "", fmt.Errorf("failed to get release state using server-side dry-run: %w", err) } @@ -203,7 +225,7 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) { if h.BundleToHelmChartConverter == nil { - return nil, errors.New("BundleToHelmChartConverter is nil") + return nil, errBundleToHelmChartConverterNil } watchNamespace, err := GetWatchNamespace(ext) if err != nil { @@ -244,24 +266,6 @@ func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterfac logger := log.FromContext(ctx) currentRelease, err := cl.Get(ext.GetName()) - // if a release is pending at this point, that means that a helm action - // (installation/upgrade) we were attempting was likely interrupted in-flight. - // Pending release would leave us in reconciliation error loop because helm - // wouldn't be able to progress automatically from it. - // - // one of the workarounds is to try and remove helm metadata relating to - // that pending release which should 'reset' its state communicated to helm - // and the next reconciliation should be able to successfully pick up from here - // for context see: https://github.com/helm/helm/issues/5595 and https://github.com/helm/helm/issues/7476 - // and the discussion in https://github.com/operator-framework/operator-controller/pull/1776 - if err == nil && currentRelease.Info.Status.IsPending() { - if _, err = cl.Config().Releases.Delete(currentRelease.Name, currentRelease.Version); err != nil { - return nil, nil, StateError, fmt.Errorf("failed removing interrupted release %q version %d metadata: %w", currentRelease.Name, currentRelease.Version, err) - } - // return error to try to detect proper state (installation/upgrade) at next reconciliation - return nil, nil, StateError, fmt.Errorf("removed interrupted release %q version %d metadata", currentRelease.Name, currentRelease.Version) - } - if errors.Is(err, driver.ErrReleaseNotFound) { logger.V(4).Info("ClusterExtension dry-run install", "extension", ext.GetName()) desiredRelease, err := cl.Install(ext.GetName(), ext.Spec.Namespace, chrt, values, func(i *action.Install) error { @@ -277,6 +281,9 @@ func (h *Helm) getReleaseState(ctx context.Context, cl helmclient.ActionInterfac if err != nil { return nil, nil, StateError, err } + if currentRelease.Info.Status.IsPending() { + return currentRelease, nil, StateError, errPendingRelease + } desiredRelease, err := cl.Upgrade(ext.GetName(), ext.Spec.Namespace, chrt, values, func(upgrade *action.Upgrade) error { logger.V(4).Info("ClusterExtension dry-run upgrade", "extension", ext.GetName()) diff --git a/internal/operator-controller/applier/helm_test.go b/internal/operator-controller/applier/helm_test.go index 2dbdaf124..19ffb6469 100644 --- a/internal/operator-controller/applier/helm_test.go +++ b/internal/operator-controller/applier/helm_test.go @@ -229,8 +229,8 @@ func TestApply_Base(t *testing.T) { }) } -func TestApply_InterruptedRelease(t *testing.T) { - t.Run("fails removing an interrupted install release", func(t *testing.T) { +func TestApply_PendingRelease(t *testing.T) { + t.Run("fails removing a pending install release", func(t *testing.T) { testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}} testStorage := storage.Init(driver.NewMemory()) @@ -242,12 +242,12 @@ func TestApply_InterruptedRelease(t *testing.T) { objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) require.Error(t, err) - require.ErrorContains(t, err, "removing interrupted release") + require.ErrorContains(t, err, "removing pending release") require.Nil(t, objs) require.Empty(t, state) }) - t.Run("fails removing an interrupted upgrade release", func(t *testing.T) { + t.Run("fails removing a pending upgrade release", func(t *testing.T) { testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}} testStorage := storage.Init(driver.NewMemory()) @@ -259,12 +259,12 @@ func TestApply_InterruptedRelease(t *testing.T) { objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) require.Error(t, err) - require.ErrorContains(t, err, "removing interrupted release") + require.ErrorContains(t, err, "removing pending release") require.Nil(t, objs) require.Empty(t, state) }) - t.Run("successfully removes an interrupted install release", func(t *testing.T) { + t.Run("successfully removes a pending install release", func(t *testing.T) { testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingInstall}} testStorage := storage.Init(driver.NewMemory()) err := testStorage.Create(testRel) @@ -278,12 +278,12 @@ func TestApply_InterruptedRelease(t *testing.T) { objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) require.Error(t, err) - require.ErrorContains(t, err, "removed interrupted release") + require.ErrorContains(t, err, "removed pending release") require.Nil(t, objs) require.Empty(t, state) }) - t.Run("successfully removes an interrupted upgrade release", func(t *testing.T) { + t.Run("successfully removes an pending upgrade release", func(t *testing.T) { testRel := &release.Release{Name: "testrel", Version: 0, Info: &release.Info{Status: release.StatusPendingUpgrade}} testStorage := storage.Init(driver.NewMemory()) err := testStorage.Create(testRel) @@ -297,7 +297,7 @@ func TestApply_InterruptedRelease(t *testing.T) { objs, state, err := helmApplier.Apply(context.TODO(), validFS, testCE, testObjectLabels, testStorageLabels) require.Error(t, err) - require.ErrorContains(t, err, "removed interrupted release") + require.ErrorContains(t, err, "removed pending release") require.Nil(t, objs) require.Empty(t, state) })