diff --git a/internal/operator-controller/controllers/clusterobjectset_controller.go b/internal/operator-controller/controllers/clusterobjectset_controller.go index 055f962ce..d06567c76 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller.go @@ -31,7 +31,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -344,29 +343,12 @@ type Sourcoser interface { } func (c *ClusterObjectSetReconciler) SetupWithManager(mgr ctrl.Manager) error { - skipProgressDeadlineExceededPredicate := predicate.Funcs{ - UpdateFunc: func(e event.UpdateEvent) bool { - rev, ok := e.ObjectNew.(*ocv1.ClusterObjectSet) - if !ok { - return true - } - // allow deletions to happen - if !rev.DeletionTimestamp.IsZero() { - return true - } - if cnd := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil && cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded { - return false - } - return true - }, - } c.Clock = clock.RealClock{} return ctrl.NewControllerManagedBy(mgr). For( &ocv1.ClusterObjectSet{}, builder.WithPredicates( predicate.ResourceVersionChangedPredicate{}, - skipProgressDeadlineExceededPredicate, ), ). WatchesRawSource( @@ -684,7 +666,31 @@ func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string) { } } +// markAsProgressing sets the Progressing condition to True with the given reason. +// +// Once ProgressDeadlineExceeded has been set for the current generation, this function +// becomes a no-op for every reason except Succeeded. This prevents a reconcile loop +// where the reconciler would set Progressing=True on each run only for the deadline +// check to immediately reset it back to ProgressDeadlineExceeded, producing an +// unnecessary status update that triggers another reconcile. +// +// If the generation changed (spec update) since ProgressDeadlineExceeded was recorded, +// the guard allows the update through so the condition reflects the new generation. +// +// Succeeded is allowed through so that a revision whose objects eventually finish +// rolling out can still complete normally. +// +// NOTE: new reasons added in the future will be blocked by default. If a new reason +// should be allowed to overwrite ProgressDeadlineExceeded, add it to the check below. func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string) { + if cnd := meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil && + cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded && + cnd.ObservedGeneration == cos.Generation && + reason != ocv1.ReasonSucceeded { + log.Log.V(1).Info("skipping markAsProgressing: ProgressDeadlineExceeded already set", + "requestedReason", reason, "revision", cos.Name) + return + } meta.SetStatusCondition(&cos.Status.Conditions, metav1.Condition{ Type: ocv1.ClusterObjectSetTypeProgressing, Status: metav1.ConditionTrue, diff --git a/internal/operator-controller/controllers/clusterobjectset_controller_internal_test.go b/internal/operator-controller/controllers/clusterobjectset_controller_internal_test.go index 15300f63c..345b51364 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller_internal_test.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller_internal_test.go @@ -6,6 +6,7 @@ import ( "time" "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -235,3 +236,113 @@ func (m *mockTrackingCacheInternal) Watch(ctx context.Context, user client.Objec func (m *mockTrackingCacheInternal) Source(h handler.EventHandler, predicates ...predicate.Predicate) source.Source { return nil } + +func Test_markAsProgressing_doesNotOverwriteProgressDeadlineExceeded(t *testing.T) { + for _, tc := range []struct { + name string + generation int64 + existingConditions []metav1.Condition + reason string + expectProgressing metav1.ConditionStatus + expectReason string + }{ + { + name: "sets Progressing when no condition exists", + generation: 1, + existingConditions: nil, + reason: ocv1.ReasonRollingOut, + expectProgressing: metav1.ConditionTrue, + expectReason: ocv1.ReasonRollingOut, + }, + { + name: "sets Progressing when currently RollingOut", + generation: 1, + existingConditions: []metav1.Condition{ + { + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionTrue, + Reason: ocv1.ReasonRollingOut, + ObservedGeneration: 1, + }, + }, + reason: ocv1.ReasonSucceeded, + expectProgressing: metav1.ConditionTrue, + expectReason: ocv1.ReasonSucceeded, + }, + { + name: "does not overwrite ProgressDeadlineExceeded with RollingOut (same generation)", + generation: 1, + existingConditions: []metav1.Condition{ + { + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + ObservedGeneration: 1, + }, + }, + reason: ocv1.ReasonRollingOut, + expectProgressing: metav1.ConditionFalse, + expectReason: ocv1.ReasonProgressDeadlineExceeded, + }, + { + name: "allows Succeeded to overwrite ProgressDeadlineExceeded", + generation: 1, + existingConditions: []metav1.Condition{ + { + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + ObservedGeneration: 1, + }, + }, + reason: ocv1.ReasonSucceeded, + expectProgressing: metav1.ConditionTrue, + expectReason: ocv1.ReasonSucceeded, + }, + { + name: "does not overwrite ProgressDeadlineExceeded with Retrying (same generation)", + generation: 1, + existingConditions: []metav1.Condition{ + { + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + ObservedGeneration: 1, + }, + }, + reason: ocv1.ClusterObjectSetReasonRetrying, + expectProgressing: metav1.ConditionFalse, + expectReason: ocv1.ReasonProgressDeadlineExceeded, + }, + { + name: "allows RollingOut when generation changed since ProgressDeadlineExceeded", + generation: 2, + existingConditions: []metav1.Condition{ + { + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + ObservedGeneration: 1, + }, + }, + reason: ocv1.ReasonRollingOut, + expectProgressing: metav1.ConditionTrue, + expectReason: ocv1.ReasonRollingOut, + }, + } { + t.Run(tc.name, func(t *testing.T) { + cos := &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{Generation: tc.generation}, + Status: ocv1.ClusterObjectSetStatus{ + Conditions: tc.existingConditions, + }, + } + markAsProgressing(cos, tc.reason, "test message") + + cnd := meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) + require.NotNil(t, cnd) + require.Equal(t, tc.expectProgressing, cnd.Status) + require.Equal(t, tc.expectReason, cnd.Reason) + }) + } +} diff --git a/internal/operator-controller/controllers/clusterobjectset_controller_test.go b/internal/operator-controller/controllers/clusterobjectset_controller_test.go index ccfb9755e..0610910bf 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller_test.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller_test.go @@ -1070,6 +1070,185 @@ func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadline(t *testing.T) { } } +// Test_ClusterObjectSetReconciler_Reconcile_ArchivalAfterProgressDeadlineExceeded verifies that +// a COS with ProgressDeadlineExceeded can still be archived. It simulates the real scenario: +// the COS starts as Active with ProgressDeadlineExceeded, then a spec patch sets +// lifecycleState to Archived (as a succeeding revision would do), and a subsequent +// reconcile processes the archival. +func Test_ClusterObjectSetReconciler_Reconcile_ArchivalAfterProgressDeadlineExceeded(t *testing.T) { + const clusterObjectSetName = "test-ext-1" + + testScheme := newScheme(t) + require.NoError(t, corev1.AddToScheme(testScheme)) + + ext := newTestClusterExtension() + rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme) + rev1.Finalizers = []string{"olm.operatorframework.io/teardown"} + meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{ + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + Message: "Revision has not rolled out for 1 minute(s).", + ObservedGeneration: rev1.Generation, + }) + + testClient := fake.NewClientBuilder(). + WithScheme(testScheme). + WithStatusSubresource(&ocv1.ClusterObjectSet{}). + WithObjects(rev1, ext). + Build() + + // Simulate the patch that a succeeding revision would apply. + patch := []byte(`{"spec":{"lifecycleState":"Archived"}}`) + require.NoError(t, testClient.Patch(t.Context(), + &ocv1.ClusterObjectSet{ObjectMeta: metav1.ObjectMeta{Name: clusterObjectSetName}}, + client.RawPatch(types.MergePatchType, patch), + )) + + mockEngine := &mockRevisionEngine{ + teardown: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionTeardownOption) (machinery.RevisionTeardownResult, error) { + return &mockRevisionTeardownResult{isComplete: true}, nil + }, + } + + result, err := (&controllers.ClusterObjectSetReconciler{ + Client: testClient, + RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine}, + TrackingCache: &mockTrackingCache{client: testClient}, + }).Reconcile(t.Context(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: clusterObjectSetName}, + }) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, result) + + rev := &ocv1.ClusterObjectSet{} + require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev)) + + cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionFalse, cond.Status) + require.Equal(t, ocv1.ClusterObjectSetReasonArchived, cond.Reason) + + cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionUnknown, cond.Status) + require.Equal(t, ocv1.ClusterObjectSetReasonArchived, cond.Reason) +} + +// Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_StaysSticky verifies that +// once ProgressDeadlineExceeded is set, it is not overwritten when the reconciler runs again +// and sees objects still in transition. The Progressing condition should stay at +// ProgressDeadlineExceeded instead of being set back to RollingOut. +func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_StaysSticky(t *testing.T) { + const clusterObjectSetName = "test-ext-1" + + testScheme := newScheme(t) + require.NoError(t, corev1.AddToScheme(testScheme)) + + ext := newTestClusterExtension() + rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme) + rev1.Spec.ProgressDeadlineMinutes = 1 + rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-5 * time.Minute)) + meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{ + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + Message: "Revision has not rolled out for 1 minute(s).", + ObservedGeneration: rev1.Generation, + }) + + testClient := fake.NewClientBuilder(). + WithScheme(testScheme). + WithStatusSubresource(&ocv1.ClusterObjectSet{}). + WithObjects(rev1, ext). + Build() + + mockEngine := &mockRevisionEngine{ + reconcile: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionReconcileOption) (machinery.RevisionResult, error) { + return &mockRevisionResult{inTransition: true}, nil + }, + } + + result, err := (&controllers.ClusterObjectSetReconciler{ + Client: testClient, + RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine}, + TrackingCache: &mockTrackingCache{client: testClient}, + }).Reconcile(t.Context(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: clusterObjectSetName}, + }) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, result) + + rev := &ocv1.ClusterObjectSet{} + require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev)) + + cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionFalse, cond.Status, "Progressing should stay False") + require.Equal(t, ocv1.ReasonProgressDeadlineExceeded, cond.Reason, "Reason should stay ProgressDeadlineExceeded") +} + +// Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_SucceededOverrides verifies +// that if a revision's objects eventually complete rolling out after the deadline was exceeded, +// the COS can still transition to Succeeded. ProgressDeadlineExceeded should not prevent a +// revision from completing when its objects become ready. +func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_SucceededOverrides(t *testing.T) { + const clusterObjectSetName = "test-ext-1" + + testScheme := newScheme(t) + require.NoError(t, corev1.AddToScheme(testScheme)) + + ext := newTestClusterExtension() + rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme) + rev1.Spec.ProgressDeadlineMinutes = 1 + rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-5 * time.Minute)) + meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{ + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + Message: "Revision has not rolled out for 1 minute(s).", + ObservedGeneration: rev1.Generation, + }) + + testClient := fake.NewClientBuilder(). + WithScheme(testScheme). + WithStatusSubresource(&ocv1.ClusterObjectSet{}). + WithObjects(rev1, ext). + Build() + + mockEngine := &mockRevisionEngine{ + reconcile: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionReconcileOption) (machinery.RevisionResult, error) { + return &mockRevisionResult{isComplete: true}, nil + }, + } + + result, err := (&controllers.ClusterObjectSetReconciler{ + Client: testClient, + RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine}, + TrackingCache: &mockTrackingCache{client: testClient}, + }).Reconcile(t.Context(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: clusterObjectSetName}, + }) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, result) + + rev := &ocv1.ClusterObjectSet{} + require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev)) + + cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionTrue, cond.Status, "Progressing should transition to True") + require.Equal(t, ocv1.ReasonSucceeded, cond.Reason, "Reason should be Succeeded") + + cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionTrue, cond.Status) + + cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionTrue, cond.Status) +} + func newTestClusterExtension() *ocv1.ClusterExtension { return &ocv1.ClusterExtension{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/e2e/features/update.feature b/test/e2e/features/update.feature index ab9408365..1899d4f2c 100644 --- a/test/e2e/features/update.feature +++ b/test/e2e/features/update.feature @@ -325,3 +325,36 @@ Feature: Update ClusterExtension And ClusterObjectSet "${NAME}-2" reports Progressing as True with Reason RollingOut And ClusterObjectSet "${NAME}-2" reports Available as False with Reason ProbeFailure + @BoxcutterRuntime + @ProgressDeadline + Scenario: Old revision with ProgressDeadlineExceeded is archived when a new revision succeeds + Given min value for ClusterExtension .spec.progressDeadlineMinutes is set to 1 + And min value for ClusterObjectSet .spec.progressDeadlineMinutes is set to 1 + When ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + progressDeadlineMinutes: 1 + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: 1.0.2 + upgradeConstraintPolicy: SelfCertified + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + Then ClusterObjectSet "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded + When ClusterExtension is updated to version "1.0.0" + Then ClusterExtension is rolled out + And ClusterExtension is available + And ClusterObjectSet "${NAME}-2" reports Progressing as True with Reason Succeeded + And ClusterObjectSet "${NAME}-1" is archived +