From 6f691dd82b05377b0929df2402064f5f9cac50b0 Mon Sep 17 00:00:00 2001 From: Luka Skugor Date: Wed, 29 Apr 2026 15:55:46 +0000 Subject: [PATCH 1/3] feat: add BakeFailureDisabled and DeploymentBlocked rollout conditions Surface two recovery-mode signals that previously had no observable state: - BakeFailureDisabled: True when an active deploy cannot be failed by health check errors. Either because the previous bake was not Succeeded (rollback recovery), or because a manual deploy proceeded over unhealthy health checks (force-deploy during incident) and bake has not yet started. - DeploymentBlocked: True when automatic deployment of new versions is blocked by unhealthy health checks. Set independently of gate blocking so both blockers surface concurrently in the UI. To support the force-deploy-during-incident case, history entries created via manual deploy while at least one health check is Unhealthy are tagged with DeployedWithUnhealthyHealthChecks=true. handleBakeTime treats these entries as recovery mode (shouldFail=false) for both the deploy-timeout and health-check-error paths until BakeStartTime is set, so transient errors during the incident do not fail a deployment that was knowingly issued into a broken state. Adds 22 specs in recovery_mode_test.go covering the condition helpers, the recovery flag's effect on shouldFail in both paths, and the DeploymentBlocked + gate concurrent-block regression. Co-Authored-By: Claude Sonnet 4.6 --- api/v1alpha1/rollout_types.go | 13 + api/v1alpha1/zz_generated.deepcopy.go | 5 + config/crd/bases/kuberik.com_rollouts.yaml | 8 + internal/controller/recovery_mode_test.go | 545 +++++++++++++++++++++ internal/controller/rollout_controller.go | 179 +++++-- 5 files changed, 719 insertions(+), 31 deletions(-) create mode 100644 internal/controller/recovery_mode_test.go diff --git a/api/v1alpha1/rollout_types.go b/api/v1alpha1/rollout_types.go index d1db042..fb1cc26 100644 --- a/api/v1alpha1/rollout_types.go +++ b/api/v1alpha1/rollout_types.go @@ -339,6 +339,14 @@ type DeploymentHistoryEntry struct { // ignored as they reflect the pre-retry state. // +optional LastRetryTimestamp *metav1.Time `json:"lastRetryTimestamp,omitempty"` + + // DeployedWithUnhealthyHealthChecks records that this entry was created via a manual + // deployment (WantedVersion or force-deploy) while at least one health check was already + // unhealthy. The rollout treats this as recovery-mode-during-incident: while the bake has + // not yet started (BakeStartTime is nil), health check errors do not fail the rollout, + // since the user explicitly chose to deploy into an active incident. + // +optional + DeployedWithUnhealthyHealthChecks *bool `json:"deployedWithUnhealthyHealthChecks,omitempty"` } // +kubebuilder:object:root=true @@ -382,6 +390,11 @@ const ( RolloutBakeTimeRetrying = "BakeTimeRetrying" // RolloutInvalidBakeTimeConfiguration means the bake time configuration is invalid. RolloutInvalidBakeTimeConfiguration = "InvalidBakeTimeConfiguration" + // RolloutDeploymentBlocked means deployment is blocked because health checks are unhealthy. + RolloutDeploymentBlocked = "DeploymentBlocked" + // RolloutBakeFailureDisabled means health check failures will not fail the current deployment. + // This occurs when the previous deployment also failed, allowing recovery without cascading failures. + RolloutBakeFailureDisabled = "BakeFailureDisabled" ) const ( diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 6807cae..00d53a5 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -217,6 +217,11 @@ func (in *DeploymentHistoryEntry) DeepCopyInto(out *DeploymentHistoryEntry) { in, out := &in.LastRetryTimestamp, &out.LastRetryTimestamp *out = (*in).DeepCopy() } + if in.DeployedWithUnhealthyHealthChecks != nil { + in, out := &in.DeployedWithUnhealthyHealthChecks, &out.DeployedWithUnhealthyHealthChecks + *out = new(bool) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeploymentHistoryEntry. diff --git a/config/crd/bases/kuberik.com_rollouts.yaml b/config/crd/bases/kuberik.com_rollouts.yaml index 47b9730..914d883 100644 --- a/config/crd/bases/kuberik.com_rollouts.yaml +++ b/config/crd/bases/kuberik.com_rollouts.yaml @@ -429,6 +429,14 @@ spec: BakeStatusMessage provides details about the bake state for this deployment This field contains human-readable information about why the bake status is what it is. type: string + deployedWithUnhealthyHealthChecks: + description: |- + DeployedWithUnhealthyHealthChecks records that this entry was created via a manual + deployment (WantedVersion or force-deploy) while at least one health check was already + unhealthy. The rollout treats this as recovery-mode-during-incident: while the bake has + not yet started (BakeStartTime is nil), health check errors do not fail the rollout, + since the user explicitly chose to deploy into an active incident. + type: boolean failedHealthChecks: description: |- FailedHealthChecks contains all health checks that failed during bake. diff --git a/internal/controller/recovery_mode_test.go b/internal/controller/recovery_mode_test.go new file mode 100644 index 0000000..194ac58 --- /dev/null +++ b/internal/controller/recovery_mode_test.go @@ -0,0 +1,545 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package controller + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + k8sptr "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + corev1 "k8s.io/api/core/v1" + + imagev1beta2 "github.com/fluxcd/image-reflector-controller/api/v1beta2" + fluxmeta "github.com/fluxcd/pkg/apis/meta" + rolloutv1alpha1 "github.com/kuberik/rollout-controller/api/v1alpha1" +) + +// Helper that builds a rollout with the supplied current+previous bake states. +func makeRolloutWithHistory(current *string, previous *string, deployedUnhealthy *bool, bakeStartTime *metav1.Time) *rolloutv1alpha1.Rollout { + r := &rolloutv1alpha1.Rollout{} + if current == nil { + return r + } + entry := rolloutv1alpha1.DeploymentHistoryEntry{ + BakeStatus: current, + DeployedWithUnhealthyHealthChecks: deployedUnhealthy, + BakeStartTime: bakeStartTime, + } + r.Status.History = []rolloutv1alpha1.DeploymentHistoryEntry{entry} + if previous != nil { + r.Status.History = append(r.Status.History, rolloutv1alpha1.DeploymentHistoryEntry{ + BakeStatus: previous, + }) + } + return r +} + +var _ = Describe("setBakeFailureDisabledCondition", func() { + var reconciler *RolloutReconciler + BeforeEach(func() { reconciler = &RolloutReconciler{} }) + + condition := func(r *rolloutv1alpha1.Rollout) *metav1.Condition { + return meta.FindStatusCondition(r.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) + } + + It("is False when there is no history", func() { + r := &rolloutv1alpha1.Rollout{} + reconciler.setBakeFailureDisabledCondition(r) + c := condition(r) + Expect(c).NotTo(BeNil()) + Expect(c.Status).To(Equal(metav1.ConditionFalse)) + Expect(c.Reason).To(Equal("Normal")) + }) + + It("is False when the only history entry is active and there is no predecessor", func() { + // First-ever deploy can fail normally — there's no previous entry to gate on. + deploying := rolloutv1alpha1.BakeStatusDeploying + r := makeRolloutWithHistory(&deploying, nil, nil, nil) + reconciler.setBakeFailureDisabledCondition(r) + Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) + }) + + It("is False when the active entry's predecessor succeeded", func() { + deploying := rolloutv1alpha1.BakeStatusDeploying + succeeded := rolloutv1alpha1.BakeStatusSucceeded + r := makeRolloutWithHistory(&deploying, &succeeded, nil, nil) + reconciler.setBakeFailureDisabledCondition(r) + Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) + }) + + It("is True with reason PreviousBakeFailed when the predecessor failed", func() { + deploying := rolloutv1alpha1.BakeStatusDeploying + failed := rolloutv1alpha1.BakeStatusFailed + r := makeRolloutWithHistory(&deploying, &failed, nil, nil) + reconciler.setBakeFailureDisabledCondition(r) + Expect(condition(r).Status).To(Equal(metav1.ConditionTrue)) + Expect(condition(r).Reason).To(Equal("PreviousBakeFailed")) + }) + + It("is True with reason PreviousBakeFailed when the predecessor was cancelled (any non-Succeeded)", func() { + inProgress := rolloutv1alpha1.BakeStatusInProgress + cancelled := rolloutv1alpha1.BakeStatusCancelled + r := makeRolloutWithHistory(&inProgress, &cancelled, nil, nil) + reconciler.setBakeFailureDisabledCondition(r) + Expect(condition(r).Status).To(Equal(metav1.ConditionTrue)) + Expect(condition(r).Reason).To(Equal("PreviousBakeFailed")) + }) + + It("is True with reason DeployedWithUnhealthyHealthChecks when the flag is set and bake hasn't started", func() { + deploying := rolloutv1alpha1.BakeStatusDeploying + succeeded := rolloutv1alpha1.BakeStatusSucceeded + r := makeRolloutWithHistory(&deploying, &succeeded, k8sptr.To(true), nil) + reconciler.setBakeFailureDisabledCondition(r) + Expect(condition(r).Status).To(Equal(metav1.ConditionTrue)) + Expect(condition(r).Reason).To(Equal("DeployedWithUnhealthyHealthChecks")) + }) + + It("is False once bake has started, even if DeployedWithUnhealthyHealthChecks=true", func() { + // The recovery-mode-during-incident window closes once BakeStartTime is set. + // Normal failure detection resumes for any errors during the bake. + inProgress := rolloutv1alpha1.BakeStatusInProgress + succeeded := rolloutv1alpha1.BakeStatusSucceeded + now := metav1.Now() + r := makeRolloutWithHistory(&inProgress, &succeeded, k8sptr.To(true), &now) + reconciler.setBakeFailureDisabledCondition(r) + Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) + }) + + It("is False once the bake has reached a terminal state (Succeeded)", func() { + succeeded := rolloutv1alpha1.BakeStatusSucceeded + failedPrev := rolloutv1alpha1.BakeStatusFailed + r := makeRolloutWithHistory(&succeeded, &failedPrev, nil, nil) + reconciler.setBakeFailureDisabledCondition(r) + // PreviousBakeFailed only applies while the current entry is active. + Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) + }) + + It("is False when the bake has reached a terminal state (Failed)", func() { + failed := rolloutv1alpha1.BakeStatusFailed + failedPrev := rolloutv1alpha1.BakeStatusFailed + r := makeRolloutWithHistory(&failed, &failedPrev, nil, nil) + reconciler.setBakeFailureDisabledCondition(r) + Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) + }) + + It("treats predecessor with nil BakeStatus as not-failed (failure allowed)", func() { + // Defensive: a predecessor without a BakeStatus shouldn't accidentally trip recovery mode. + deploying := rolloutv1alpha1.BakeStatusDeploying + r := &rolloutv1alpha1.Rollout{ + Status: rolloutv1alpha1.RolloutStatus{ + History: []rolloutv1alpha1.DeploymentHistoryEntry{ + {BakeStatus: &deploying}, + {BakeStatus: nil}, + }, + }, + } + reconciler.setBakeFailureDisabledCondition(r) + Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) + }) +}) + +var _ = Describe("setDeploymentBlockedCondition", func() { + var reconciler *RolloutReconciler + BeforeEach(func() { reconciler = &RolloutReconciler{} }) + + condition := func(r *rolloutv1alpha1.Rollout) *metav1.Condition { + return meta.FindStatusCondition(r.Status.Conditions, rolloutv1alpha1.RolloutDeploymentBlocked) + } + + It("is False with reason ManualDeployment when WantedVersion is set", func() { + r := &rolloutv1alpha1.Rollout{ + Spec: rolloutv1alpha1.RolloutSpec{WantedVersion: k8sptr.To("v1")}, + } + // healthChecksHealthy doesn't matter for manual deployments. + reconciler.setDeploymentBlockedCondition(r, false, "ignored") + c := condition(r) + Expect(c.Status).To(Equal(metav1.ConditionFalse)) + Expect(c.Reason).To(Equal("ManualDeployment")) + }) + + It("is False with reason ManualDeployment when force-deploy annotation is present", func() { + r := &rolloutv1alpha1.Rollout{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{"rollout.kuberik.com/force-deploy": "v2"}, + }, + } + reconciler.setDeploymentBlockedCondition(r, false, "ignored") + c := condition(r) + Expect(c.Status).To(Equal(metav1.ConditionFalse)) + Expect(c.Reason).To(Equal("ManualDeployment")) + }) + + It("is True with reason UnhealthyHealthChecks when health checks are unhealthy and not manual", func() { + r := &rolloutv1alpha1.Rollout{} + reconciler.setDeploymentBlockedCondition(r, false, "hc 'foo' is unhealthy") + c := condition(r) + Expect(c.Status).To(Equal(metav1.ConditionTrue)) + Expect(c.Reason).To(Equal("UnhealthyHealthChecks")) + Expect(c.Message).To(Equal("hc 'foo' is unhealthy")) + }) + + It("is False with reason HealthChecksHealthy when health checks pass and not manual", func() { + r := &rolloutv1alpha1.Rollout{} + reconciler.setDeploymentBlockedCondition(r, true, "") + c := condition(r) + Expect(c.Status).To(Equal(metav1.ConditionFalse)) + Expect(c.Reason).To(Equal("HealthChecksHealthy")) + }) +}) + +// Integration tests below exercise the full reconcile loop and handleBakeTime +// to verify that the recovery-mode flag survives a real reconcile. +var _ = Describe("Force-deploy during incident (recovery mode)", func() { + ctx := context.Background() + + var ( + namespace string + rollout *rolloutv1alpha1.Rollout + imagePolicy *imagev1beta2.ImagePolicy + healthCheck *rolloutv1alpha1.HealthCheck + fakeClock *FakeClock + reconciler *RolloutReconciler + key types.NamespacedName + ) + + BeforeEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "rec-ns-"}} + Expect(k8sClient.Create(ctx, ns)).To(Succeed()) + namespace = ns.Name + + imagePolicy = &imagev1beta2.ImagePolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "rec-ip", Namespace: namespace}, + Spec: imagev1beta2.ImagePolicySpec{ + ImageRepositoryRef: fluxmeta.NamespacedObjectReference{Name: "ignored"}, + Policy: imagev1beta2.ImagePolicyChoice{ + SemVer: &imagev1beta2.SemVerPolicy{Range: ">=0.0.0"}, + }, + }, + } + Expect(k8sClient.Create(ctx, imagePolicy)).To(Succeed()) + imagePolicy.Status.Conditions = []metav1.Condition{{ + Type: "Ready", Status: metav1.ConditionTrue, LastTransitionTime: metav1.Now(), Reason: "Ready", + }} + imagePolicy.Status.LatestRef = &imagev1beta2.ImageRef{Tag: "1.0.0"} + Expect(k8sClient.Status().Update(ctx, imagePolicy)).To(Succeed()) + + rollout = &rolloutv1alpha1.Rollout{ + ObjectMeta: metav1.ObjectMeta{Name: "rec-rollout", Namespace: namespace}, + Spec: rolloutv1alpha1.RolloutSpec{ + ReleasesImagePolicy: corev1.LocalObjectReference{Name: "rec-ip"}, + BakeTime: &metav1.Duration{Duration: 5 * time.Minute}, + HealthCheckSelector: &rolloutv1alpha1.HealthCheckSelectorConfig{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test-app"}}, + }, + }, + } + Expect(k8sClient.Create(ctx, rollout)).To(Succeed()) + + healthCheck = &rolloutv1alpha1.HealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rec-hc", Namespace: namespace, + Labels: map[string]string{"app": "test-app"}, + }, + } + Expect(k8sClient.Create(ctx, healthCheck)).To(Succeed()) + + fakeClock = NewFakeClock() + reconciler = &RolloutReconciler{Client: k8sClient, Scheme: k8sClient.Scheme(), Clock: fakeClock} + key = types.NamespacedName{Name: rollout.Name, Namespace: namespace} + }) + + AfterEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} + Expect(k8sClient.Delete(ctx, ns)).To(Succeed()) + }) + + It("sets DeployedWithUnhealthyHealthChecks=true on a new entry created via WantedVersion when health checks are unhealthy", func() { + By("Marking the health check unhealthy") + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + By("Setting WantedVersion to force a manual deploy that bypasses the health check block") + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + By("Verifying the new history entry carries the recovery-mode flag") + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(rollout.Status.History).To(HaveLen(1)) + entry := rollout.Status.History[0] + Expect(entry.Version.Tag).To(Equal("1.0.0")) + Expect(entry.DeployedWithUnhealthyHealthChecks).NotTo(BeNil()) + Expect(*entry.DeployedWithUnhealthyHealthChecks).To(BeTrue()) + }) + + It("does NOT set DeployedWithUnhealthyHealthChecks when health checks are healthy at deploy time", func() { + By("Health check is healthy") + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(rollout.Status.History).To(HaveLen(1)) + Expect(rollout.Status.History[0].DeployedWithUnhealthyHealthChecks).To(BeNil()) + }) + + It("does NOT set DeployedWithUnhealthyHealthChecks for automatic deployments (the block prevents the deploy)", func() { + // Automatic deploys never reach deployRelease while health checks are unhealthy + // because the controller returns early. The flag is for force-deploy / WantedVersion only. + By("Health check is unhealthy") + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + By("Verifying no deployment was created (automatic deploy was blocked)") + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(rollout.Status.History).To(BeEmpty()) + }) + + It("does NOT fail the rollout via deploy timeout while DeployedWithUnhealthyHealthChecks=true and bake hasn't started", func() { + By("Configure a short deploy timeout") + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.DeployTimeout = &metav1.Duration{Duration: 30 * time.Second} + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + By("Health check unhealthy at deploy time") + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + By("Initial reconcile creates the new history entry with the flag") + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(rollout.Status.History).To(HaveLen(1)) + Expect(rollout.Status.History[0].DeployedWithUnhealthyHealthChecks).NotTo(BeNil()) + Expect(*rollout.Status.History[0].DeployedWithUnhealthyHealthChecks).To(BeTrue()) + + By("Advance the clock past the deploy timeout") + fakeClock.Add(2 * time.Minute) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + By("Bake status should still be Deploying — the deploy timeout did not flip it to Failed") + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(rollout.Status.History).To(HaveLen(1)) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusDeploying)) + }) + + It("does NOT fail the rollout via health check error while DeployedWithUnhealthyHealthChecks=true and bake hasn't started", func() { + By("Health check unhealthy at deploy time") + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now().Add(-1 * time.Minute)} + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + By("Initial reconcile creates the new history entry with the flag") + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + By("Health check fires another error AFTER deploy time (post-deploy errorCutoff)") + fakeClock.Add(10 * time.Second) + healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now()} + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + By("Without the recovery flag, this would have flipped to Failed (previous was Succeeded). With the flag, it stays Deploying.") + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(rollout.Status.History).To(HaveLen(1)) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusDeploying)) + }) + + It("DOES fail the rollout once bake has started even when DeployedWithUnhealthyHealthChecks=true", func() { + By("Health check unhealthy at deploy time so the new entry gets the flag") + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(rollout.Status.History).To(HaveLen(1)) + Expect(*rollout.Status.History[0].DeployedWithUnhealthyHealthChecks).To(BeTrue()) + + By("Health check recovers and reports a fresh LastChangeTime so bake can start") + fakeClock.Add(5 * time.Second) + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + healthCheck.Status.LastChangeTime = &metav1.Time{Time: fakeClock.Now()} + healthCheck.Status.LastErrorTime = nil + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + By("Bake should have started") + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(rollout.Status.History[0].BakeStartTime).NotTo(BeNil()) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusInProgress)) + + By("Health check now reports an error AFTER bake started — recovery-mode no longer applies") + fakeClock.Add(10 * time.Second) + healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now()} + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusFailed), + "bake errors after BakeStartTime should fail the rollout regardless of the recovery flag") + }) +}) + +var _ = Describe("DeploymentBlocked condition with concurrent gate blocking", func() { + ctx := context.Background() + + var ( + namespace string + rollout *rolloutv1alpha1.Rollout + imagePolicy *imagev1beta2.ImagePolicy + healthCheck *rolloutv1alpha1.HealthCheck + fakeClock *FakeClock + reconciler *RolloutReconciler + key types.NamespacedName + ) + + BeforeEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "block-ns-"}} + Expect(k8sClient.Create(ctx, ns)).To(Succeed()) + namespace = ns.Name + + imagePolicy = &imagev1beta2.ImagePolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "block-ip", Namespace: namespace}, + Spec: imagev1beta2.ImagePolicySpec{ + ImageRepositoryRef: fluxmeta.NamespacedObjectReference{Name: "ignored"}, + Policy: imagev1beta2.ImagePolicyChoice{ + SemVer: &imagev1beta2.SemVerPolicy{Range: ">=0.0.0"}, + }, + }, + } + Expect(k8sClient.Create(ctx, imagePolicy)).To(Succeed()) + imagePolicy.Status.Conditions = []metav1.Condition{{ + Type: "Ready", Status: metav1.ConditionTrue, LastTransitionTime: metav1.Now(), Reason: "Ready", + }} + imagePolicy.Status.LatestRef = &imagev1beta2.ImageRef{Tag: "1.0.0"} + Expect(k8sClient.Status().Update(ctx, imagePolicy)).To(Succeed()) + + rollout = &rolloutv1alpha1.Rollout{ + ObjectMeta: metav1.ObjectMeta{Name: "block-rollout", Namespace: namespace}, + Spec: rolloutv1alpha1.RolloutSpec{ + ReleasesImagePolicy: corev1.LocalObjectReference{Name: "block-ip"}, + HealthCheckSelector: &rolloutv1alpha1.HealthCheckSelectorConfig{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test-app"}}, + }, + }, + } + Expect(k8sClient.Create(ctx, rollout)).To(Succeed()) + + healthCheck = &rolloutv1alpha1.HealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + Name: "block-hc", Namespace: namespace, + Labels: map[string]string{"app": "test-app"}, + }, + } + Expect(k8sClient.Create(ctx, healthCheck)).To(Succeed()) + + fakeClock = NewFakeClock() + reconciler = &RolloutReconciler{Client: k8sClient, Scheme: k8sClient.Scheme(), Clock: fakeClock} + key = types.NamespacedName{Name: rollout.Name, Namespace: namespace} + }) + + AfterEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} + Expect(k8sClient.Delete(ctx, ns)).To(Succeed()) + }) + + It("surfaces DeploymentBlocked=True even when a blocking gate would otherwise return early", func() { + // Regression: previously the gate early-return at the top of Reconcile prevented + // the DeploymentBlocked condition from being set, so the UI showed no signal that + // health checks were also unhealthy. + By("Creating a blocking gate") + blockingGate := &rolloutv1alpha1.RolloutGate{ + ObjectMeta: metav1.ObjectMeta{Name: "block-gate", Namespace: namespace}, + Spec: rolloutv1alpha1.RolloutGateSpec{ + RolloutRef: &corev1.LocalObjectReference{Name: rollout.Name}, + Passing: k8sptr.To(false), + }, + } + Expect(k8sClient.Create(ctx, blockingGate)).To(Succeed()) + + By("Marking the health check unhealthy") + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + healthCheck.Status.Message = k8sptr.To("simulated incident") + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + cond := meta.FindStatusCondition(rollout.Status.Conditions, rolloutv1alpha1.RolloutDeploymentBlocked) + Expect(cond).NotTo(BeNil(), "DeploymentBlocked condition should be persisted even with gate blocking") + Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + Expect(cond.Reason).To(Equal("UnhealthyHealthChecks")) + Expect(cond.Message).To(ContainSubstring("simulated incident")) + }) + + It("clears DeploymentBlocked once health checks recover, even while gates still block", func() { + By("Creating a blocking gate") + blockingGate := &rolloutv1alpha1.RolloutGate{ + ObjectMeta: metav1.ObjectMeta{Name: "block-gate", Namespace: namespace}, + Spec: rolloutv1alpha1.RolloutGateSpec{ + RolloutRef: &corev1.LocalObjectReference{Name: rollout.Name}, + Passing: k8sptr.To(false), + }, + } + Expect(k8sClient.Create(ctx, blockingGate)).To(Succeed()) + + By("Health check is healthy from the start") + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + cond := meta.FindStatusCondition(rollout.Status.Conditions, rolloutv1alpha1.RolloutDeploymentBlocked) + Expect(cond).NotTo(BeNil()) + Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + Expect(cond.Reason).To(Equal("HealthChecksHealthy")) + }) +}) diff --git a/internal/controller/rollout_controller.go b/internal/controller/rollout_controller.go index 1dde080..02c44fc 100644 --- a/internal/controller/rollout_controller.go +++ b/internal/controller/rollout_controller.go @@ -146,6 +146,34 @@ func (r *RolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } rollout.Status.GatedReleaseCandidates = gatedReleaseCandidates + // Evaluate health checks once. The result feeds both the DeploymentBlocked condition + // (set before any early returns) and the deployment-blocking guard further down. This + // avoids redundant API calls and ensures the condition is set even when gates also + // block (otherwise the gate early-return would leave the condition stale). + healthChecksHealthy := true + var healthCheckMessage string + if !r.hasManualDeployment(&rollout) { + var err error + healthChecksHealthy, healthCheckMessage, err = r.evaluateHealthChecks(ctx, req.Namespace, &rollout) + if err != nil { + log.Error(err, "Failed to evaluate health checks") + return ctrl.Result{}, err + } + } + + // Set BakeFailureDisabled condition based on current history state. + // True when an active deployment (Deploying/InProgress) cannot be failed because + // the previous entry was not successful — health check failures during recovery + // would otherwise create a cascading-failure loop. Recomputed every reconcile so + // the condition reflects current state regardless of health check timing. + r.setBakeFailureDisabledCondition(&rollout) + + // Set DeploymentBlocked condition. True when automatic deployment of new versions is + // blocked by unhealthy health checks. Set independently of gates so it surfaces even + // when gates also block (gates blocking is signalled separately via GatesPassing / + // schedule UI). + r.setDeploymentBlockedCondition(&rollout, healthChecksHealthy, healthCheckMessage) + // Update status once with both release candidates and gated release candidates if err := r.Client.Status().Update(ctx, &rollout); err != nil { log.Error(err, "Failed to update rollout status with release candidates and gated release candidates") @@ -231,26 +259,15 @@ func (r *RolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } } - // Evaluate health checks - block deployment if health checks are not healthy - // For manual deployments (WantedVersion or force-deploy), skip this check - if !r.hasManualDeployment(&rollout) { - healthChecksHealthy, healthCheckMessage, err := r.evaluateHealthChecks(ctx, req.Namespace, &rollout) - if err != nil { - log.Error(err, "Failed to evaluate health checks") - return ctrl.Result{}, err - } - if !healthChecksHealthy { - log.Info("Health checks are not healthy, blocking deployment", "message", healthCheckMessage) - // Update status before returning - if err := r.Client.Status().Update(ctx, &rollout); err != nil { - log.Error(err, "Failed to update rollout status") - } - // Emit event for health check blocking - if r.Recorder != nil { - r.Recorder.Event(&rollout, corev1.EventTypeWarning, "HealthCheckBlocking", healthCheckMessage) - } - return ctrl.Result{}, nil + // Block automatic deployment when health checks are unhealthy. Manual deployments + // (WantedVersion or force-deploy) bypass the block — that path is captured via the + // DeployedWithUnhealthyHealthChecks flag set when the new history entry is created. + if !r.hasManualDeployment(&rollout) && !healthChecksHealthy { + log.Info("Health checks are not healthy, blocking deployment", "message", healthCheckMessage) + if r.Recorder != nil { + r.Recorder.Event(&rollout, corev1.EventTypeWarning, "HealthCheckBlocking", healthCheckMessage) } + return ctrl.Result{}, nil } // Use filteredReleases instead of releases for wantedRelease selection @@ -1020,6 +1037,73 @@ func (r *RolloutReconciler) evaluateHealthChecks(ctx context.Context, namespace return true, "", nil } +// setBakeFailureDisabledCondition sets the BakeFailureDisabled condition based on current +// history state. The condition is True when the current entry is an active deployment AND +// either: +// - the previous history entry did not succeed (recovery from a failed bake), or +// - the current entry was force-deployed during an incident and bake has not started yet. +// +// In both cases handleBakeTime's recovery-mode guard prevents the rollout from being marked +// Failed even when health check errors are reported. +func (r *RolloutReconciler) setBakeFailureDisabledCondition(rollout *rolloutv1alpha1.Rollout) { + cond := metav1.Condition{ + Type: rolloutv1alpha1.RolloutBakeFailureDisabled, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.Now(), + Reason: "Normal", + Message: "", + } + + if len(rollout.Status.History) > 0 && rollout.Status.History[0].BakeStatus != nil { + current := &rollout.Status.History[0] + bakeStatus := *current.BakeStatus + active := bakeStatus == rolloutv1alpha1.BakeStatusDeploying || + bakeStatus == rolloutv1alpha1.BakeStatusInProgress + if active { + if len(rollout.Status.History) > 1 && + rollout.Status.History[1].BakeStatus != nil && + *rollout.Status.History[1].BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { + cond.Status = metav1.ConditionTrue + cond.Reason = "PreviousBakeFailed" + cond.Message = "Previous deployment failed. Health check failures will not fail this deployment, allowing recovery to complete." + } else if current.DeployedWithUnhealthyHealthChecks != nil && + *current.DeployedWithUnhealthyHealthChecks && + current.BakeStartTime == nil { + cond.Status = metav1.ConditionTrue + cond.Reason = "DeployedWithUnhealthyHealthChecks" + cond.Message = "Deployed during an active incident (health checks were unhealthy at deploy time). Health check failures will not fail this deployment until health checks recover and bake starts." + } + } + } + + meta.SetStatusCondition(&rollout.Status.Conditions, cond) +} + +// setDeploymentBlockedCondition sets the DeploymentBlocked condition based on whether +// automatic deployment of new versions is blocked by unhealthy health checks. Manual +// deployments (force-deploy / WantedVersion) bypass the block, so the condition reads +// False for them. This is independent of gates blocking (which is surfaced via the +// GatesPassing condition and schedule UI) so callers can see both blockers concurrently. +func (r *RolloutReconciler) setDeploymentBlockedCondition(rollout *rolloutv1alpha1.Rollout, healthChecksHealthy bool, healthCheckMessage string) { + cond := metav1.Condition{ + Type: rolloutv1alpha1.RolloutDeploymentBlocked, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.Now(), + Reason: "Allowed", + Message: "", + } + if r.hasManualDeployment(rollout) { + cond.Reason = "ManualDeployment" + } else if !healthChecksHealthy { + cond.Status = metav1.ConditionTrue + cond.Reason = "UnhealthyHealthChecks" + cond.Message = healthCheckMessage + } else { + cond.Reason = "HealthChecksHealthy" + } + meta.SetStatusCondition(&rollout.Status.Conditions, cond) +} + // hasManualDeployment checks if there's a manual deployment requested (WantedVersion or force deploy) func (r *RolloutReconciler) hasManualDeployment(rollout *rolloutv1alpha1.Rollout) bool { // Check for WantedVersion in spec @@ -1200,18 +1284,39 @@ func (r *RolloutReconciler) deployRelease(ctx context.Context, rollout *rolloutv } } + // If this is a manual deployment (force-deploy or WantedVersion) and any health check is + // currently unhealthy, mark the new entry so subsequent bake logic treats it as recovery + // mode while waiting for health checks to recover. + var deployedWithUnhealthy *bool + if r.hasManualDeployment(rollout) { + hcs, err := r.listHealthChecks(ctx, rollout.Namespace, rollout) + if err == nil { + for _, hc := range hcs { + if hc.Status.Status == rolloutv1alpha1.HealthStatusUnhealthy { + deployedWithUnhealthy = k8sptr.To(true) + log.Info("Manual deployment with unhealthy health checks; new entry will be in recovery mode", + "healthCheck", hc.Name, "namespace", hc.Namespace) + break + } + } + } else { + log.Error(err, "Failed to list health checks while recording recovery-mode flag; skipping flag") + } + } + nextID := r.getNextHistoryID(rollout) now := metav1.Time{Time: r.now()} rollout.Status.History = append([]rolloutv1alpha1.DeploymentHistoryEntry{{ - ID: k8sptr.To(nextID), - Version: versionInfo, - Timestamp: now, - Message: &deploymentMessage, - TriggeredBy: triggeredBy, - BakeStatus: bakeStatus, - BakeStatusMessage: bakeStatusMsg, - BakeStartTime: nil, // Will be set when healthchecks become healthy - BakeEndTime: nil, // Will be set when bake completes (succeeds, fails, or times out) + ID: k8sptr.To(nextID), + Version: versionInfo, + Timestamp: now, + Message: &deploymentMessage, + TriggeredBy: triggeredBy, + BakeStatus: bakeStatus, + BakeStatusMessage: bakeStatusMsg, + BakeStartTime: nil, // Will be set when healthchecks become healthy + BakeEndTime: nil, // Will be set when bake completes (succeeds, fails, or times out) + DeployedWithUnhealthyHealthChecks: deployedWithUnhealthy, }}, rollout.Status.History...) // Limit history size if specified versionHistoryLimit := int32(10) // default value @@ -1625,7 +1730,8 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string // Use errorCutoff (max of deployTime and lastRetryTimestamp) so a retry // gets a fresh timeout window instead of inheriting the original deadline. if now.After(errorCutoff.Add(rollout.Spec.DeployTimeout.Duration)) { - // Only fail if previous entry was successful (or doesn't exist) + // Only fail if previous entry was successful (or doesn't exist) AND the current + // entry was not force-deployed during an active incident. shouldFail := true if len(rollout.Status.History) > 1 { previousEntry := rollout.Status.History[1] @@ -1634,6 +1740,10 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string log.Info("Previous rollout entry was not successful, not failing current rollout despite deploy timeout") } } + if currentEntry.DeployedWithUnhealthyHealthChecks != nil && *currentEntry.DeployedWithUnhealthyHealthChecks && currentEntry.BakeStartTime == nil { + shouldFail = false + log.Info("Current entry was deployed with unhealthy health checks and bake has not started; not failing despite deploy timeout") + } if shouldFail { log.Info("Deploy timeout reached before bake could start, marking rollout as failed") @@ -1677,9 +1787,10 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string } } - // If health check error detected, mark as failed (unless previous entry was not successful) + // If health check error detected, mark as failed (unless previous entry was not successful, + // or the entry was force-deployed during an active incident and bake has not yet started) if healthCheckError { - // Only fail if previous entry was successful (or doesn't exist) + // Only fail if previous entry was successful (or doesn't exist). shouldFail := true if len(rollout.Status.History) > 1 { previousEntry := rollout.Status.History[1] @@ -1688,6 +1799,12 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string log.Info("Previous rollout entry was not successful, not failing current rollout despite health check error") } } + // Force-deploy during incident: while bake hasn't started, the same pre-existing + // failure pattern is expected to recur — don't treat it as a deployment failure. + if currentEntry.DeployedWithUnhealthyHealthChecks != nil && *currentEntry.DeployedWithUnhealthyHealthChecks && currentEntry.BakeStartTime == nil { + shouldFail = false + log.Info("Current entry was deployed with unhealthy health checks and bake has not started; not failing despite health check error") + } if shouldFail { log.Info("Health check error detected, marking rollout as failed") From 5026c5c02ba60d3da8788f65a6ff23101061acfc Mon Sep 17 00:00:00 2001 From: Luka Skugor Date: Wed, 29 Apr 2026 16:16:18 +0000 Subject: [PATCH 2/3] refactor: store recovery-mode state in the BakeFailureDisabled condition Drop the DeployedWithUnhealthyHealthChecks field on DeploymentHistoryEntry and use the BakeFailureDisabled condition as the single source of truth for whether the active deploy is in recovery mode. The condition is set once in deployRelease when the new history entry is created, with the reason determined at creation time (PreviousBakeFailed or DeployedWithUnhealthyHealthChecks). It persists unchanged for the entry's lifetime and is overwritten only when the next deploy starts. handleBakeTime's shouldFail logic now reads the condition directly (via meta.IsStatusConditionTrue) instead of recomputing from history state and an entry-level flag. Both deploy-timeout and HC-error paths become a one-liner. Behavioral note: the previous design lifted the force-deploy-during- incident exemption once BakeStartTime was set. With this refactor the exemption persists for the entire deploy lifetime, matching the "don't change until next rollout starts" semantics. Tests rewritten to reflect the new model (14 specs in recovery_mode_test.go); full suite 228/228 passing. Co-Authored-By: Claude Sonnet 4.6 --- api/v1alpha1/rollout_types.go | 8 - api/v1alpha1/zz_generated.deepcopy.go | 5 - config/crd/bases/kuberik.com_rollouts.yaml | 8 - internal/controller/recovery_mode_test.go | 295 +++++++-------------- internal/controller/rollout_controller.go | 151 ++++------- 5 files changed, 152 insertions(+), 315 deletions(-) diff --git a/api/v1alpha1/rollout_types.go b/api/v1alpha1/rollout_types.go index fb1cc26..ff9d946 100644 --- a/api/v1alpha1/rollout_types.go +++ b/api/v1alpha1/rollout_types.go @@ -339,14 +339,6 @@ type DeploymentHistoryEntry struct { // ignored as they reflect the pre-retry state. // +optional LastRetryTimestamp *metav1.Time `json:"lastRetryTimestamp,omitempty"` - - // DeployedWithUnhealthyHealthChecks records that this entry was created via a manual - // deployment (WantedVersion or force-deploy) while at least one health check was already - // unhealthy. The rollout treats this as recovery-mode-during-incident: while the bake has - // not yet started (BakeStartTime is nil), health check errors do not fail the rollout, - // since the user explicitly chose to deploy into an active incident. - // +optional - DeployedWithUnhealthyHealthChecks *bool `json:"deployedWithUnhealthyHealthChecks,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 00d53a5..6807cae 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -217,11 +217,6 @@ func (in *DeploymentHistoryEntry) DeepCopyInto(out *DeploymentHistoryEntry) { in, out := &in.LastRetryTimestamp, &out.LastRetryTimestamp *out = (*in).DeepCopy() } - if in.DeployedWithUnhealthyHealthChecks != nil { - in, out := &in.DeployedWithUnhealthyHealthChecks, &out.DeployedWithUnhealthyHealthChecks - *out = new(bool) - **out = **in - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeploymentHistoryEntry. diff --git a/config/crd/bases/kuberik.com_rollouts.yaml b/config/crd/bases/kuberik.com_rollouts.yaml index 914d883..47b9730 100644 --- a/config/crd/bases/kuberik.com_rollouts.yaml +++ b/config/crd/bases/kuberik.com_rollouts.yaml @@ -429,14 +429,6 @@ spec: BakeStatusMessage provides details about the bake state for this deployment This field contains human-readable information about why the bake status is what it is. type: string - deployedWithUnhealthyHealthChecks: - description: |- - DeployedWithUnhealthyHealthChecks records that this entry was created via a manual - deployment (WantedVersion or force-deploy) while at least one health check was already - unhealthy. The rollout treats this as recovery-mode-during-incident: while the bake has - not yet started (BakeStartTime is nil), health check errors do not fail the rollout, - since the user explicitly chose to deploy into an active incident. - type: boolean failedHealthChecks: description: |- FailedHealthChecks contains all health checks that failed during bake. diff --git a/internal/controller/recovery_mode_test.go b/internal/controller/recovery_mode_test.go index 194ac58..452aa3e 100644 --- a/internal/controller/recovery_mode_test.go +++ b/internal/controller/recovery_mode_test.go @@ -29,130 +29,6 @@ import ( rolloutv1alpha1 "github.com/kuberik/rollout-controller/api/v1alpha1" ) -// Helper that builds a rollout with the supplied current+previous bake states. -func makeRolloutWithHistory(current *string, previous *string, deployedUnhealthy *bool, bakeStartTime *metav1.Time) *rolloutv1alpha1.Rollout { - r := &rolloutv1alpha1.Rollout{} - if current == nil { - return r - } - entry := rolloutv1alpha1.DeploymentHistoryEntry{ - BakeStatus: current, - DeployedWithUnhealthyHealthChecks: deployedUnhealthy, - BakeStartTime: bakeStartTime, - } - r.Status.History = []rolloutv1alpha1.DeploymentHistoryEntry{entry} - if previous != nil { - r.Status.History = append(r.Status.History, rolloutv1alpha1.DeploymentHistoryEntry{ - BakeStatus: previous, - }) - } - return r -} - -var _ = Describe("setBakeFailureDisabledCondition", func() { - var reconciler *RolloutReconciler - BeforeEach(func() { reconciler = &RolloutReconciler{} }) - - condition := func(r *rolloutv1alpha1.Rollout) *metav1.Condition { - return meta.FindStatusCondition(r.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) - } - - It("is False when there is no history", func() { - r := &rolloutv1alpha1.Rollout{} - reconciler.setBakeFailureDisabledCondition(r) - c := condition(r) - Expect(c).NotTo(BeNil()) - Expect(c.Status).To(Equal(metav1.ConditionFalse)) - Expect(c.Reason).To(Equal("Normal")) - }) - - It("is False when the only history entry is active and there is no predecessor", func() { - // First-ever deploy can fail normally — there's no previous entry to gate on. - deploying := rolloutv1alpha1.BakeStatusDeploying - r := makeRolloutWithHistory(&deploying, nil, nil, nil) - reconciler.setBakeFailureDisabledCondition(r) - Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) - }) - - It("is False when the active entry's predecessor succeeded", func() { - deploying := rolloutv1alpha1.BakeStatusDeploying - succeeded := rolloutv1alpha1.BakeStatusSucceeded - r := makeRolloutWithHistory(&deploying, &succeeded, nil, nil) - reconciler.setBakeFailureDisabledCondition(r) - Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) - }) - - It("is True with reason PreviousBakeFailed when the predecessor failed", func() { - deploying := rolloutv1alpha1.BakeStatusDeploying - failed := rolloutv1alpha1.BakeStatusFailed - r := makeRolloutWithHistory(&deploying, &failed, nil, nil) - reconciler.setBakeFailureDisabledCondition(r) - Expect(condition(r).Status).To(Equal(metav1.ConditionTrue)) - Expect(condition(r).Reason).To(Equal("PreviousBakeFailed")) - }) - - It("is True with reason PreviousBakeFailed when the predecessor was cancelled (any non-Succeeded)", func() { - inProgress := rolloutv1alpha1.BakeStatusInProgress - cancelled := rolloutv1alpha1.BakeStatusCancelled - r := makeRolloutWithHistory(&inProgress, &cancelled, nil, nil) - reconciler.setBakeFailureDisabledCondition(r) - Expect(condition(r).Status).To(Equal(metav1.ConditionTrue)) - Expect(condition(r).Reason).To(Equal("PreviousBakeFailed")) - }) - - It("is True with reason DeployedWithUnhealthyHealthChecks when the flag is set and bake hasn't started", func() { - deploying := rolloutv1alpha1.BakeStatusDeploying - succeeded := rolloutv1alpha1.BakeStatusSucceeded - r := makeRolloutWithHistory(&deploying, &succeeded, k8sptr.To(true), nil) - reconciler.setBakeFailureDisabledCondition(r) - Expect(condition(r).Status).To(Equal(metav1.ConditionTrue)) - Expect(condition(r).Reason).To(Equal("DeployedWithUnhealthyHealthChecks")) - }) - - It("is False once bake has started, even if DeployedWithUnhealthyHealthChecks=true", func() { - // The recovery-mode-during-incident window closes once BakeStartTime is set. - // Normal failure detection resumes for any errors during the bake. - inProgress := rolloutv1alpha1.BakeStatusInProgress - succeeded := rolloutv1alpha1.BakeStatusSucceeded - now := metav1.Now() - r := makeRolloutWithHistory(&inProgress, &succeeded, k8sptr.To(true), &now) - reconciler.setBakeFailureDisabledCondition(r) - Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) - }) - - It("is False once the bake has reached a terminal state (Succeeded)", func() { - succeeded := rolloutv1alpha1.BakeStatusSucceeded - failedPrev := rolloutv1alpha1.BakeStatusFailed - r := makeRolloutWithHistory(&succeeded, &failedPrev, nil, nil) - reconciler.setBakeFailureDisabledCondition(r) - // PreviousBakeFailed only applies while the current entry is active. - Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) - }) - - It("is False when the bake has reached a terminal state (Failed)", func() { - failed := rolloutv1alpha1.BakeStatusFailed - failedPrev := rolloutv1alpha1.BakeStatusFailed - r := makeRolloutWithHistory(&failed, &failedPrev, nil, nil) - reconciler.setBakeFailureDisabledCondition(r) - Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) - }) - - It("treats predecessor with nil BakeStatus as not-failed (failure allowed)", func() { - // Defensive: a predecessor without a BakeStatus shouldn't accidentally trip recovery mode. - deploying := rolloutv1alpha1.BakeStatusDeploying - r := &rolloutv1alpha1.Rollout{ - Status: rolloutv1alpha1.RolloutStatus{ - History: []rolloutv1alpha1.DeploymentHistoryEntry{ - {BakeStatus: &deploying}, - {BakeStatus: nil}, - }, - }, - } - reconciler.setBakeFailureDisabledCondition(r) - Expect(condition(r).Status).To(Equal(metav1.ConditionFalse)) - }) -}) - var _ = Describe("setDeploymentBlockedCondition", func() { var reconciler *RolloutReconciler BeforeEach(func() { reconciler = &RolloutReconciler{} }) @@ -202,9 +78,10 @@ var _ = Describe("setDeploymentBlockedCondition", func() { }) }) -// Integration tests below exercise the full reconcile loop and handleBakeTime -// to verify that the recovery-mode flag survives a real reconcile. -var _ = Describe("Force-deploy during incident (recovery mode)", func() { +// Integration tests: BakeFailureDisabled condition is set once when a new history +// entry is created, persists for the entry's lifetime, and gates failure detection +// in handleBakeTime. The next deploy overwrites the condition. +var _ = Describe("BakeFailureDisabled condition (set at deploy time)", func() { ctx := context.Background() var ( @@ -268,12 +145,26 @@ var _ = Describe("Force-deploy during incident (recovery mode)", func() { Expect(k8sClient.Delete(ctx, ns)).To(Succeed()) }) - It("sets DeployedWithUnhealthyHealthChecks=true on a new entry created via WantedVersion when health checks are unhealthy", func() { - By("Marking the health check unhealthy") + bakeFailureCondition := func() *metav1.Condition { + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + return meta.FindStatusCondition(rollout.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) + } + + It("is False with reason Normal on the first deploy (no prior entry, healthy HCs)", func() { + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionFalse)) + Expect(bakeFailureCondition().Reason).To(Equal("Normal")) + }) + + It("is True with reason DeployedWithUnhealthyHealthChecks on a manual deploy with unhealthy HCs", func() { healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) - By("Setting WantedVersion to force a manual deploy that bypasses the health check block") Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) rollout.Spec.WantedVersion = k8sptr.To("1.0.0") Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) @@ -281,17 +172,12 @@ var _ = Describe("Force-deploy during incident (recovery mode)", func() { _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) - By("Verifying the new history entry carries the recovery-mode flag") - Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(rollout.Status.History).To(HaveLen(1)) - entry := rollout.Status.History[0] - Expect(entry.Version.Tag).To(Equal("1.0.0")) - Expect(entry.DeployedWithUnhealthyHealthChecks).NotTo(BeNil()) - Expect(*entry.DeployedWithUnhealthyHealthChecks).To(BeTrue()) + c := bakeFailureCondition() + Expect(c.Status).To(Equal(metav1.ConditionTrue)) + Expect(c.Reason).To(Equal("DeployedWithUnhealthyHealthChecks")) }) - It("does NOT set DeployedWithUnhealthyHealthChecks when health checks are healthy at deploy time", func() { - By("Health check is healthy") + It("is False on a manual deploy when health checks are healthy", func() { healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) @@ -302,87 +188,83 @@ var _ = Describe("Force-deploy during incident (recovery mode)", func() { _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) - Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(rollout.Status.History).To(HaveLen(1)) - Expect(rollout.Status.History[0].DeployedWithUnhealthyHealthChecks).To(BeNil()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionFalse)) }) - It("does NOT set DeployedWithUnhealthyHealthChecks for automatic deployments (the block prevents the deploy)", func() { - // Automatic deploys never reach deployRelease while health checks are unhealthy - // because the controller returns early. The flag is for force-deploy / WantedVersion only. - By("Health check is unhealthy") + It("does NOT fail the rollout via deploy timeout while BakeFailureDisabled=True", func() { + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.DeployTimeout = &metav1.Duration{Duration: 30 * time.Second} + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionTrue)) + + fakeClock.Add(2 * time.Minute) + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) - By("Verifying no deployment was created (automatic deploy was blocked)") Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(rollout.Status.History).To(BeEmpty()) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusDeploying), + "deploy timeout must not flip rollout to Failed while BakeFailureDisabled=True") }) - It("does NOT fail the rollout via deploy timeout while DeployedWithUnhealthyHealthChecks=true and bake hasn't started", func() { - By("Configure a short deploy timeout") + It("does NOT fail the rollout via health check error while BakeFailureDisabled=True", func() { + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - rollout.Spec.DeployTimeout = &metav1.Duration{Duration: 30 * time.Second} rollout.Spec.WantedVersion = k8sptr.To("1.0.0") Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) - By("Health check unhealthy at deploy time") - healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy - Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) - - By("Initial reconcile creates the new history entry with the flag") _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) - Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(rollout.Status.History).To(HaveLen(1)) - Expect(rollout.Status.History[0].DeployedWithUnhealthyHealthChecks).NotTo(BeNil()) - Expect(*rollout.Status.History[0].DeployedWithUnhealthyHealthChecks).To(BeTrue()) - By("Advance the clock past the deploy timeout") - fakeClock.Add(2 * time.Minute) + // HC fires a fresh error AFTER deploy time. + fakeClock.Add(10 * time.Second) + healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now()} + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) - By("Bake status should still be Deploying — the deploy timeout did not flip it to Failed") Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(rollout.Status.History).To(HaveLen(1)) - Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusDeploying)) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusDeploying), + "HC error after deploy must not fail the rollout while BakeFailureDisabled=True") }) - It("does NOT fail the rollout via health check error while DeployedWithUnhealthyHealthChecks=true and bake hasn't started", func() { - By("Health check unhealthy at deploy time") - healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy - healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now().Add(-1 * time.Minute)} + It("DOES fail the rollout via HC error when BakeFailureDisabled=False", func() { + // Healthy HC at deploy time → BakeFailureDisabled=False → normal failure detection. + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) rollout.Spec.WantedVersion = k8sptr.To("1.0.0") Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) - By("Initial reconcile creates the new history entry with the flag") _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionFalse)) - By("Health check fires another error AFTER deploy time (post-deploy errorCutoff)") + // HC fires an error after deploy time. fakeClock.Add(10 * time.Second) + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now()} Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) - By("Without the recovery flag, this would have flipped to Failed (previous was Succeeded). With the flag, it stays Deploying.") Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(rollout.Status.History).To(HaveLen(1)) - Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusDeploying)) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusFailed)) }) - It("DOES fail the rollout once bake has started even when DeployedWithUnhealthyHealthChecks=true", func() { - By("Health check unhealthy at deploy time so the new entry gets the flag") + It("persists the condition value across reconciles (does not recompute)", func() { + // Set up: deploy with unhealthy HC → True. healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) @@ -392,39 +274,58 @@ var _ = Describe("Force-deploy during incident (recovery mode)", func() { _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) - Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(rollout.Status.History).To(HaveLen(1)) - Expect(*rollout.Status.History[0].DeployedWithUnhealthyHealthChecks).To(BeTrue()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionTrue)) + firstTransition := bakeFailureCondition().LastTransitionTime + + // Several no-op reconciles — condition must not flap. + for i := 0; i < 3; i++ { + fakeClock.Add(5 * time.Second) + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionTrue)) + Expect(bakeFailureCondition().LastTransitionTime).To(Equal(firstTransition)) + } + }) - By("Health check recovers and reports a fresh LastChangeTime so bake can start") - fakeClock.Add(5 * time.Second) - healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy - healthCheck.Status.LastChangeTime = &metav1.Time{Time: fakeClock.Now()} - healthCheck.Status.LastErrorTime = nil + It("is overwritten when a new deploy starts (e.g. user pins a different version)", func() { + // First deploy: unhealthy HC at deploy time → True. + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) - _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionTrue)) + Expect(bakeFailureCondition().Reason).To(Equal("DeployedWithUnhealthyHealthChecks")) - By("Bake should have started") - Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(rollout.Status.History[0].BakeStartTime).NotTo(BeNil()) - Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusInProgress)) + // Make a new release available. + imagePolicy.Status.LatestRef = &imagev1beta2.ImageRef{Tag: "2.0.0"} + Expect(k8sClient.Status().Update(ctx, imagePolicy)).To(Succeed()) - By("Health check now reports an error AFTER bake started — recovery-mode no longer applies") - fakeClock.Add(10 * time.Second) - healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now()} + // Heal the HC so the next deploy is created with healthy HCs. + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + // Pin to the new version (manual deploy). + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("2.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) Expect(err).NotTo(HaveOccurred()) - Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) - Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusFailed), - "bake errors after BakeStartTime should fail the rollout regardless of the recovery flag") + // Previous entry was Cancelled (since it was in-progress and got replaced) → + // PreviousBakeFailed reason on the new entry. + c := bakeFailureCondition() + Expect(c.Status).To(Equal(metav1.ConditionTrue)) + Expect(c.Reason).To(Equal("PreviousBakeFailed")) }) }) +// DeploymentBlocked condition surfaces independently of gate blocking. var _ = Describe("DeploymentBlocked condition with concurrent gate blocking", func() { ctx := context.Background() @@ -492,7 +393,6 @@ var _ = Describe("DeploymentBlocked condition with concurrent gate blocking", fu // Regression: previously the gate early-return at the top of Reconcile prevented // the DeploymentBlocked condition from being set, so the UI showed no signal that // health checks were also unhealthy. - By("Creating a blocking gate") blockingGate := &rolloutv1alpha1.RolloutGate{ ObjectMeta: metav1.ObjectMeta{Name: "block-gate", Namespace: namespace}, Spec: rolloutv1alpha1.RolloutGateSpec{ @@ -502,7 +402,6 @@ var _ = Describe("DeploymentBlocked condition with concurrent gate blocking", fu } Expect(k8sClient.Create(ctx, blockingGate)).To(Succeed()) - By("Marking the health check unhealthy") healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy healthCheck.Status.Message = k8sptr.To("simulated incident") Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) @@ -519,7 +418,6 @@ var _ = Describe("DeploymentBlocked condition with concurrent gate blocking", fu }) It("clears DeploymentBlocked once health checks recover, even while gates still block", func() { - By("Creating a blocking gate") blockingGate := &rolloutv1alpha1.RolloutGate{ ObjectMeta: metav1.ObjectMeta{Name: "block-gate", Namespace: namespace}, Spec: rolloutv1alpha1.RolloutGateSpec{ @@ -529,7 +427,6 @@ var _ = Describe("DeploymentBlocked condition with concurrent gate blocking", fu } Expect(k8sClient.Create(ctx, blockingGate)).To(Succeed()) - By("Health check is healthy from the start") healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) diff --git a/internal/controller/rollout_controller.go b/internal/controller/rollout_controller.go index 02c44fc..b5bf01b 100644 --- a/internal/controller/rollout_controller.go +++ b/internal/controller/rollout_controller.go @@ -161,13 +161,6 @@ func (r *RolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } } - // Set BakeFailureDisabled condition based on current history state. - // True when an active deployment (Deploying/InProgress) cannot be failed because - // the previous entry was not successful — health check failures during recovery - // would otherwise create a cascading-failure loop. Recomputed every reconcile so - // the condition reflects current state regardless of health check timing. - r.setBakeFailureDisabledCondition(&rollout) - // Set DeploymentBlocked condition. True when automatic deployment of new versions is // blocked by unhealthy health checks. Set independently of gates so it surfaces even // when gates also block (gates blocking is signalled separately via GatesPassing / @@ -1037,48 +1030,6 @@ func (r *RolloutReconciler) evaluateHealthChecks(ctx context.Context, namespace return true, "", nil } -// setBakeFailureDisabledCondition sets the BakeFailureDisabled condition based on current -// history state. The condition is True when the current entry is an active deployment AND -// either: -// - the previous history entry did not succeed (recovery from a failed bake), or -// - the current entry was force-deployed during an incident and bake has not started yet. -// -// In both cases handleBakeTime's recovery-mode guard prevents the rollout from being marked -// Failed even when health check errors are reported. -func (r *RolloutReconciler) setBakeFailureDisabledCondition(rollout *rolloutv1alpha1.Rollout) { - cond := metav1.Condition{ - Type: rolloutv1alpha1.RolloutBakeFailureDisabled, - Status: metav1.ConditionFalse, - LastTransitionTime: metav1.Now(), - Reason: "Normal", - Message: "", - } - - if len(rollout.Status.History) > 0 && rollout.Status.History[0].BakeStatus != nil { - current := &rollout.Status.History[0] - bakeStatus := *current.BakeStatus - active := bakeStatus == rolloutv1alpha1.BakeStatusDeploying || - bakeStatus == rolloutv1alpha1.BakeStatusInProgress - if active { - if len(rollout.Status.History) > 1 && - rollout.Status.History[1].BakeStatus != nil && - *rollout.Status.History[1].BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { - cond.Status = metav1.ConditionTrue - cond.Reason = "PreviousBakeFailed" - cond.Message = "Previous deployment failed. Health check failures will not fail this deployment, allowing recovery to complete." - } else if current.DeployedWithUnhealthyHealthChecks != nil && - *current.DeployedWithUnhealthyHealthChecks && - current.BakeStartTime == nil { - cond.Status = metav1.ConditionTrue - cond.Reason = "DeployedWithUnhealthyHealthChecks" - cond.Message = "Deployed during an active incident (health checks were unhealthy at deploy time). Health check failures will not fail this deployment until health checks recover and bake starts." - } - } - } - - meta.SetStatusCondition(&rollout.Status.Conditions, cond) -} - // setDeploymentBlockedCondition sets the DeploymentBlocked condition based on whether // automatic deployment of new versions is blocked by unhealthy health checks. Manual // deployments (force-deploy / WantedVersion) bypass the block, so the condition reads @@ -1284,40 +1235,67 @@ func (r *RolloutReconciler) deployRelease(ctx context.Context, rollout *rolloutv } } - // If this is a manual deployment (force-deploy or WantedVersion) and any health check is - // currently unhealthy, mark the new entry so subsequent bake logic treats it as recovery - // mode while waiting for health checks to recover. - var deployedWithUnhealthy *bool - if r.hasManualDeployment(rollout) { + // Determine the BakeFailureDisabled state for this new deployment. It's set once here, + // when the entry is created, and not recomputed during the entry's lifetime. + // Two reasons trigger recovery mode: + // - PreviousBakeFailed: the prior history entry didn't succeed; failing the new entry + // would create a cascading-failure loop (e.g. mid-rollback the app is degraded). + // - DeployedWithUnhealthyHealthChecks: a manual deploy was issued while a health check + // was already Unhealthy, signalling deploy-during-incident; the same failure pattern + // is expected to recur and shouldn't fail the new deploy. + bakeFailureReason := "" + bakeFailureMessage := "" + if len(rollout.Status.History) > 0 && rollout.Status.History[0].BakeStatus != nil && + *rollout.Status.History[0].BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { + bakeFailureReason = "PreviousBakeFailed" + bakeFailureMessage = "Previous deployment failed. Health check failures will not fail this deployment, allowing recovery to complete." + } else if r.hasManualDeployment(rollout) { hcs, err := r.listHealthChecks(ctx, rollout.Namespace, rollout) if err == nil { for _, hc := range hcs { if hc.Status.Status == rolloutv1alpha1.HealthStatusUnhealthy { - deployedWithUnhealthy = k8sptr.To(true) + bakeFailureReason = "DeployedWithUnhealthyHealthChecks" + bakeFailureMessage = "Deployed during an active incident (health checks were unhealthy at deploy time). Health check failures will not fail this deployment." log.Info("Manual deployment with unhealthy health checks; new entry will be in recovery mode", "healthCheck", hc.Name, "namespace", hc.Namespace) break } } } else { - log.Error(err, "Failed to list health checks while recording recovery-mode flag; skipping flag") + log.Error(err, "Failed to list health checks while determining recovery-mode condition; defaulting to non-recovery") } } nextID := r.getNextHistoryID(rollout) now := metav1.Time{Time: r.now()} rollout.Status.History = append([]rolloutv1alpha1.DeploymentHistoryEntry{{ - ID: k8sptr.To(nextID), - Version: versionInfo, - Timestamp: now, - Message: &deploymentMessage, - TriggeredBy: triggeredBy, - BakeStatus: bakeStatus, - BakeStatusMessage: bakeStatusMsg, - BakeStartTime: nil, // Will be set when healthchecks become healthy - BakeEndTime: nil, // Will be set when bake completes (succeeds, fails, or times out) - DeployedWithUnhealthyHealthChecks: deployedWithUnhealthy, + ID: k8sptr.To(nextID), + Version: versionInfo, + Timestamp: now, + Message: &deploymentMessage, + TriggeredBy: triggeredBy, + BakeStatus: bakeStatus, + BakeStatusMessage: bakeStatusMsg, + BakeStartTime: nil, // Will be set when healthchecks become healthy + BakeEndTime: nil, // Will be set when bake completes (succeeds, fails, or times out) }}, rollout.Status.History...) + + // Persist the BakeFailureDisabled condition alongside the new history entry. Set once + // here and not changed during the entry's lifetime — overwritten only when the next + // new entry is created via a future deployRelease call. + bakeFailureCond := metav1.Condition{ + Type: rolloutv1alpha1.RolloutBakeFailureDisabled, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.Now(), + Reason: "Normal", + Message: "", + } + if bakeFailureReason != "" { + bakeFailureCond.Status = metav1.ConditionTrue + bakeFailureCond.Reason = bakeFailureReason + bakeFailureCond.Message = bakeFailureMessage + } + meta.SetStatusCondition(&rollout.Status.Conditions, bakeFailureCond) // Limit history size if specified versionHistoryLimit := int32(10) // default value if rollout.Spec.VersionHistoryLimit != nil { @@ -1730,19 +1708,12 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string // Use errorCutoff (max of deployTime and lastRetryTimestamp) so a retry // gets a fresh timeout window instead of inheriting the original deadline. if now.After(errorCutoff.Add(rollout.Spec.DeployTimeout.Duration)) { - // Only fail if previous entry was successful (or doesn't exist) AND the current - // entry was not force-deployed during an active incident. - shouldFail := true - if len(rollout.Status.History) > 1 { - previousEntry := rollout.Status.History[1] - if previousEntry.BakeStatus != nil && *previousEntry.BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { - shouldFail = false - log.Info("Previous rollout entry was not successful, not failing current rollout despite deploy timeout") - } - } - if currentEntry.DeployedWithUnhealthyHealthChecks != nil && *currentEntry.DeployedWithUnhealthyHealthChecks && currentEntry.BakeStartTime == nil { - shouldFail = false - log.Info("Current entry was deployed with unhealthy health checks and bake has not started; not failing despite deploy timeout") + // The BakeFailureDisabled condition was determined when this entry was created + // and persists until the next deploy. If it's True, the rollout is in recovery + // mode and cannot be failed by deploy-timeout for this deployment. + shouldFail := !meta.IsStatusConditionTrue(rollout.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) + if !shouldFail { + log.Info("Rollout is in recovery mode (BakeFailureDisabled=True); not failing despite deploy timeout") } if shouldFail { @@ -1787,23 +1758,13 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string } } - // If health check error detected, mark as failed (unless previous entry was not successful, - // or the entry was force-deployed during an active incident and bake has not yet started) + // If health check error detected, mark as failed (unless the rollout is in recovery + // mode — BakeFailureDisabled was set to True when this entry was created and persists + // until the next deploy) if healthCheckError { - // Only fail if previous entry was successful (or doesn't exist). - shouldFail := true - if len(rollout.Status.History) > 1 { - previousEntry := rollout.Status.History[1] - if previousEntry.BakeStatus != nil && *previousEntry.BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { - shouldFail = false - log.Info("Previous rollout entry was not successful, not failing current rollout despite health check error") - } - } - // Force-deploy during incident: while bake hasn't started, the same pre-existing - // failure pattern is expected to recur — don't treat it as a deployment failure. - if currentEntry.DeployedWithUnhealthyHealthChecks != nil && *currentEntry.DeployedWithUnhealthyHealthChecks && currentEntry.BakeStartTime == nil { - shouldFail = false - log.Info("Current entry was deployed with unhealthy health checks and bake has not started; not failing despite health check error") + shouldFail := !meta.IsStatusConditionTrue(rollout.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) + if !shouldFail { + log.Info("Rollout is in recovery mode (BakeFailureDisabled=True); not failing despite health check error") } if shouldFail { From ee131d0f74be4922f7882b85059ab8daf7f05417 Mon Sep 17 00:00:00 2001 From: Luka Skugor Date: Wed, 29 Apr 2026 16:55:43 +0000 Subject: [PATCH 3/3] refactor: extract setBakeFailureDisabledForNewDeploy helper, trim comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit deployRelease no longer inlines the recovery-mode determination — moves ~50 lines of bookkeeping into a focused helper that mirrors setDeploymentBlockedCondition. Trimmed verbose explanatory comments in the main reconcile loop and handleBakeTime; the why is captured in the helpers' godoc. Behavior unchanged; 228/228 tests still passing. Co-Authored-By: Claude Sonnet 4.6 --- internal/controller/rollout_controller.go | 122 +++++++++------------- 1 file changed, 52 insertions(+), 70 deletions(-) diff --git a/internal/controller/rollout_controller.go b/internal/controller/rollout_controller.go index b5bf01b..22b64b0 100644 --- a/internal/controller/rollout_controller.go +++ b/internal/controller/rollout_controller.go @@ -146,10 +146,8 @@ func (r *RolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } rollout.Status.GatedReleaseCandidates = gatedReleaseCandidates - // Evaluate health checks once. The result feeds both the DeploymentBlocked condition - // (set before any early returns) and the deployment-blocking guard further down. This - // avoids redundant API calls and ensures the condition is set even when gates also - // block (otherwise the gate early-return would leave the condition stale). + // Evaluate health checks once and set DeploymentBlocked. Done before the gate + // early-return so the condition surfaces even when gates also block. healthChecksHealthy := true var healthCheckMessage string if !r.hasManualDeployment(&rollout) { @@ -160,11 +158,6 @@ func (r *RolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, err } } - - // Set DeploymentBlocked condition. True when automatic deployment of new versions is - // blocked by unhealthy health checks. Set independently of gates so it surfaces even - // when gates also block (gates blocking is signalled separately via GatesPassing / - // schedule UI). r.setDeploymentBlockedCondition(&rollout, healthChecksHealthy, healthCheckMessage) // Update status once with both release candidates and gated release candidates @@ -252,9 +245,8 @@ func (r *RolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } } - // Block automatic deployment when health checks are unhealthy. Manual deployments - // (WantedVersion or force-deploy) bypass the block — that path is captured via the - // DeployedWithUnhealthyHealthChecks flag set when the new history entry is created. + // Block automatic deploys on unhealthy health checks. Manual deploys bypass; the + // recovery state for those is captured by setBakeFailureDisabledForNewDeploy. if !r.hasManualDeployment(&rollout) && !healthChecksHealthy { log.Info("Health checks are not healthy, blocking deployment", "message", healthCheckMessage) if r.Recorder != nil { @@ -1030,11 +1022,48 @@ func (r *RolloutReconciler) evaluateHealthChecks(ctx context.Context, namespace return true, "", nil } -// setDeploymentBlockedCondition sets the DeploymentBlocked condition based on whether -// automatic deployment of new versions is blocked by unhealthy health checks. Manual -// deployments (force-deploy / WantedVersion) bypass the block, so the condition reads -// False for them. This is independent of gates blocking (which is surfaced via the -// GatesPassing condition and schedule UI) so callers can see both blockers concurrently. +// setBakeFailureDisabledForNewDeploy sets the BakeFailureDisabled condition based on +// state at the moment a new deployment starts. The condition then persists for the +// entry's lifetime (overwritten only when the next deploy starts). +// +// Recovery-mode reasons: +// - PreviousBakeFailed: the prior entry didn't succeed (e.g. mid-rollback). +// - DeployedWithUnhealthyHealthChecks: manual deploy issued while any health check +// was already Unhealthy (deploy-during-incident). +func (r *RolloutReconciler) setBakeFailureDisabledForNewDeploy(ctx context.Context, rollout *rolloutv1alpha1.Rollout, log logr.Logger) { + cond := metav1.Condition{ + Type: rolloutv1alpha1.RolloutBakeFailureDisabled, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.Now(), + Reason: "Normal", + } + + if len(rollout.Status.History) > 0 && rollout.Status.History[0].BakeStatus != nil && + *rollout.Status.History[0].BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { + cond.Status = metav1.ConditionTrue + cond.Reason = "PreviousBakeFailed" + cond.Message = "Previous deployment failed. Health check failures will not fail this deployment." + } else if r.hasManualDeployment(rollout) { + hcs, err := r.listHealthChecks(ctx, rollout.Namespace, rollout) + if err != nil { + log.Error(err, "Failed to list health checks for recovery-mode evaluation") + } else { + for _, hc := range hcs { + if hc.Status.Status == rolloutv1alpha1.HealthStatusUnhealthy { + cond.Status = metav1.ConditionTrue + cond.Reason = "DeployedWithUnhealthyHealthChecks" + cond.Message = "Deployed during an active incident. Health check failures will not fail this deployment." + break + } + } + } + } + + meta.SetStatusCondition(&rollout.Status.Conditions, cond) +} + +// setDeploymentBlockedCondition sets DeploymentBlocked based on health-check state. +// Independent of gate blocking so both blockers can surface concurrently. func (r *RolloutReconciler) setDeploymentBlockedCondition(rollout *rolloutv1alpha1.Rollout, healthChecksHealthy bool, healthCheckMessage string) { cond := metav1.Condition{ Type: rolloutv1alpha1.RolloutDeploymentBlocked, @@ -1235,36 +1264,9 @@ func (r *RolloutReconciler) deployRelease(ctx context.Context, rollout *rolloutv } } - // Determine the BakeFailureDisabled state for this new deployment. It's set once here, - // when the entry is created, and not recomputed during the entry's lifetime. - // Two reasons trigger recovery mode: - // - PreviousBakeFailed: the prior history entry didn't succeed; failing the new entry - // would create a cascading-failure loop (e.g. mid-rollback the app is degraded). - // - DeployedWithUnhealthyHealthChecks: a manual deploy was issued while a health check - // was already Unhealthy, signalling deploy-during-incident; the same failure pattern - // is expected to recur and shouldn't fail the new deploy. - bakeFailureReason := "" - bakeFailureMessage := "" - if len(rollout.Status.History) > 0 && rollout.Status.History[0].BakeStatus != nil && - *rollout.Status.History[0].BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { - bakeFailureReason = "PreviousBakeFailed" - bakeFailureMessage = "Previous deployment failed. Health check failures will not fail this deployment, allowing recovery to complete." - } else if r.hasManualDeployment(rollout) { - hcs, err := r.listHealthChecks(ctx, rollout.Namespace, rollout) - if err == nil { - for _, hc := range hcs { - if hc.Status.Status == rolloutv1alpha1.HealthStatusUnhealthy { - bakeFailureReason = "DeployedWithUnhealthyHealthChecks" - bakeFailureMessage = "Deployed during an active incident (health checks were unhealthy at deploy time). Health check failures will not fail this deployment." - log.Info("Manual deployment with unhealthy health checks; new entry will be in recovery mode", - "healthCheck", hc.Name, "namespace", hc.Namespace) - break - } - } - } else { - log.Error(err, "Failed to list health checks while determining recovery-mode condition; defaulting to non-recovery") - } - } + // Set BakeFailureDisabled before creating the new entry so it can read the current + // (about-to-become-previous) entry's BakeStatus. Persists for the entry's lifetime. + r.setBakeFailureDisabledForNewDeploy(ctx, rollout, log) nextID := r.getNextHistoryID(rollout) now := metav1.Time{Time: r.now()} @@ -1279,23 +1281,6 @@ func (r *RolloutReconciler) deployRelease(ctx context.Context, rollout *rolloutv BakeStartTime: nil, // Will be set when healthchecks become healthy BakeEndTime: nil, // Will be set when bake completes (succeeds, fails, or times out) }}, rollout.Status.History...) - - // Persist the BakeFailureDisabled condition alongside the new history entry. Set once - // here and not changed during the entry's lifetime — overwritten only when the next - // new entry is created via a future deployRelease call. - bakeFailureCond := metav1.Condition{ - Type: rolloutv1alpha1.RolloutBakeFailureDisabled, - Status: metav1.ConditionFalse, - LastTransitionTime: metav1.Now(), - Reason: "Normal", - Message: "", - } - if bakeFailureReason != "" { - bakeFailureCond.Status = metav1.ConditionTrue - bakeFailureCond.Reason = bakeFailureReason - bakeFailureCond.Message = bakeFailureMessage - } - meta.SetStatusCondition(&rollout.Status.Conditions, bakeFailureCond) // Limit history size if specified versionHistoryLimit := int32(10) // default value if rollout.Spec.VersionHistoryLimit != nil { @@ -1708,9 +1693,7 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string // Use errorCutoff (max of deployTime and lastRetryTimestamp) so a retry // gets a fresh timeout window instead of inheriting the original deadline. if now.After(errorCutoff.Add(rollout.Spec.DeployTimeout.Duration)) { - // The BakeFailureDisabled condition was determined when this entry was created - // and persists until the next deploy. If it's True, the rollout is in recovery - // mode and cannot be failed by deploy-timeout for this deployment. + // Recovery mode (BakeFailureDisabled=True, set at deploy start) suppresses failure. shouldFail := !meta.IsStatusConditionTrue(rollout.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) if !shouldFail { log.Info("Rollout is in recovery mode (BakeFailureDisabled=True); not failing despite deploy timeout") @@ -1758,9 +1741,8 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string } } - // If health check error detected, mark as failed (unless the rollout is in recovery - // mode — BakeFailureDisabled was set to True when this entry was created and persists - // until the next deploy) + // If a health check error is detected, mark as failed unless the rollout is in + // recovery mode (BakeFailureDisabled=True, set at deploy start). if healthCheckError { shouldFail := !meta.IsStatusConditionTrue(rollout.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) if !shouldFail {