diff --git a/api/v1alpha1/rollout_types.go b/api/v1alpha1/rollout_types.go index d1db042..ff9d946 100644 --- a/api/v1alpha1/rollout_types.go +++ b/api/v1alpha1/rollout_types.go @@ -382,6 +382,11 @@ const ( RolloutBakeTimeRetrying = "BakeTimeRetrying" // RolloutInvalidBakeTimeConfiguration means the bake time configuration is invalid. RolloutInvalidBakeTimeConfiguration = "InvalidBakeTimeConfiguration" + // RolloutDeploymentBlocked means deployment is blocked because health checks are unhealthy. + RolloutDeploymentBlocked = "DeploymentBlocked" + // RolloutBakeFailureDisabled means health check failures will not fail the current deployment. + // This occurs when the previous deployment also failed, allowing recovery without cascading failures. + RolloutBakeFailureDisabled = "BakeFailureDisabled" ) const ( diff --git a/internal/controller/recovery_mode_test.go b/internal/controller/recovery_mode_test.go new file mode 100644 index 0000000..452aa3e --- /dev/null +++ b/internal/controller/recovery_mode_test.go @@ -0,0 +1,442 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package controller + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	k8sptr "k8s.io/utils/ptr"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	corev1 "k8s.io/api/core/v1"
+
+	imagev1beta2 "github.com/fluxcd/image-reflector-controller/api/v1beta2"
+	fluxmeta "github.com/fluxcd/pkg/apis/meta"
+	rolloutv1alpha1 "github.com/kuberik/rollout-controller/api/v1alpha1"
+)
+
+var _ = Describe("setDeploymentBlockedCondition", func() {
+	var reconciler *RolloutReconciler
+	BeforeEach(func() { reconciler = &RolloutReconciler{} })
+	// condition returns the DeploymentBlocked condition, failing the spec if it is absent.
+	condition := func(r *rolloutv1alpha1.Rollout) *metav1.Condition {
+		c := meta.FindStatusCondition(r.Status.Conditions, rolloutv1alpha1.RolloutDeploymentBlocked)
+		Expect(c).NotTo(BeNil(), "DeploymentBlocked condition should be set")
+		return c
+	}
+
+	It("is False with reason ManualDeployment when WantedVersion is set", func() {
+		r := &rolloutv1alpha1.Rollout{Spec: rolloutv1alpha1.RolloutSpec{WantedVersion: k8sptr.To("v1")}}
+		// healthChecksHealthy doesn't matter for manual deployments.
+		reconciler.setDeploymentBlockedCondition(r, false, "ignored")
+		c := condition(r)
+		Expect(c.Status).To(Equal(metav1.ConditionFalse))
+		Expect(c.Reason).To(Equal("ManualDeployment"))
+	})
+
+	It("is False with reason ManualDeployment when force-deploy annotation is present", func() {
+		r := &rolloutv1alpha1.Rollout{
+			ObjectMeta: metav1.ObjectMeta{
+				Annotations: map[string]string{"rollout.kuberik.com/force-deploy": "v2"},
+			},
+		}
+		reconciler.setDeploymentBlockedCondition(r, false, "ignored")
+		c := condition(r)
+		Expect(c.Status).To(Equal(metav1.ConditionFalse))
+		Expect(c.Reason).To(Equal("ManualDeployment"))
+	})
+
+	It("is True with reason UnhealthyHealthChecks when health checks are unhealthy and not manual", func() {
+		r := &rolloutv1alpha1.Rollout{}
+		reconciler.setDeploymentBlockedCondition(r, false, "hc 'foo' is unhealthy")
+		c := condition(r)
+		Expect(c.Status).To(Equal(metav1.ConditionTrue))
+		Expect(c.Reason).To(Equal("UnhealthyHealthChecks"))
+		Expect(c.Message).To(Equal("hc 'foo' is unhealthy"))
+	})
+
+	It("is False with reason HealthChecksHealthy when health checks pass and not manual", func() {
+		r := &rolloutv1alpha1.Rollout{}
+		reconciler.setDeploymentBlockedCondition(r, true, "")
+		c := condition(r)
+		Expect(c.Status).To(Equal(metav1.ConditionFalse))
+		Expect(c.Reason).To(Equal("HealthChecksHealthy"))
+	})
+})
+
+// Integration tests: BakeFailureDisabled condition is set once when a new history
+// entry is created, persists for the entry's lifetime, and gates failure detection
+// in handleBakeTime. The next deploy overwrites the condition.
+var _ = Describe("BakeFailureDisabled condition (set at deploy time)", func() { + ctx := context.Background() + + var ( + namespace string + rollout *rolloutv1alpha1.Rollout + imagePolicy *imagev1beta2.ImagePolicy + healthCheck *rolloutv1alpha1.HealthCheck + fakeClock *FakeClock + reconciler *RolloutReconciler + key types.NamespacedName + ) + + BeforeEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "rec-ns-"}} + Expect(k8sClient.Create(ctx, ns)).To(Succeed()) + namespace = ns.Name + + imagePolicy = &imagev1beta2.ImagePolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "rec-ip", Namespace: namespace}, + Spec: imagev1beta2.ImagePolicySpec{ + ImageRepositoryRef: fluxmeta.NamespacedObjectReference{Name: "ignored"}, + Policy: imagev1beta2.ImagePolicyChoice{ + SemVer: &imagev1beta2.SemVerPolicy{Range: ">=0.0.0"}, + }, + }, + } + Expect(k8sClient.Create(ctx, imagePolicy)).To(Succeed()) + imagePolicy.Status.Conditions = []metav1.Condition{{ + Type: "Ready", Status: metav1.ConditionTrue, LastTransitionTime: metav1.Now(), Reason: "Ready", + }} + imagePolicy.Status.LatestRef = &imagev1beta2.ImageRef{Tag: "1.0.0"} + Expect(k8sClient.Status().Update(ctx, imagePolicy)).To(Succeed()) + + rollout = &rolloutv1alpha1.Rollout{ + ObjectMeta: metav1.ObjectMeta{Name: "rec-rollout", Namespace: namespace}, + Spec: rolloutv1alpha1.RolloutSpec{ + ReleasesImagePolicy: corev1.LocalObjectReference{Name: "rec-ip"}, + BakeTime: &metav1.Duration{Duration: 5 * time.Minute}, + HealthCheckSelector: &rolloutv1alpha1.HealthCheckSelectorConfig{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test-app"}}, + }, + }, + } + Expect(k8sClient.Create(ctx, rollout)).To(Succeed()) + + healthCheck = &rolloutv1alpha1.HealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rec-hc", Namespace: namespace, + Labels: map[string]string{"app": "test-app"}, + }, + } + Expect(k8sClient.Create(ctx, healthCheck)).To(Succeed()) + + fakeClock = NewFakeClock() + reconciler 
= &RolloutReconciler{Client: k8sClient, Scheme: k8sClient.Scheme(), Clock: fakeClock} + key = types.NamespacedName{Name: rollout.Name, Namespace: namespace} + }) + + AfterEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} + Expect(k8sClient.Delete(ctx, ns)).To(Succeed()) + }) + + bakeFailureCondition := func() *metav1.Condition { + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + return meta.FindStatusCondition(rollout.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) + } + + It("is False with reason Normal on the first deploy (no prior entry, healthy HCs)", func() { + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionFalse)) + Expect(bakeFailureCondition().Reason).To(Equal("Normal")) + }) + + It("is True with reason DeployedWithUnhealthyHealthChecks on a manual deploy with unhealthy HCs", func() { + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + c := bakeFailureCondition() + Expect(c.Status).To(Equal(metav1.ConditionTrue)) + Expect(c.Reason).To(Equal("DeployedWithUnhealthyHealthChecks")) + }) + + It("is False on a manual deploy when health checks are healthy", func() { + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + 
rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionFalse)) + }) + + It("does NOT fail the rollout via deploy timeout while BakeFailureDisabled=True", func() { + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.DeployTimeout = &metav1.Duration{Duration: 30 * time.Second} + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionTrue)) + + fakeClock.Add(2 * time.Minute) + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusDeploying), + "deploy timeout must not flip rollout to Failed while BakeFailureDisabled=True") + }) + + It("does NOT fail the rollout via health check error while BakeFailureDisabled=True", func() { + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + // HC fires a fresh error AFTER deploy time. 
+ fakeClock.Add(10 * time.Second) + healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now()} + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusDeploying), + "HC error after deploy must not fail the rollout while BakeFailureDisabled=True") + }) + + It("DOES fail the rollout via HC error when BakeFailureDisabled=False", func() { + // Healthy HC at deploy time → BakeFailureDisabled=False → normal failure detection. + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionFalse)) + + // HC fires an error after deploy time. + fakeClock.Add(10 * time.Second) + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + healthCheck.Status.LastErrorTime = &metav1.Time{Time: fakeClock.Now()} + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + Expect(*rollout.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusFailed)) + }) + + It("persists the condition value across reconciles (does not recompute)", func() { + // Set up: deploy with unhealthy HC → True. 
+ healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionTrue)) + firstTransition := bakeFailureCondition().LastTransitionTime + + // Several no-op reconciles — condition must not flap. + for i := 0; i < 3; i++ { + fakeClock.Add(5 * time.Second) + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionTrue)) + Expect(bakeFailureCondition().LastTransitionTime).To(Equal(firstTransition)) + } + }) + + It("is overwritten when a new deploy starts (e.g. user pins a different version)", func() { + // First deploy: unhealthy HC at deploy time → True. + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("1.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + Expect(bakeFailureCondition().Status).To(Equal(metav1.ConditionTrue)) + Expect(bakeFailureCondition().Reason).To(Equal("DeployedWithUnhealthyHealthChecks")) + + // Make a new release available. + imagePolicy.Status.LatestRef = &imagev1beta2.ImageRef{Tag: "2.0.0"} + Expect(k8sClient.Status().Update(ctx, imagePolicy)).To(Succeed()) + + // Heal the HC so the next deploy is created with healthy HCs. 
+ healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + // Pin to the new version (manual deploy). + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.WantedVersion = k8sptr.To("2.0.0") + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + // Previous entry was Cancelled (since it was in-progress and got replaced) → + // PreviousBakeFailed reason on the new entry. + c := bakeFailureCondition() + Expect(c.Status).To(Equal(metav1.ConditionTrue)) + Expect(c.Reason).To(Equal("PreviousBakeFailed")) + }) +}) + +// DeploymentBlocked condition surfaces independently of gate blocking. +var _ = Describe("DeploymentBlocked condition with concurrent gate blocking", func() { + ctx := context.Background() + + var ( + namespace string + rollout *rolloutv1alpha1.Rollout + imagePolicy *imagev1beta2.ImagePolicy + healthCheck *rolloutv1alpha1.HealthCheck + fakeClock *FakeClock + reconciler *RolloutReconciler + key types.NamespacedName + ) + + BeforeEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "block-ns-"}} + Expect(k8sClient.Create(ctx, ns)).To(Succeed()) + namespace = ns.Name + + imagePolicy = &imagev1beta2.ImagePolicy{ + ObjectMeta: metav1.ObjectMeta{Name: "block-ip", Namespace: namespace}, + Spec: imagev1beta2.ImagePolicySpec{ + ImageRepositoryRef: fluxmeta.NamespacedObjectReference{Name: "ignored"}, + Policy: imagev1beta2.ImagePolicyChoice{ + SemVer: &imagev1beta2.SemVerPolicy{Range: ">=0.0.0"}, + }, + }, + } + Expect(k8sClient.Create(ctx, imagePolicy)).To(Succeed()) + imagePolicy.Status.Conditions = []metav1.Condition{{ + Type: "Ready", Status: metav1.ConditionTrue, LastTransitionTime: metav1.Now(), Reason: "Ready", + }} + imagePolicy.Status.LatestRef = &imagev1beta2.ImageRef{Tag: "1.0.0"} + 
Expect(k8sClient.Status().Update(ctx, imagePolicy)).To(Succeed()) + + rollout = &rolloutv1alpha1.Rollout{ + ObjectMeta: metav1.ObjectMeta{Name: "block-rollout", Namespace: namespace}, + Spec: rolloutv1alpha1.RolloutSpec{ + ReleasesImagePolicy: corev1.LocalObjectReference{Name: "block-ip"}, + HealthCheckSelector: &rolloutv1alpha1.HealthCheckSelectorConfig{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "test-app"}}, + }, + }, + } + Expect(k8sClient.Create(ctx, rollout)).To(Succeed()) + + healthCheck = &rolloutv1alpha1.HealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + Name: "block-hc", Namespace: namespace, + Labels: map[string]string{"app": "test-app"}, + }, + } + Expect(k8sClient.Create(ctx, healthCheck)).To(Succeed()) + + fakeClock = NewFakeClock() + reconciler = &RolloutReconciler{Client: k8sClient, Scheme: k8sClient.Scheme(), Clock: fakeClock} + key = types.NamespacedName{Name: rollout.Name, Namespace: namespace} + }) + + AfterEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}} + Expect(k8sClient.Delete(ctx, ns)).To(Succeed()) + }) + + It("surfaces DeploymentBlocked=True even when a blocking gate would otherwise return early", func() { + // Regression: previously the gate early-return at the top of Reconcile prevented + // the DeploymentBlocked condition from being set, so the UI showed no signal that + // health checks were also unhealthy. 
+ blockingGate := &rolloutv1alpha1.RolloutGate{ + ObjectMeta: metav1.ObjectMeta{Name: "block-gate", Namespace: namespace}, + Spec: rolloutv1alpha1.RolloutGateSpec{ + RolloutRef: &corev1.LocalObjectReference{Name: rollout.Name}, + Passing: k8sptr.To(false), + }, + } + Expect(k8sClient.Create(ctx, blockingGate)).To(Succeed()) + + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusUnhealthy + healthCheck.Status.Message = k8sptr.To("simulated incident") + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + cond := meta.FindStatusCondition(rollout.Status.Conditions, rolloutv1alpha1.RolloutDeploymentBlocked) + Expect(cond).NotTo(BeNil(), "DeploymentBlocked condition should be persisted even with gate blocking") + Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + Expect(cond.Reason).To(Equal("UnhealthyHealthChecks")) + Expect(cond.Message).To(ContainSubstring("simulated incident")) + }) + + It("clears DeploymentBlocked once health checks recover, even while gates still block", func() { + blockingGate := &rolloutv1alpha1.RolloutGate{ + ObjectMeta: metav1.ObjectMeta{Name: "block-gate", Namespace: namespace}, + Spec: rolloutv1alpha1.RolloutGateSpec{ + RolloutRef: &corev1.LocalObjectReference{Name: rollout.Name}, + Passing: k8sptr.To(false), + }, + } + Expect(k8sClient.Create(ctx, blockingGate)).To(Succeed()) + + healthCheck.Status.Status = rolloutv1alpha1.HealthStatusHealthy + Expect(k8sClient.Status().Update(ctx, healthCheck)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + cond := meta.FindStatusCondition(rollout.Status.Conditions, rolloutv1alpha1.RolloutDeploymentBlocked) + Expect(cond).NotTo(BeNil()) + 
Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + Expect(cond.Reason).To(Equal("HealthChecksHealthy")) + }) +}) diff --git a/internal/controller/rollout_controller.go b/internal/controller/rollout_controller.go index 1dde080..22b64b0 100644 --- a/internal/controller/rollout_controller.go +++ b/internal/controller/rollout_controller.go @@ -146,6 +146,20 @@ func (r *RolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } rollout.Status.GatedReleaseCandidates = gatedReleaseCandidates + // Evaluate health checks once and set DeploymentBlocked. Done before the gate + // early-return so the condition surfaces even when gates also block. + healthChecksHealthy := true + var healthCheckMessage string + if !r.hasManualDeployment(&rollout) { + var err error + healthChecksHealthy, healthCheckMessage, err = r.evaluateHealthChecks(ctx, req.Namespace, &rollout) + if err != nil { + log.Error(err, "Failed to evaluate health checks") + return ctrl.Result{}, err + } + } + r.setDeploymentBlockedCondition(&rollout, healthChecksHealthy, healthCheckMessage) + // Update status once with both release candidates and gated release candidates if err := r.Client.Status().Update(ctx, &rollout); err != nil { log.Error(err, "Failed to update rollout status with release candidates and gated release candidates") @@ -231,26 +245,14 @@ func (r *RolloutReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } } - // Evaluate health checks - block deployment if health checks are not healthy - // For manual deployments (WantedVersion or force-deploy), skip this check - if !r.hasManualDeployment(&rollout) { - healthChecksHealthy, healthCheckMessage, err := r.evaluateHealthChecks(ctx, req.Namespace, &rollout) - if err != nil { - log.Error(err, "Failed to evaluate health checks") - return ctrl.Result{}, err - } - if !healthChecksHealthy { - log.Info("Health checks are not healthy, blocking deployment", "message", healthCheckMessage) - // Update status before returning - if 
err := r.Client.Status().Update(ctx, &rollout); err != nil { - log.Error(err, "Failed to update rollout status") - } - // Emit event for health check blocking - if r.Recorder != nil { - r.Recorder.Event(&rollout, corev1.EventTypeWarning, "HealthCheckBlocking", healthCheckMessage) - } - return ctrl.Result{}, nil + // Block automatic deploys on unhealthy health checks. Manual deploys bypass; the + // recovery state for those is captured by setBakeFailureDisabledForNewDeploy. + if !r.hasManualDeployment(&rollout) && !healthChecksHealthy { + log.Info("Health checks are not healthy, blocking deployment", "message", healthCheckMessage) + if r.Recorder != nil { + r.Recorder.Event(&rollout, corev1.EventTypeWarning, "HealthCheckBlocking", healthCheckMessage) } + return ctrl.Result{}, nil } // Use filteredReleases instead of releases for wantedRelease selection @@ -1020,6 +1022,68 @@ func (r *RolloutReconciler) evaluateHealthChecks(ctx context.Context, namespace return true, "", nil } +// setBakeFailureDisabledForNewDeploy sets the BakeFailureDisabled condition based on +// state at the moment a new deployment starts. The condition then persists for the +// entry's lifetime (overwritten only when the next deploy starts). +// +// Recovery-mode reasons: +// - PreviousBakeFailed: the prior entry didn't succeed (e.g. mid-rollback). +// - DeployedWithUnhealthyHealthChecks: manual deploy issued while any health check +// was already Unhealthy (deploy-during-incident). 
+func (r *RolloutReconciler) setBakeFailureDisabledForNewDeploy(ctx context.Context, rollout *rolloutv1alpha1.Rollout, log logr.Logger) {
+	// Stamp the condition with r.now(), the injectable clock deployRelease also
+	// reads, instead of the wall clock, so fake-clock runs stay consistent.
+	cond := metav1.Condition{
+		Type:               rolloutv1alpha1.RolloutBakeFailureDisabled,
+		Status:             metav1.ConditionFalse,
+		LastTransitionTime: metav1.Time{Time: r.now()},
+		Reason:             "Normal",
+	}
+	history := rollout.Status.History // History[0] is still the outgoing entry: the caller prepends the new one afterwards.
+	if len(history) > 0 && history[0].BakeStatus != nil && *history[0].BakeStatus != rolloutv1alpha1.BakeStatusSucceeded {
+		cond.Status = metav1.ConditionTrue
+		cond.Reason = "PreviousBakeFailed"
+		cond.Message = "Previous deployment did not succeed. Health check failures will not fail this deployment."
+	} else if r.hasManualDeployment(rollout) {
+		hcs, err := r.listHealthChecks(ctx, rollout.Namespace, rollout)
+		if err != nil {
+			log.Error(err, "Failed to list health checks for recovery-mode evaluation") // best effort: condition stays False/Normal
+		} else {
+			for _, hc := range hcs {
+				if hc.Status.Status == rolloutv1alpha1.HealthStatusUnhealthy {
+					cond.Status = metav1.ConditionTrue
+					cond.Reason = "DeployedWithUnhealthyHealthChecks"
+					cond.Message = "Deployed during an active incident. Health check failures will not fail this deployment."
+					break
+				}
+			}
+		}
+	}
+	meta.SetStatusCondition(&rollout.Status.Conditions, cond)
+}
+
+// setDeploymentBlockedCondition sets DeploymentBlocked based on health-check state.
+// Independent of gate blocking so both blockers can surface concurrently.
+func (r *RolloutReconciler) setDeploymentBlockedCondition(rollout *rolloutv1alpha1.Rollout, healthChecksHealthy bool, healthCheckMessage string) {
+	// Start from the healthy outcome and override it below. A manual deployment
+	// is checked first, so it reports False whatever healthChecksHealthy says.
+	status, reason, message := metav1.ConditionFalse, "HealthChecksHealthy", ""
+	switch {
+	case r.hasManualDeployment(rollout):
+		reason = "ManualDeployment"
+	case !healthChecksHealthy:
+		status, reason, message = metav1.ConditionTrue, "UnhealthyHealthChecks", healthCheckMessage
+	}
+	blocked := metav1.Condition{
+		Type:               rolloutv1alpha1.RolloutDeploymentBlocked,
+		Status:             status,
+		LastTransitionTime: metav1.Now(),
+		Reason:             reason,
+		Message:            message,
+	}
+	meta.SetStatusCondition(&rollout.Status.Conditions, blocked)
+}
+
 // hasManualDeployment checks if there's a manual deployment requested (WantedVersion or force deploy)
 func (r *RolloutReconciler) hasManualDeployment(rollout *rolloutv1alpha1.Rollout) bool {
 	// Check for WantedVersion in spec
@@ -1200,6 +1264,10 @@ func (r *RolloutReconciler) deployRelease(ctx context.Context, rollout *rolloutv
 		}
 	}
 
+	// Set BakeFailureDisabled before creating the new entry so it can read the current
+	// (about-to-become-previous) entry's BakeStatus. Persists for the entry's lifetime.
+	r.setBakeFailureDisabledForNewDeploy(ctx, rollout, log)
+
 	nextID := r.getNextHistoryID(rollout)
 	now := metav1.Time{Time: r.now()}
 	rollout.Status.History = append([]rolloutv1alpha1.DeploymentHistoryEntry{{
@@ -1625,14 +1693,10 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string
 		// Use errorCutoff (max of deployTime and lastRetryTimestamp) so a retry
 		// gets a fresh timeout window instead of inheriting the original deadline.
if now.After(errorCutoff.Add(rollout.Spec.DeployTimeout.Duration)) { - // Only fail if previous entry was successful (or doesn't exist) - shouldFail := true - if len(rollout.Status.History) > 1 { - previousEntry := rollout.Status.History[1] - if previousEntry.BakeStatus != nil && *previousEntry.BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { - shouldFail = false - log.Info("Previous rollout entry was not successful, not failing current rollout despite deploy timeout") - } + // Recovery mode (BakeFailureDisabled=True, set at deploy start) suppresses failure. + shouldFail := !meta.IsStatusConditionTrue(rollout.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) + if !shouldFail { + log.Info("Rollout is in recovery mode (BakeFailureDisabled=True); not failing despite deploy timeout") } if shouldFail { @@ -1677,16 +1741,12 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string } } - // If health check error detected, mark as failed (unless previous entry was not successful) + // If a health check error is detected, mark as failed unless the rollout is in + // recovery mode (BakeFailureDisabled=True, set at deploy start). if healthCheckError { - // Only fail if previous entry was successful (or doesn't exist) - shouldFail := true - if len(rollout.Status.History) > 1 { - previousEntry := rollout.Status.History[1] - if previousEntry.BakeStatus != nil && *previousEntry.BakeStatus != rolloutv1alpha1.BakeStatusSucceeded { - shouldFail = false - log.Info("Previous rollout entry was not successful, not failing current rollout despite health check error") - } + shouldFail := !meta.IsStatusConditionTrue(rollout.Status.Conditions, rolloutv1alpha1.RolloutBakeFailureDisabled) + if !shouldFail { + log.Info("Rollout is in recovery mode (BakeFailureDisabled=True); not failing despite health check error") } if shouldFail {