From 541b0af4fd8fddb5eed033b442f253cb79f43037 Mon Sep 17 00:00:00 2001 From: Luka Skugor Date: Mon, 27 Apr 2026 06:09:50 +0000 Subject: [PATCH 1/4] refactor: remove RetryMode from API; retry annotation is now value-agnostic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mode of retry (re-run vs skip) belongs to the domain that owns the tests, not to the rollout controller. Removing LastRetryMode field and RetryModeRetry/Skip constants makes the rollout-controller API generic — it only stamps LastRetryTimestamp, which is the cross-controller contract. Co-Authored-By: Claude Sonnet 4.6 --- api/v1alpha1/rollout_types.go | 21 +++------------- config/crd/bases/kuberik.com_rollouts.yaml | 9 ------- internal/controller/retry_test.go | 29 +++------------------- internal/controller/rollout_controller.go | 21 ++++++---------- 4 files changed, 13 insertions(+), 67 deletions(-) diff --git a/api/v1alpha1/rollout_types.go b/api/v1alpha1/rollout_types.go index 8b60a04..d1db042 100644 --- a/api/v1alpha1/rollout_types.go +++ b/api/v1alpha1/rollout_types.go @@ -339,13 +339,6 @@ type DeploymentHistoryEntry struct { // ignored as they reflect the pre-retry state. // +optional LastRetryTimestamp *metav1.Time `json:"lastRetryTimestamp,omitempty"` - - // LastRetryMode records how the most recent retry should treat failed RolloutTests. - // "retry" (or empty): re-run failed tests; openkruise stepgate resets them to WaitingForStep. - // "skip": mark failed tests as Skipped (treated as passing); skip the re-run. - // +kubebuilder:validation:Enum=retry;skip - // +optional - LastRetryMode string `json:"lastRetryMode,omitempty"` } // +kubebuilder:object:root=true @@ -403,18 +396,10 @@ const ( // RetryAnnotation, when set on a Rollout whose latest history entry has BakeStatus=Failed, // triggers a retry: the rollout controller resets BakeStatus to Deploying, records // LastRetryTimestamp on the history entry, and removes the annotation. The annotation - // value selects how failed RolloutTests are handled: - // "retry" (or empty/unknown): re-run failed tests. - // "skip": mark failed tests as Skipped (counted as passing) so the rollout can - // advance without re-running them. + // value is opaque to this controller — domain-specific retry modes (e.g. how failed + // RolloutTests should be handled) live as separate annotations owned by their respective + // controllers, keyed off LastRetryTimestamp as the cross-controller signal. RetryAnnotation = "rollout.kuberik.com/retry" - - // RetryModeRetry re-runs failed RolloutTests on retry (the default). - RetryModeRetry = "retry" - - // RetryModeSkip marks failed RolloutTests as Skipped on retry — the rollout advances - // without re-running them. - RetryModeSkip = "skip" ) func init() { diff --git a/config/crd/bases/kuberik.com_rollouts.yaml b/config/crd/bases/kuberik.com_rollouts.yaml index 2e51600..47b9730 100644 --- a/config/crd/bases/kuberik.com_rollouts.yaml +++ b/config/crd/bases/kuberik.com_rollouts.yaml @@ -458,15 +458,6 @@ spec: this history entry. format: int64 type: integer - lastRetryMode: - description: |- - LastRetryMode records how the most recent retry should treat failed RolloutTests. - "retry" (or empty): re-run failed tests; openkruise stepgate resets them to WaitingForStep. - "skip": mark failed tests as Skipped (treated as passing); skip the re-run. - enum: - - retry - - skip - type: string lastRetryTimestamp: description: |- LastRetryTimestamp is the time when a retry was most recently requested for this deployment. diff --git a/internal/controller/retry_test.go b/internal/controller/retry_test.go index 84f1721..8f12557 100644 --- a/internal/controller/retry_test.go +++ b/internal/controller/retry_test.go @@ -106,9 +106,10 @@ var _ = Describe("Rollout retry annotation", func() { Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) } - It("resets failed bake status, records LastRetryTimestamp, removes annotation", func() { + It("resets failed bake status, records LastRetryTimestamp, removes annotation regardless of value", func() { seedFailedHistory() - addRetryAnnotation("2026-04-22T10:00:00Z") + // Annotation value is opaque to this controller — any value triggers a retry. + addRetryAnnotation("anything-goes") Expect(reconciler.handleRetryAnnotation(ctx, rollout, testLogger())).To(Succeed()) @@ -126,30 +127,6 @@ var _ = Describe("Rollout retry annotation", func() { Expect(entry.FailedHealthChecks).To(BeEmpty()) Expect(entry.LastRetryTimestamp).NotTo(BeNil()) Expect(entry.LastRetryTimestamp.Time).To(Equal(fakeClock.Now())) - // Unknown annotation value (timestamp) → default mode "retry". - Expect(entry.LastRetryMode).To(Equal(rolloutv1alpha1.RetryModeRetry)) - }) - - It("records LastRetryMode=skip when annotation value is \"skip\"", func() { - seedFailedHistory() - addRetryAnnotation("skip") - - Expect(reconciler.handleRetryAnnotation(ctx, rollout, testLogger())).To(Succeed()) - - fetched := &rolloutv1alpha1.Rollout{} - Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed()) - Expect(fetched.Status.History[0].LastRetryMode).To(Equal(rolloutv1alpha1.RetryModeSkip)) - }) - - It("records LastRetryMode=retry when annotation value is \"retry\"", func() { - seedFailedHistory() - addRetryAnnotation("retry") - - Expect(reconciler.handleRetryAnnotation(ctx, rollout, testLogger())).To(Succeed()) - - fetched := &rolloutv1alpha1.Rollout{} - Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed()) - Expect(fetched.Status.History[0].LastRetryMode).To(Equal(rolloutv1alpha1.RetryModeRetry)) }) It("removes annotation but does not reset when BakeStatus is InProgress (double-retry guard)", func() { diff --git a/internal/controller/rollout_controller.go b/internal/controller/rollout_controller.go index 221e8c9..d7c6646 100644 --- a/internal/controller/rollout_controller.go +++ b/internal/controller/rollout_controller.go @@ -1893,14 +1893,16 @@ func (r *RolloutReconciler) now() time.Time { // It resets the most recent history entry from Failed back to Deploying, records // LastRetryTimestamp so downstream controllers (health checks, kruise step gates) // can distinguish stale from fresh failures, and clears the annotation. +// The annotation value is opaque — domain-specific retry directives (e.g. skip +// failed RolloutTests) live as separate annotations owned by their respective +// controllers and keyed off LastRetryTimestamp. // No-op unless BakeStatus is Failed and the annotation is present — double-retry // attempts during an in-flight retry are therefore idempotent. func (r *RolloutReconciler) handleRetryAnnotation(ctx context.Context, rollout *rolloutv1alpha1.Rollout, log logr.Logger) error { if rollout.Annotations == nil { return nil } - retryValue, hasAnnotation := rollout.Annotations[rolloutv1alpha1.RetryAnnotation] - if !hasAnnotation { + if _, hasAnnotation := rollout.Annotations[rolloutv1alpha1.RetryAnnotation]; !hasAnnotation { return nil } @@ -1917,21 +1919,13 @@ func (r *RolloutReconciler) handleRetryAnnotation(ctx context.Context, rollout * current := &rollout.Status.History[0] if current.BakeStatus == nil || *current.BakeStatus != rolloutv1alpha1.BakeStatusFailed { log.Info("Retry annotation ignored; BakeStatus not Failed", - "bakeStatus", k8sptr.Deref(current.BakeStatus, ""), - "retryValue", retryValue) + "bakeStatus", k8sptr.Deref(current.BakeStatus, "")) return clearAnnotation() } - // Annotation value is the mode: "retry" (default) or "skip". - mode := rolloutv1alpha1.RetryModeRetry - if retryValue == rolloutv1alpha1.RetryModeSkip { - mode = rolloutv1alpha1.RetryModeSkip - } - now := metav1.Time{Time: r.now()} deploying := rolloutv1alpha1.BakeStatusDeploying current.LastRetryTimestamp = &now - current.LastRetryMode = mode current.BakeStatus = &deploying current.BakeStatusMessage = nil current.BakeStartTime = nil @@ -1948,11 +1942,10 @@ func (r *RolloutReconciler) handleRetryAnnotation(ctx context.Context, rollout * if r.Recorder != nil { r.Recorder.Event(rollout, corev1.EventTypeNormal, "RetryRequested", - fmt.Sprintf("Retry requested (mode=%s); bake status reset at %s", mode, now.Format(time.RFC3339))) + fmt.Sprintf("Retry requested; bake status reset at %s", now.Format(time.RFC3339))) } log.Info("Processed retry annotation; bake status reset to Deploying", - "lastRetryTimestamp", now.Format(time.RFC3339), - "mode", mode) + "lastRetryTimestamp", now.Format(time.RFC3339)) return nil } From 65af6133b35324b77f0ff195088aceb8da848b78 Mon Sep 17 00:00:00 2001 From: Luka Skugor Date: Mon, 27 Apr 2026 19:12:17 +0000 Subject: [PATCH 2/4] fix: use errorCutoff (not deployTime) for deploy-timeout and canStartBake gates After retry, errorCutoff = max(deployTime, lastRetryTimestamp). Three call sites were still using raw deployTime, causing incorrect behavior post-retry: 1. rollout_controller.go:1625 - deploy-timeout deadline: if the original deploy was already past DeployTimeout when the retry happened, the next reconcile immediately marked the rollout Failed with "deploy timeout" before any health check had a chance to re-evaluate. 2. rollout_controller.go:1743 - canStartBake gate: health checks with LastChangeTime between deployTime and lastRetryTimestamp passed the gate, meaning bake could start based on a pre-retry health report. 3. rollout_controller.go:1643 - collectUnhealthyHealthChecks: FailedHealthChecks on the deploy-timeout path included pre-retry stale failures in the record. Also fix kustomizationhealth_controller.go:354 - when a kustomization is Unhealthy but getFailureConditionTime returns nil (no conditions on the managed resource), LastErrorTime was not updated, leaving a stale or nil timestamp. The rollout controller compares LastErrorTime against errorCutoff to detect fresh failures; a stale timestamp causes the failure to be silently ignored until the next healthy health-check cycle that does return a timestamp. Fall back to now when errorTime is nil so there is always a valid timestamp. Co-Authored-By: Claude Sonnet 4.6 --- .../controller/kustomizationhealth_controller.go | 10 ++++++++-- internal/controller/rollout_controller.go | 15 +++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/internal/controller/kustomizationhealth_controller.go b/internal/controller/kustomizationhealth_controller.go index ea13a34..c6f4d06 100644 --- a/internal/controller/kustomizationhealth_controller.go +++ b/internal/controller/kustomizationhealth_controller.go @@ -351,8 +351,14 @@ func (r *KustomizationHealthReconciler) updateHealthCheckStatus(ctx context.Cont // Set LastErrorTime to the actual failure condition timestamp so the rollout controller // can determine whether the failure is pre- or post-retry without involving this controller. - if newStatus == rolloutv1alpha1.HealthStatusUnhealthy && errorTime != nil { - healthCheck.Status.LastErrorTime = errorTime + // Fall back to now when the underlying object has no condition timestamp (e.g. no conditions + // set yet) so the rollout controller always has a valid timestamp to compare against. + if newStatus == rolloutv1alpha1.HealthStatusUnhealthy { + if errorTime != nil { + healthCheck.Status.LastErrorTime = errorTime + } else { + healthCheck.Status.LastErrorTime = &now + } } // Update the status diff --git a/internal/controller/rollout_controller.go b/internal/controller/rollout_controller.go index d7c6646..1dde080 100644 --- a/internal/controller/rollout_controller.go +++ b/internal/controller/rollout_controller.go @@ -1621,8 +1621,10 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string // Check deployTimeout - if bake hasn't started within deployTimeout, mark as failed // But only if the previous entry was successful (or doesn't exist) if rollout.Spec.DeployTimeout != nil && currentEntry.BakeStartTime == nil { - // Bake hasn't started yet - check if deployTimeout has been exceeded - if now.After(deployTime.Add(rollout.Spec.DeployTimeout.Duration)) { + // Bake hasn't started yet - check if deployTimeout has been exceeded. + // Use errorCutoff (max of deployTime and lastRetryTimestamp) so a retry + // gets a fresh timeout window instead of inheriting the original deadline. + if now.After(errorCutoff.Add(rollout.Spec.DeployTimeout.Duration)) { // Only fail if previous entry was successful (or doesn't exist) shouldFail := true if len(rollout.Status.History) > 1 { @@ -1639,8 +1641,9 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string currentEntry.BakeStatusMessage = k8sptr.To("Deploy timeout reached before bake could start (health checks did not become healthy in time).") currentEntry.BakeEndTime = &metav1.Time{Time: now} - // Collect unhealthy health checks that prevented bake from starting - currentEntry.FailedHealthChecks = r.collectUnhealthyHealthChecks(allHealthChecks, deployTime) + // Collect unhealthy health checks that prevented bake from starting. + // errorCutoff excludes pre-retry stale failures from the record. + currentEntry.FailedHealthChecks = r.collectUnhealthyHealthChecks(allHealthChecks, errorCutoff) meta.SetStatusCondition(&rollout.Status.Conditions, metav1.Condition{ Type: rolloutv1alpha1.RolloutReady, @@ -1740,9 +1743,9 @@ func (r *RolloutReconciler) handleBakeTime(ctx context.Context, namespace string break } - if hc.Status.LastChangeTime.Time.Before(deployTime) { + if hc.Status.LastChangeTime.Time.Before(errorCutoff) { canStartBake = false - log.Info("HealthCheck LastChangeTime is not newer than deployment time", "name", hc.Name, "namespace", hc.Namespace, "lastChangeTime", hc.Status.LastChangeTime.Time, "deployTime", deployTime) + log.Info("HealthCheck LastChangeTime is not newer than error cutoff", "name", hc.Name, "namespace", hc.Namespace, "lastChangeTime", hc.Status.LastChangeTime.Time, "errorCutoff", errorCutoff) break } } From 583799bbed0ff4e9228a3ec5476d81d31a9631ea Mon Sep 17 00:00:00 2001 From: Luka Skugor Date: Tue, 28 Apr 2026 05:56:17 +0000 Subject: [PATCH 3/4] test: cover errorCutoff gates and nil-errorTime fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three new errorCutoff tests (retry_test.go): - deploy-timeout window resets after retry — no instant failure if original deadline elapsed - canStartBake blocks when health check LastChangeTime predates retry timestamp - FailedHealthChecks records health checks whose LastChangeTime predates retry Two nil-errorTime tests (kustomizationhealth_controller_test.go): - LastErrorTime is set to now when errorTime is nil (resource has no condition timestamps) - LastErrorTime uses provided errorTime when non-nil Co-Authored-By: Claude Sonnet 4.6 --- .../kustomizationhealth_controller_test.go | 70 +++++++++ internal/controller/retry_test.go | 139 ++++++++++++++++++ 2 files changed, 209 insertions(+) diff --git a/internal/controller/kustomizationhealth_controller_test.go b/internal/controller/kustomizationhealth_controller_test.go index 5c4b550..8669f77 100644 --- a/internal/controller/kustomizationhealth_controller_test.go +++ b/internal/controller/kustomizationhealth_controller_test.go @@ -1224,3 +1224,73 @@ var _ = Describe("getFailureConditionTime", func() { Expect(getFailureConditionTime(obj)).To(BeNil()) }) }) + +// These tests verify the nil-errorTime fallback in updateHealthCheckStatus. +// When a managed resource is Unhealthy but has no condition timestamps +// (getFailureConditionTime returns nil), LastErrorTime must be set to now +// so the rollout controller's errorCutoff comparison sees a fresh failure. +var _ = Describe("KustomizationHealth updateHealthCheckStatus nil errorTime", func() { + ctx := context.Background() + var namespace string + var healthCheck *rolloutv1alpha1.HealthCheck + var fakeClock *FakeClock + + BeforeEach(func() { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "khnil-ns-"}} + Expect(k8sClient.Create(ctx, ns)).To(Succeed()) + namespace = ns.Name + + healthCheck = &rolloutv1alpha1.HealthCheck{ + ObjectMeta: metav1.ObjectMeta{Name: "nil-errtime-hc", Namespace: namespace}, + Spec: rolloutv1alpha1.HealthCheckSpec{}, + } + Expect(k8sClient.Create(ctx, healthCheck)).To(Succeed()) + + fakeClock = NewFakeClock() + }) + + AfterEach(func() { + Expect(k8sClient.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}})).To(Succeed()) + }) + + It("sets LastErrorTime to now when unhealthy but errorTime is nil", func() { + reconciler := &KustomizationHealthReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + Clock: fakeClock, + } + + By("calling updateHealthCheckStatus with nil errorTime") + msg := "no condition timestamps available" + _, err := reconciler.updateHealthCheckStatus(ctx, healthCheck, + rolloutv1alpha1.HealthStatusUnhealthy, msg, nil) + Expect(err).NotTo(HaveOccurred()) + + By("verifying LastErrorTime is set to the clock's current time") + updated := &rolloutv1alpha1.HealthCheck{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: healthCheck.Name, Namespace: namespace}, updated)).To(Succeed()) + Expect(updated.Status.LastErrorTime).NotTo(BeNil(), + "LastErrorTime must be set even when the underlying resource has no condition timestamps") + Expect(updated.Status.LastErrorTime.Time).To(Equal(fakeClock.Now()), + "LastErrorTime must be stamped with the controller clock's current time") + }) + + It("uses the provided errorTime when it is non-nil", func() { + reconciler := &KustomizationHealthReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + Clock: fakeClock, + } + + conditionTime := metav1.NewTime(fakeClock.Now().Add(-3 * time.Minute)) + _, err := reconciler.updateHealthCheckStatus(ctx, healthCheck, + rolloutv1alpha1.HealthStatusUnhealthy, "deployment progressing deadline exceeded", &conditionTime) + Expect(err).NotTo(HaveOccurred()) + + updated := &rolloutv1alpha1.HealthCheck{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: healthCheck.Name, Namespace: namespace}, updated)).To(Succeed()) + Expect(updated.Status.LastErrorTime).NotTo(BeNil()) + Expect(updated.Status.LastErrorTime.Time).To(Equal(conditionTime.Time), + "when errorTime is provided it must be used verbatim — not overridden by now") + }) +}) diff --git a/internal/controller/retry_test.go b/internal/controller/retry_test.go index 8f12557..65189db 100644 --- a/internal/controller/retry_test.go +++ b/internal/controller/retry_test.go @@ -430,4 +430,143 @@ var _ = Describe("Rollout errorCutoff with retry", func() { Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed()) Expect(*fetched.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusFailed)) }) + + It("deploy-timeout deadline measured from retry time, not original deploy time", func() { + // Deployed 15 min ago, retried 2 min ago, DeployTimeout = 10 min. + // errorCutoff = retryTime (2 min ago). Deadline = retryTime + 10 min = now + 8 min. + // now.After(now + 8 min) = false → timeout must NOT fire. + // Without the fix (using deployTime): deadline = deployTime + 10 min = now - 5 min. + // now.After(now - 5 min) = true → rollout would be immediately marked Failed. + retryAt := fakeClock.Now().Add(-2 * time.Minute) + deployAt := fakeClock.Now().Add(-15 * time.Minute) + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + retryTs := metav1.NewTime(retryAt) + deploying := rolloutv1alpha1.BakeStatusDeploying + rollout.Status.History = []rolloutv1alpha1.DeploymentHistoryEntry{{ + ID: k8sptr.To[int64](1), + Version: rolloutv1alpha1.VersionInfo{Tag: "1.0.0"}, + Timestamp: metav1.NewTime(deployAt), + BakeStatus: &deploying, + LastRetryTimestamp: &retryTs, + // BakeStartTime nil → deploy-timeout check is active + }} + Expect(k8sClient.Status().Update(ctx, rollout)).To(Succeed()) + + // Patch DeployTimeout = 10 min onto the rollout spec. + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.DeployTimeout = &metav1.Duration{Duration: 10 * time.Minute} + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + // Seed an unhealthy health check so bake cannot start. + seedHealthCheck(retryAt.Add(-5 * time.Minute)) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + fetched := &rolloutv1alpha1.Rollout{} + Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed()) + Expect(*fetched.Status.History[0].BakeStatus).NotTo(Equal(rolloutv1alpha1.BakeStatusFailed), + "deploy-timeout must be measured from errorCutoff (retry time), not original deploy time") + }) + + It("canStartBake requires LastChangeTime after retry time, not just deploy time", func() { + // Deploy 15 min ago, retry 5 min ago. + // Health check LastChangeTime = 7 min ago — AFTER deployTime but BEFORE retryTime. + // With errorCutoff: LastChangeTime.Before(now-5m) = true → canStartBake = false → bake blocked. + // Without fix (using deployTime): LastChangeTime.Before(now-15m) = false → bake would start. + retryAt := fakeClock.Now().Add(-5 * time.Minute) + deployAt := fakeClock.Now().Add(-15 * time.Minute) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + retryTs := metav1.NewTime(retryAt) + deploying := rolloutv1alpha1.BakeStatusDeploying + rollout.Status.History = []rolloutv1alpha1.DeploymentHistoryEntry{{ + ID: k8sptr.To[int64](1), + Version: rolloutv1alpha1.VersionInfo{Tag: "1.0.0"}, + Timestamp: metav1.NewTime(deployAt), + BakeStatus: &deploying, + LastRetryTimestamp: &retryTs, + }} + Expect(k8sClient.Status().Update(ctx, rollout)).To(Succeed()) + + // Healthy health check, but LastChangeTime is between deployTime and retryTime. + hc := &rolloutv1alpha1.HealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "hc-", + Namespace: namespace, + Labels: map[string]string{"suite": "errcutoff"}, + }, + Spec: rolloutv1alpha1.HealthCheckSpec{}, + } + Expect(k8sClient.Create(ctx, hc)).To(Succeed()) + changeTime := metav1.NewTime(fakeClock.Now().Add(-7 * time.Minute)) + hc.Status = rolloutv1alpha1.HealthCheckStatus{ + Status: rolloutv1alpha1.HealthStatusHealthy, + LastChangeTime: &changeTime, + } + Expect(k8sClient.Status().Update(ctx, hc)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + fetched := &rolloutv1alpha1.Rollout{} + Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed()) + Expect(fetched.Status.History[0].BakeStartTime).To(BeNil(), + "bake must not start when health check LastChangeTime is before the retry time") + }) + + It("FailedHealthChecks excludes health checks whose LastChangeTime predates the retry", func() { + // Deploy 15 min ago, retry 2 min ago, DeployTimeout = 30s (fires immediately since + // now > retryTime + 30s). Health check is Healthy but LastChangeTime = 7 min ago + // (before retryTime). With errorCutoff the record must include this health check + // as stale-relative-to-retry; without the fix (using deployTime) it would be + // excluded because LastChangeTime > deployTime looks "fresh enough". + retryAt := fakeClock.Now().Add(-2 * time.Minute) + deployAt := fakeClock.Now().Add(-15 * time.Minute) + + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + retryTs := metav1.NewTime(retryAt) + deploying := rolloutv1alpha1.BakeStatusDeploying + rollout.Status.History = []rolloutv1alpha1.DeploymentHistoryEntry{{ + ID: k8sptr.To[int64](1), + Version: rolloutv1alpha1.VersionInfo{Tag: "1.0.0"}, + Timestamp: metav1.NewTime(deployAt), + BakeStatus: &deploying, + LastRetryTimestamp: &retryTs, + }} + Expect(k8sClient.Status().Update(ctx, rollout)).To(Succeed()) + + // DeployTimeout = 30s → deadline = retryTime + 30s = now - 90s → fires. + Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed()) + rollout.Spec.DeployTimeout = &metav1.Duration{Duration: 30 * time.Second} + Expect(k8sClient.Update(ctx, rollout)).To(Succeed()) + + // Healthy health check whose LastChangeTime is between deployTime and retryTime. + hc := &rolloutv1alpha1.HealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + Name: "stale-hc", + Namespace: namespace, + Labels: map[string]string{"suite": "errcutoff"}, + }, + Spec: rolloutv1alpha1.HealthCheckSpec{}, + } + Expect(k8sClient.Create(ctx, hc)).To(Succeed()) + changeTime := metav1.NewTime(fakeClock.Now().Add(-7 * time.Minute)) + hc.Status = rolloutv1alpha1.HealthCheckStatus{ + Status: rolloutv1alpha1.HealthStatusHealthy, + LastChangeTime: &changeTime, + } + Expect(k8sClient.Status().Update(ctx, hc)).To(Succeed()) + + _, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) + Expect(err).NotTo(HaveOccurred()) + + fetched := &rolloutv1alpha1.Rollout{} + Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed()) + Expect(*fetched.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusFailed), + "deploy-timeout should have fired") + Expect(fetched.Status.History[0].FailedHealthChecks).To(ContainElement( + HaveField("Name", "stale-hc"), + ), "health check with pre-retry LastChangeTime must appear in FailedHealthChecks") + }) }) From a12485d70b0ae8f9e49a073063f40bb5afd51d86 Mon Sep 17 00:00:00 2001 From: Luka Skugor Date: Tue, 28 Apr 2026 06:17:03 +0000 Subject: [PATCH 4/4] lint --- internal/controller/healthcheck_controller.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/controller/healthcheck_controller.go b/internal/controller/healthcheck_controller.go index 99ec5c5..72146d5 100644 --- a/internal/controller/healthcheck_controller.go +++ b/internal/controller/healthcheck_controller.go @@ -256,4 +256,3 @@ func healthCheckMatchesRollout(ctx context.Context, c client.Reader, healthCheck return true } -