Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 3 additions & 18 deletions api/v1alpha1/rollout_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,13 +339,6 @@ type DeploymentHistoryEntry struct {
// ignored as they reflect the pre-retry state.
// +optional
LastRetryTimestamp *metav1.Time `json:"lastRetryTimestamp,omitempty"`

// LastRetryMode records how the most recent retry should treat failed RolloutTests.
// "retry" (or empty): re-run failed tests; openkruise stepgate resets them to WaitingForStep.
// "skip": mark failed tests as Skipped (treated as passing); skip the re-run.
// +kubebuilder:validation:Enum=retry;skip
// +optional
LastRetryMode string `json:"lastRetryMode,omitempty"`
}

// +kubebuilder:object:root=true
Expand Down Expand Up @@ -403,18 +396,10 @@ const (
// RetryAnnotation, when set on a Rollout whose latest history entry has BakeStatus=Failed,
// triggers a retry: the rollout controller resets BakeStatus to Deploying, records
// LastRetryTimestamp on the history entry, and removes the annotation. The annotation
// value selects how failed RolloutTests are handled:
// "retry" (or empty/unknown): re-run failed tests.
// "skip": mark failed tests as Skipped (counted as passing) so the rollout can
// advance without re-running them.
// value is opaque to this controller — domain-specific retry modes (e.g. how failed
// RolloutTests should be handled) live as separate annotations owned by their respective
// controllers, keyed off LastRetryTimestamp as the cross-controller signal.
RetryAnnotation = "rollout.kuberik.com/retry"

// RetryModeRetry re-runs failed RolloutTests on retry (the default).
RetryModeRetry = "retry"

// RetryModeSkip marks failed RolloutTests as Skipped on retry — the rollout advances
// without re-running them.
RetryModeSkip = "skip"
)

func init() {
Expand Down
9 changes: 0 additions & 9 deletions config/crd/bases/kuberik.com_rollouts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -458,15 +458,6 @@ spec:
this history entry.
format: int64
type: integer
lastRetryMode:
description: |-
LastRetryMode records how the most recent retry should treat failed RolloutTests.
"retry" (or empty): re-run failed tests; openkruise stepgate resets them to WaitingForStep.
"skip": mark failed tests as Skipped (treated as passing); skip the re-run.
enum:
- retry
- skip
type: string
lastRetryTimestamp:
description: |-
LastRetryTimestamp is the time when a retry was most recently requested for this deployment.
Expand Down
1 change: 0 additions & 1 deletion internal/controller/healthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
// Delegate to specialized controllers based on class
if healthCheck.Spec.Class != nil {
switch *healthCheck.Spec.Class {
case "kustomization":

Check failure on line 73 in internal/controller/healthcheck_controller.go

View workflow job for this annotation

GitHub Actions / Run on Ubuntu

string `kustomization` has 3 occurrences, make it a constant (goconst)

Check failure on line 73 in internal/controller/healthcheck_controller.go

View workflow job for this annotation

GitHub Actions / Run on Ubuntu

string `kustomization` has 3 occurrences, make it a constant (goconst)
// KustomizationHealth controller will handle this
// This controller just ensures the resource exists and is properly structured
return ctrl.Result{}, nil
Expand Down Expand Up @@ -256,4 +256,3 @@

return true
}

10 changes: 8 additions & 2 deletions internal/controller/kustomizationhealth_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,14 @@ func (r *KustomizationHealthReconciler) updateHealthCheckStatus(ctx context.Cont

// Set LastErrorTime to the actual failure condition timestamp so the rollout controller
// can determine whether the failure is pre- or post-retry without involving this controller.
if newStatus == rolloutv1alpha1.HealthStatusUnhealthy && errorTime != nil {
healthCheck.Status.LastErrorTime = errorTime
// Fall back to now when the underlying object has no condition timestamp (e.g. no conditions
// set yet) so the rollout controller always has a valid timestamp to compare against.
if newStatus == rolloutv1alpha1.HealthStatusUnhealthy {
if errorTime != nil {
healthCheck.Status.LastErrorTime = errorTime
} else {
healthCheck.Status.LastErrorTime = &now
}
}

// Update the status
Expand Down
70 changes: 70 additions & 0 deletions internal/controller/kustomizationhealth_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1224,3 +1224,73 @@ var _ = Describe("getFailureConditionTime", func() {
Expect(getFailureConditionTime(obj)).To(BeNil())
})
})

// These tests verify the nil-errorTime fallback in updateHealthCheckStatus.
// When a managed resource is Unhealthy but has no condition timestamps
// (getFailureConditionTime returns nil), LastErrorTime must be set to now
// so the rollout controller's errorCutoff comparison sees a fresh failure.
var _ = Describe("KustomizationHealth updateHealthCheckStatus nil errorTime", func() {
ctx := context.Background()
var namespace string
var healthCheck *rolloutv1alpha1.HealthCheck
var fakeClock *FakeClock

BeforeEach(func() {
ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "khnil-ns-"}}
Expect(k8sClient.Create(ctx, ns)).To(Succeed())
namespace = ns.Name

healthCheck = &rolloutv1alpha1.HealthCheck{
ObjectMeta: metav1.ObjectMeta{Name: "nil-errtime-hc", Namespace: namespace},
Spec: rolloutv1alpha1.HealthCheckSpec{},
}
Expect(k8sClient.Create(ctx, healthCheck)).To(Succeed())

fakeClock = NewFakeClock()
})

AfterEach(func() {
Expect(k8sClient.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: namespace}})).To(Succeed())
})

It("sets LastErrorTime to now when unhealthy but errorTime is nil", func() {
reconciler := &KustomizationHealthReconciler{
Client: k8sClient,
Scheme: k8sClient.Scheme(),
Clock: fakeClock,
}

By("calling updateHealthCheckStatus with nil errorTime")
msg := "no condition timestamps available"
_, err := reconciler.updateHealthCheckStatus(ctx, healthCheck,
rolloutv1alpha1.HealthStatusUnhealthy, msg, nil)
Expect(err).NotTo(HaveOccurred())

By("verifying LastErrorTime is set to the clock's current time")
updated := &rolloutv1alpha1.HealthCheck{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: healthCheck.Name, Namespace: namespace}, updated)).To(Succeed())
Expect(updated.Status.LastErrorTime).NotTo(BeNil(),
"LastErrorTime must be set even when the underlying resource has no condition timestamps")
Expect(updated.Status.LastErrorTime.Time).To(Equal(fakeClock.Now()),
"LastErrorTime must be stamped with the controller clock's current time")
})

It("uses the provided errorTime when it is non-nil", func() {
reconciler := &KustomizationHealthReconciler{
Client: k8sClient,
Scheme: k8sClient.Scheme(),
Clock: fakeClock,
}

conditionTime := metav1.NewTime(fakeClock.Now().Add(-3 * time.Minute))
_, err := reconciler.updateHealthCheckStatus(ctx, healthCheck,
rolloutv1alpha1.HealthStatusUnhealthy, "deployment progressing deadline exceeded", &conditionTime)
Expect(err).NotTo(HaveOccurred())

updated := &rolloutv1alpha1.HealthCheck{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: healthCheck.Name, Namespace: namespace}, updated)).To(Succeed())
Expect(updated.Status.LastErrorTime).NotTo(BeNil())
Expect(updated.Status.LastErrorTime.Time).To(Equal(conditionTime.Time),
"when errorTime is provided it must be used verbatim — not overridden by now")
})
})
168 changes: 142 additions & 26 deletions internal/controller/retry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,10 @@ var _ = Describe("Rollout retry annotation", func() {
Expect(k8sClient.Update(ctx, rollout)).To(Succeed())
}

It("resets failed bake status, records LastRetryTimestamp, removes annotation", func() {
It("resets failed bake status, records LastRetryTimestamp, removes annotation regardless of value", func() {
seedFailedHistory()
addRetryAnnotation("2026-04-22T10:00:00Z")
// Annotation value is opaque to this controller — any value triggers a retry.
addRetryAnnotation("anything-goes")

Expect(reconciler.handleRetryAnnotation(ctx, rollout, testLogger())).To(Succeed())

Expand All @@ -126,30 +127,6 @@ var _ = Describe("Rollout retry annotation", func() {
Expect(entry.FailedHealthChecks).To(BeEmpty())
Expect(entry.LastRetryTimestamp).NotTo(BeNil())
Expect(entry.LastRetryTimestamp.Time).To(Equal(fakeClock.Now()))
// Unknown annotation value (timestamp) → default mode "retry".
Expect(entry.LastRetryMode).To(Equal(rolloutv1alpha1.RetryModeRetry))
})

It("records LastRetryMode=skip when annotation value is \"skip\"", func() {
seedFailedHistory()
addRetryAnnotation("skip")

Expect(reconciler.handleRetryAnnotation(ctx, rollout, testLogger())).To(Succeed())

fetched := &rolloutv1alpha1.Rollout{}
Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed())
Expect(fetched.Status.History[0].LastRetryMode).To(Equal(rolloutv1alpha1.RetryModeSkip))
})

It("records LastRetryMode=retry when annotation value is \"retry\"", func() {
seedFailedHistory()
addRetryAnnotation("retry")

Expect(reconciler.handleRetryAnnotation(ctx, rollout, testLogger())).To(Succeed())

fetched := &rolloutv1alpha1.Rollout{}
Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed())
Expect(fetched.Status.History[0].LastRetryMode).To(Equal(rolloutv1alpha1.RetryModeRetry))
})

It("removes annotation but does not reset when BakeStatus is InProgress (double-retry guard)", func() {
Expand Down Expand Up @@ -453,4 +430,143 @@ var _ = Describe("Rollout errorCutoff with retry", func() {
Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed())
Expect(*fetched.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusFailed))
})

It("deploy-timeout deadline measured from retry time, not original deploy time", func() {
// Deployed 15 min ago, retried 2 min ago, DeployTimeout = 10 min.
// errorCutoff = retryTime (2 min ago). Deadline = retryTime + 10 min = now + 8 min.
// now.After(now + 8 min) = false → timeout must NOT fire.
// Without the fix (using deployTime): deadline = deployTime + 10 min = now - 5 min.
// now.After(now - 5 min) = true → rollout would be immediately marked Failed.
retryAt := fakeClock.Now().Add(-2 * time.Minute)
deployAt := fakeClock.Now().Add(-15 * time.Minute)
Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed())
retryTs := metav1.NewTime(retryAt)
deploying := rolloutv1alpha1.BakeStatusDeploying
rollout.Status.History = []rolloutv1alpha1.DeploymentHistoryEntry{{
ID: k8sptr.To[int64](1),
Version: rolloutv1alpha1.VersionInfo{Tag: "1.0.0"},
Timestamp: metav1.NewTime(deployAt),
BakeStatus: &deploying,
LastRetryTimestamp: &retryTs,
// BakeStartTime nil → deploy-timeout check is active
}}
Expect(k8sClient.Status().Update(ctx, rollout)).To(Succeed())

// Patch DeployTimeout = 10 min onto the rollout spec.
Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed())
rollout.Spec.DeployTimeout = &metav1.Duration{Duration: 10 * time.Minute}
Expect(k8sClient.Update(ctx, rollout)).To(Succeed())

// Seed an unhealthy health check so bake cannot start.
seedHealthCheck(retryAt.Add(-5 * time.Minute))

_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
Expect(err).NotTo(HaveOccurred())

fetched := &rolloutv1alpha1.Rollout{}
Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed())
Expect(*fetched.Status.History[0].BakeStatus).NotTo(Equal(rolloutv1alpha1.BakeStatusFailed),
"deploy-timeout must be measured from errorCutoff (retry time), not original deploy time")
})

It("canStartBake requires LastChangeTime after retry time, not just deploy time", func() {
// Deploy 15 min ago, retry 5 min ago.
// Health check LastChangeTime = 7 min ago — AFTER deployTime but BEFORE retryTime.
// With errorCutoff: LastChangeTime.Before(now-5m) = true → canStartBake = false → bake blocked.
// Without fix (using deployTime): LastChangeTime.Before(now-15m) = false → bake would start.
retryAt := fakeClock.Now().Add(-5 * time.Minute)
deployAt := fakeClock.Now().Add(-15 * time.Minute)

Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed())
retryTs := metav1.NewTime(retryAt)
deploying := rolloutv1alpha1.BakeStatusDeploying
rollout.Status.History = []rolloutv1alpha1.DeploymentHistoryEntry{{
ID: k8sptr.To[int64](1),
Version: rolloutv1alpha1.VersionInfo{Tag: "1.0.0"},
Timestamp: metav1.NewTime(deployAt),
BakeStatus: &deploying,
LastRetryTimestamp: &retryTs,
}}
Expect(k8sClient.Status().Update(ctx, rollout)).To(Succeed())

// Healthy health check, but LastChangeTime is between deployTime and retryTime.
hc := &rolloutv1alpha1.HealthCheck{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "hc-",
Namespace: namespace,
Labels: map[string]string{"suite": "errcutoff"},
},
Spec: rolloutv1alpha1.HealthCheckSpec{},
}
Expect(k8sClient.Create(ctx, hc)).To(Succeed())
changeTime := metav1.NewTime(fakeClock.Now().Add(-7 * time.Minute))
hc.Status = rolloutv1alpha1.HealthCheckStatus{
Status: rolloutv1alpha1.HealthStatusHealthy,
LastChangeTime: &changeTime,
}
Expect(k8sClient.Status().Update(ctx, hc)).To(Succeed())

_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
Expect(err).NotTo(HaveOccurred())

fetched := &rolloutv1alpha1.Rollout{}
Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed())
Expect(fetched.Status.History[0].BakeStartTime).To(BeNil(),
"bake must not start when health check LastChangeTime is before the retry time")
})

It("FailedHealthChecks excludes health checks whose LastChangeTime predates the retry", func() {
// Deploy 15 min ago, retry 2 min ago, DeployTimeout = 30s (fires immediately since
// now > retryTime + 30s). Health check is Healthy but LastChangeTime = 7 min ago
// (before retryTime). With errorCutoff the record must include this health check
// as stale-relative-to-retry; without the fix (using deployTime) it would be
// excluded because LastChangeTime > deployTime looks "fresh enough".
retryAt := fakeClock.Now().Add(-2 * time.Minute)
deployAt := fakeClock.Now().Add(-15 * time.Minute)

Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed())
retryTs := metav1.NewTime(retryAt)
deploying := rolloutv1alpha1.BakeStatusDeploying
rollout.Status.History = []rolloutv1alpha1.DeploymentHistoryEntry{{
ID: k8sptr.To[int64](1),
Version: rolloutv1alpha1.VersionInfo{Tag: "1.0.0"},
Timestamp: metav1.NewTime(deployAt),
BakeStatus: &deploying,
LastRetryTimestamp: &retryTs,
}}
Expect(k8sClient.Status().Update(ctx, rollout)).To(Succeed())

// DeployTimeout = 30s → deadline = retryTime + 30s = now - 90s → fires.
Expect(k8sClient.Get(ctx, key, rollout)).To(Succeed())
rollout.Spec.DeployTimeout = &metav1.Duration{Duration: 30 * time.Second}
Expect(k8sClient.Update(ctx, rollout)).To(Succeed())

// Healthy health check whose LastChangeTime is between deployTime and retryTime.
hc := &rolloutv1alpha1.HealthCheck{
ObjectMeta: metav1.ObjectMeta{
Name: "stale-hc",
Namespace: namespace,
Labels: map[string]string{"suite": "errcutoff"},
},
Spec: rolloutv1alpha1.HealthCheckSpec{},
}
Expect(k8sClient.Create(ctx, hc)).To(Succeed())
changeTime := metav1.NewTime(fakeClock.Now().Add(-7 * time.Minute))
hc.Status = rolloutv1alpha1.HealthCheckStatus{
Status: rolloutv1alpha1.HealthStatusHealthy,
LastChangeTime: &changeTime,
}
Expect(k8sClient.Status().Update(ctx, hc)).To(Succeed())

_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
Expect(err).NotTo(HaveOccurred())

fetched := &rolloutv1alpha1.Rollout{}
Expect(k8sClient.Get(ctx, key, fetched)).To(Succeed())
Expect(*fetched.Status.History[0].BakeStatus).To(Equal(rolloutv1alpha1.BakeStatusFailed),
"deploy-timeout should have fired")
Expect(fetched.Status.History[0].FailedHealthChecks).To(ContainElement(
HaveField("Name", "stale-hc"),
), "health check with pre-retry LastChangeTime must appear in FailedHealthChecks")
})
})
Loading
Loading