Skip to content

Commit 09cd670

Browse files
bdchatham and claude authored
fix: re-read object in completePlan/failPlan to avoid stale resourceVersion (#82)
The plan executor patches status mid-reconcile (task/plan completion), bumping resourceVersion. completePlan/failPlan then tried to patch status using the statusBase from the start of Reconcile, which carried the old resourceVersion. This caused guaranteed optimistic lock conflicts, leaving deployments permanently stuck in Upgrading phase. Fix: completePlan and failPlan now re-read the object via r.Get before building their patch base, ensuring a fresh resourceVersion. The stale statusBase parameter is removed from both functions and drivePlan. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent dced6da commit 09cd670

3 files changed

Lines changed: 27 additions & 12 deletions

File tree

internal/controller/nodedeployment/nodes.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ func (r *SeiNodeDeploymentReconciler) detectGenesisCeremonyNeeded(group *seiv1al
8181
// by comparing the current template hash against the stored hash. Only
8282
// fields that require new nodes (image, entrypoint, chainId) are hashed;
8383
// sidecar, overrides, and replica changes propagate in-place.
84+
// TODO: guard against empty incumbentNodes — if populateIncumbentNodes found
85+
// zero nodes (e.g. missing owner references), a rollout with empty node lists
86+
// creates a plan where all tasks complete as no-ops. Should return early here
87+
// when len(group.Status.IncumbentNodes) == 0.
8488
func (r *SeiNodeDeploymentReconciler) detectDeploymentNeeded(group *seiv1alpha1.SeiNodeDeployment) {
8589
if group.Status.TemplateHash == "" {
8690
return // first reconcile, no baseline to compare against

internal/controller/nodedeployment/plan.go

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import (
2121
func (r *SeiNodeDeploymentReconciler) reconcilePlan(ctx context.Context, group *seiv1alpha1.SeiNodeDeployment, statusBase client.Patch) (ctrl.Result, error) {
2222
// Drive active plan.
2323
if group.Status.Plan != nil && group.Status.Plan.Phase == seiv1alpha1.TaskPlanActive {
24-
return r.drivePlan(ctx, group, statusBase)
24+
return r.drivePlan(ctx, group)
2525
}
2626

2727
// No active plan — ask the planner if one is needed.
@@ -38,17 +38,17 @@ func (r *SeiNodeDeploymentReconciler) reconcilePlan(ctx context.Context, group *
3838
return r.startPlan(ctx, group, statusBase, plan)
3939
}
4040

41-
func (r *SeiNodeDeploymentReconciler) drivePlan(ctx context.Context, group *seiv1alpha1.SeiNodeDeployment, statusBase client.Patch) (ctrl.Result, error) {
41+
func (r *SeiNodeDeploymentReconciler) drivePlan(ctx context.Context, group *seiv1alpha1.SeiNodeDeployment) (ctrl.Result, error) {
4242
result, err := r.PlanExecutor.ExecutePlan(ctx, group, group.Status.Plan)
4343
if err != nil {
4444
return result, err
4545
}
4646

4747
switch group.Status.Plan.Phase {
4848
case seiv1alpha1.TaskPlanComplete:
49-
return r.completePlan(ctx, group, statusBase)
49+
return r.completePlan(ctx, group)
5050
case seiv1alpha1.TaskPlanFailed:
51-
return r.failPlan(ctx, group, statusBase)
51+
return r.failPlan(ctx, group)
5252
}
5353

5454
return result, nil
@@ -70,9 +70,17 @@ func (r *SeiNodeDeploymentReconciler) startPlan(ctx context.Context, group *seiv
7070
return planner.ResultRequeueImmediate, nil
7171
}
7272

73-
func (r *SeiNodeDeploymentReconciler) completePlan(ctx context.Context, group *seiv1alpha1.SeiNodeDeployment, statusBase client.Patch) (ctrl.Result, error) {
73+
func (r *SeiNodeDeploymentReconciler) completePlan(ctx context.Context, group *seiv1alpha1.SeiNodeDeployment) (ctrl.Result, error) {
7474
logger := log.FromContext(ctx)
7575

76+
// Re-read to get a fresh resourceVersion. The plan executor patches
77+
// status mid-reconcile (task/plan completion), which invalidates the
78+
// statusBase computed at the start of Reconcile.
79+
if err := r.Get(ctx, client.ObjectKeyFromObject(group), group); err != nil {
80+
return ctrl.Result{}, fmt.Errorf("re-reading group for completePlan: %w", err)
81+
}
82+
status := client.MergeFromWithOptions(group.DeepCopy(), client.MergeFromWithOptimisticLock{})
83+
7684
isDeploymentPlan := group.Status.Rollout != nil
7785

7886
if isDeploymentPlan {
@@ -97,15 +105,20 @@ func (r *SeiNodeDeploymentReconciler) completePlan(ctx context.Context, group *s
97105
r.Recorder.Event(group, corev1.EventTypeNormal, "PlanComplete", "Plan completed successfully")
98106
logger.Info("plan completed")
99107

100-
if err := r.updateStatus(ctx, group, statusBase); err != nil {
108+
if err := r.updateStatus(ctx, group, status); err != nil {
101109
return ctrl.Result{}, fmt.Errorf("updating status after plan completion: %w", err)
102110
}
103111
return planner.ResultRequeueImmediate, nil
104112
}
105113

106-
func (r *SeiNodeDeploymentReconciler) failPlan(ctx context.Context, group *seiv1alpha1.SeiNodeDeployment, statusBase client.Patch) (ctrl.Result, error) {
114+
func (r *SeiNodeDeploymentReconciler) failPlan(ctx context.Context, group *seiv1alpha1.SeiNodeDeployment) (ctrl.Result, error) {
107115
logger := log.FromContext(ctx)
108116

117+
if err := r.Get(ctx, client.ObjectKeyFromObject(group), group); err != nil {
118+
return ctrl.Result{}, fmt.Errorf("re-reading group for failPlan: %w", err)
119+
}
120+
status := client.MergeFromWithOptions(group.DeepCopy(), client.MergeFromWithOptimisticLock{})
121+
109122
group.Status.Phase = seiv1alpha1.GroupPhaseDegraded
110123
group.Status.Plan = nil
111124
if group.Status.Rollout != nil {
@@ -118,7 +131,7 @@ func (r *SeiNodeDeploymentReconciler) failPlan(ctx context.Context, group *seiv1
118131
r.Recorder.Event(group, corev1.EventTypeWarning, "PlanFailed", "Plan failed")
119132
logger.Info("plan failed")
120133

121-
if err := r.updateStatus(ctx, group, statusBase); err != nil {
134+
if err := r.updateStatus(ctx, group, status); err != nil {
122135
return ctrl.Result{}, fmt.Errorf("updating status after plan failure: %w", err)
123136
}
124137
return planner.ResultRequeueImmediate, nil

internal/controller/nodedeployment/plan_test.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,7 @@ func TestCompletePlan_ClearsRolloutInProgress(t *testing.T) {
7878

7979
r, c := newPlanTestReconciler(t, group, childNode)
8080

81-
statusBase := client.MergeFromWithOptions(group.DeepCopy(), client.MergeFromWithOptimisticLock{})
82-
_, err := r.completePlan(ctx, group, statusBase)
81+
_, err := r.completePlan(ctx, group)
8382
g.Expect(err).NotTo(HaveOccurred())
8483

8584
fetched := &seiv1alpha1.SeiNodeDeployment{}
@@ -148,8 +147,7 @@ func TestFailPlan_ClearsRolloutInProgress(t *testing.T) {
148147

149148
r, c := newPlanTestReconciler(t, group, childRunning, childFailed, childFailed2)
150149

151-
statusBase := client.MergeFromWithOptions(group.DeepCopy(), client.MergeFromWithOptimisticLock{})
152-
_, err := r.failPlan(ctx, group, statusBase)
150+
_, err := r.failPlan(ctx, group)
153151
g.Expect(err).NotTo(HaveOccurred())
154152

155153
fetched := &seiv1alpha1.SeiNodeDeployment{}

0 commit comments

Comments (0)