Skip to content

Commit 31df0e8

Browse files
bdchathamclaude
andauthored
fix: separate plan creation from execution into atomic steps (#91)
A new plan is now persisted before any tasks execute. This prevents a crash-safety issue where side effects (e.g., StatefulSet updates) could occur before the plan and its conditions are flushed to the API server. Before: ResolvePlan builds a plan and ExecutePlan runs tasks in the same reconcile. A crash after task execution but before the flush loses the plan and any conditions — external observers never see the update. After: When ResolvePlan creates a new plan, the reconciler flushes it immediately and requeues. Execution starts on the next reconcile, after the plan is persisted. This guarantees external controllers (e.g., SeiNodeDeployment) see the plan and conditions before any side effects. The nodedeployment controller already followed this pattern (startPlan returns ResultRequeueImmediate without executing). This change aligns the node controller to the same atomic guarantee. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 86fc035 commit 31df0e8

2 files changed

Lines changed: 22 additions & 17 deletions

File tree

internal/controller/node/controller.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,18 +105,18 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
105105
return ctrl.Result{}, fmt.Errorf("resolving plan: %w", err)
106106
}
107107

108-
// Execute the plan. The executor advances tasks in-memory — synchronous
109-
// tasks complete in a loop, async tasks return Running with a poll interval.
110108
var result ctrl.Result
111109
var execErr error
112-
if node.Status.Plan != nil && node.Status.Plan.Phase == seiv1alpha1.TaskPlanActive {
113-
result, execErr = r.PlanExecutor.ExecutePlan(ctx, node, node.Status.Plan)
114-
statusDirty = true
115-
}
116110

117-
// If ResolvePlan built a new plan (but there's nothing to execute yet
118-
// because it was just created this reconcile), mark dirty so it gets persisted.
119111
if !planAlreadyActive && node.Status.Plan != nil {
112+
// New plan — persist it before executing. Execution starts on the
113+
// next reconcile. This guarantees external observers see the plan
114+
// (and any conditions) before side effects occur.
115+
statusDirty = true
116+
result = planner.ResultRequeueImmediate
117+
} else if node.Status.Plan != nil && node.Status.Plan.Phase == seiv1alpha1.TaskPlanActive {
118+
// Existing plan — execute tasks in-memory.
119+
result, execErr = r.PlanExecutor.ExecutePlan(ctx, node, node.Status.Plan)
120120
statusDirty = true
121121
}
122122

internal/controller/node/plan_execution_integration_test.go

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,17 @@ func TestIntegrationFullProgressionSnapshotMode(t *testing.T) {
7777
return n
7878
}
7979

80-
// First Reconcile: builds plan, transitions to Initializing, completes
81-
// all synchronous infrastructure tasks, and submits the first sidecar task.
80+
// First Reconcile: builds and persists the plan.
8281
reconcileOnce(t, g, r, node.Name, node.Namespace)
8382
node = fetch()
8483
g.Expect(node.Status.Plan).NotTo(BeNil())
8584
g.Expect(node.Status.Phase).To(Equal(seiv1alpha1.PhaseInitializing))
8685

87-
// Drive sidecar tasks (infrastructure tasks already completed in first reconcile).
86+
// Second Reconcile: executes all synchronous infrastructure tasks
87+
// and submits the first sidecar task.
88+
reconcileOnce(t, g, r, node.Name, node.Namespace)
89+
90+
// Drive sidecar tasks.
8891
driveTask(t, g, r, mock, fetch, planner.TaskSnapshotRestore)
8992
driveTask(t, g, r, mock, fetch, planner.TaskConfigureGenesis)
9093
driveTask(t, g, r, mock, fetch, planner.TaskConfigApply)
@@ -111,13 +114,15 @@ func TestIntegrationFullProgressionGenesisMode(t *testing.T) {
111114
return n
112115
}
113116

114-
// First Reconcile: builds plan, transitions to Initializing, completes
115-
// all synchronous infrastructure tasks, and submits the first sidecar task.
117+
// First Reconcile: builds and persists the plan.
116118
reconcileOnce(t, g, r, node.Name, node.Namespace)
117119
node = fetch()
118120
g.Expect(node.Status.Plan).NotTo(BeNil())
119121

120-
// Drive sidecar tasks (infrastructure tasks already completed in first reconcile).
122+
// Second Reconcile: executes infrastructure tasks, submits first sidecar task.
123+
reconcileOnce(t, g, r, node.Name, node.Namespace)
124+
125+
// Drive sidecar tasks.
121126
driveTask(t, g, r, mock, fetch, planner.TaskConfigureGenesis)
122127
// Completing config-apply also chain-completes the trailing fire-and-forget
123128
// tasks (config-validate, mark-ready) and the plan, transitioning to Running.
@@ -141,13 +146,13 @@ func TestIntegrationTaskFailure_FailsPlan(t *testing.T) {
141146
return n
142147
}
143148

144-
// First Reconcile: builds plan, transitions to Initializing, completes
145-
// all synchronous infrastructure tasks, and submits the first sidecar task.
149+
// First Reconcile: builds and persists the plan.
146150
reconcileOnce(t, g, r, node.Name, node.Namespace)
147151
node = fetch()
148152
g.Expect(node.Status.Plan).NotTo(BeNil())
149153

150-
// Drive snapshot-restore: the first reconcile already submitted it.
154+
// Second Reconcile: executes infrastructure tasks, submits snapshot-restore.
155+
reconcileOnce(t, g, r, node.Name, node.Namespace)
151156
node = fetch()
152157
ct := planner.CurrentTask(node.Status.Plan)
153158
g.Expect(ct).NotTo(BeNil())

0 commit comments

Comments
 (0)