
Commit c771542

bdchatham and claude authored
refactor: planner owns condition management, not executor (#92)
Move condition ownership from the executor to the planner. The executor now only mutates plan/task state and phase transitions. The planner sets conditions when creating plans and when observing terminal plans on the next reconcile. - Remove setPlanFailedCondition from executor - Remove ConditionPlanFailed constant from executor - Update failTask doc to clarify planner handles conditions - Update planner doc.go with condition ownership section - Update CLAUDE.md Key Patterns with condition ownership, atomic plan creation, and single-patch model documentation - Update test to assert plan failure details instead of condition Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 31df0e8 commit c771542

4 files changed: 24 additions & 44 deletions

CLAUDE.md

Lines changed: 3 additions & 0 deletions
@@ -58,6 +58,9 @@ make docker-push IMG=<image> # Push container image
 - **Plan-driven reconciliation** — Both controllers use ordered task plans (stored in `.status.plan`) to drive lifecycle. Plans are built by `internal/planner/` (`ResolvePlan` for nodes, `ForGroup` for deployments), executed by `planner.Executor`, with individual tasks in `internal/task/`. The reconcile loop is: `ResolvePlan → persist plan → ExecutePlan`. See `internal/planner/doc.go` for the full plan lifecycle.
 - **Init plans** transition nodes from Pending → Running. They include infrastructure tasks (`ensure-data-pvc`, `apply-statefulset`, `apply-service`) followed by sidecar tasks (`configure-genesis`, `config-apply`, etc.).
 - **Convergence plans** keep Running nodes in sync. They contain only `apply-statefulset` + `apply-service` and are nilled from status after completion.
+- **Atomic plan creation** — New plans are persisted before any tasks execute. The reconciler flushes the plan, then requeues. Execution starts on the next reconcile. This guarantees external observers see the plan before side effects occur.
+- **Condition ownership** — The planner owns all condition management on the owning resource. It sets conditions when creating plans (e.g., `NodeUpdateInProgress=True`) and when observing terminal plans (e.g., `NodeUpdateInProgress=False`). The executor does not set conditions — it only mutates plan/task state and phase transitions.
+- **Single-patch model** — All status mutations (plan state, conditions, phase, currentImage) accumulate in-memory during a reconcile and are flushed in a single `Status().Patch()` at the end. Tasks mutate owned resources (StatefulSets, Services, PVCs); the executor mutates plan state in-memory; the reconciler flushes once.
 - **Resource generators** live in `internal/noderesource/` — pure functions that produce StatefulSets, Services, and PVCs from a SeiNode spec. Used by both the controller and plan tasks.
 - **Platform config** is fully environment-driven — all fields in `platform.Config` must be set via env vars (no defaults). See `internal/platform/platform.go` for the full list.
 - **Genesis resolution** is handled by the sidecar autonomously: embedded sei-config for well-known chains, S3 fallback at `{SEI_GENESIS_BUCKET}/{chainID}/genesis.json` for custom chains.
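The single-patch model described above can be sketched as follows. This is a minimal illustration with hypothetical simplified types (`NodeStatus`, `TaskPlan`, `Condition` are stand-ins for the real `seiv1alpha1` API types), and `flush` stands in for the single `Status().Patch()` call; it is not the operator's actual implementation.

```go
package main

import "fmt"

// Hypothetical, simplified stand-ins for the SeiNode status types.
type Condition struct {
	Type, Status string
}

type TaskPlan struct {
	Phase string
}

type NodeStatus struct {
	Plan       *TaskPlan
	Conditions []Condition
	Phase      string
}

// reconcile accumulates every status mutation in-memory, then flushes once.
// flush stands in for a single client.Status().Patch() call at the end.
func reconcile(status *NodeStatus, flush func(*NodeStatus)) {
	// Planner mutates plan and conditions in-memory.
	status.Plan = &TaskPlan{Phase: "Running"}
	status.Conditions = append(status.Conditions,
		Condition{Type: "NodeUpdateInProgress", Status: "True"})
	// Executor mutates plan state and phase in-memory.
	status.Plan.Phase = "Complete"
	status.Phase = "Running"
	// Reconciler flushes everything in one patch.
	flush(status)
}

func main() {
	patches := 0
	st := &NodeStatus{}
	reconcile(st, func(s *NodeStatus) { patches++ })
	fmt.Println(patches, st.Plan.Phase) // prints: 1 Complete
}
```

Whatever the number of in-memory mutations, external observers see exactly one status write per reconcile.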

internal/controller/node/plan_execution_test.go

Lines changed: 2 additions & 12 deletions
@@ -732,18 +732,8 @@ func TestExecutePlan_TaskFailure_SetsPlanFailedCondition(t *testing.T) {
 	g.Expect(err).NotTo(HaveOccurred())
 
 	g.Expect(node.Status.Plan.Phase).To(Equal(seiv1alpha1.TaskPlanFailed))
-
-	// Verify PlanFailed condition was set.
-	var found bool
-	for _, cond := range node.Status.Conditions {
-		if cond.Type == planner.ConditionPlanFailed {
-			g.Expect(cond.Status).To(Equal(metav1.ConditionTrue))
-			g.Expect(cond.Reason).To(Equal("TaskFailed"))
-			g.Expect(cond.Message).To(ContainSubstring("boom"))
-			found = true
-		}
-	}
-	g.Expect(found).To(BeTrue(), "expected PlanFailed condition on node")
+	g.Expect(node.Status.Plan.FailedTaskDetail).NotTo(BeNil())
+	g.Expect(node.Status.Plan.FailedTaskDetail.Error).To(ContainSubstring("boom"))
 }
 
 // --- Nil sidecar client handling ---
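The updated assertions check the plan's own failure record rather than scanning conditions. A minimal sketch of that check, using hypothetical simplified types in place of the real `seiv1alpha1.TaskPlan` and without the Gomega matchers the actual test uses:

```go
package main

import (
	"fmt"
	"strings"
)

// Hypothetical simplified stand-ins for the plan types the test asserts on.
type FailedTaskDetail struct {
	Error string
}

type TaskPlan struct {
	Phase            string
	FailedTaskDetail *FailedTaskDetail
}

// checkFailure mirrors the updated test's shape: the plan itself carries the
// failure detail, so no condition lookup on the node is needed.
func checkFailure(plan *TaskPlan) error {
	if plan.Phase != "Failed" {
		return fmt.Errorf("expected Failed phase, got %q", plan.Phase)
	}
	if plan.FailedTaskDetail == nil {
		return fmt.Errorf("expected FailedTaskDetail to be set")
	}
	if !strings.Contains(plan.FailedTaskDetail.Error, "boom") {
		return fmt.Errorf("expected error to mention the injected failure")
	}
	return nil
}

func main() {
	plan := &TaskPlan{
		Phase:            "Failed",
		FailedTaskDetail: &FailedTaskDetail{Error: "task apply-statefulset: boom"},
	}
	fmt.Println(checkFailure(plan)) // prints: <nil>
}
```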

internal/planner/doc.go

Lines changed: 17 additions & 9 deletions
@@ -8,15 +8,23 @@
 //
 // 1. Build: ResolvePlan (for nodes) or ForGroup (for deployments) inspects the
 //    resource's current phase and spec, then builds an appropriate plan.
-// 2. Persist: The controller patches the plan into the resource's status.
-// 3. Execute: Executor.ExecutePlan drives one task per reconcile, patching
-//    status after each task completes.
-// 4. Complete: When all tasks finish, the executor sets TargetPhase on the
-//    resource and marks the plan Complete. Convergence plans (where the
-//    target phase equals the current phase) are nilled out to avoid stale
-//    data in etcd.
-// 5. Fail: If a task fails terminally, the executor sets FailedPhase and a
-//    PlanFailed condition on the resource for operator observability.
+// 2. Persist: The controller flushes the plan into the resource's status.
+//    Execution does not start until the plan is persisted (atomic creation).
+// 3. Execute: Executor.ExecutePlan drives tasks in-memory. Synchronous tasks
+//    advance in a loop; async tasks return Running for the next reconcile.
+//    The controller flushes all mutations in a single status patch.
+// 4. Complete: When all tasks finish, the executor marks the plan Complete and
+//    sets TargetPhase. On the next reconcile, the planner observes the terminal
+//    plan, updates conditions, and builds the next plan if needed.
+// 5. Fail: If a task fails terminally, the executor marks the plan Failed and
+//    sets FailedPhase. On the next reconcile, the planner observes the failure,
+//    updates conditions, and decides whether to retry.
+//
+// # Condition Ownership
+//
+// The planner owns all condition management on the owning resource. It sets
+// conditions when creating plans and when observing terminal plans. The executor
+// does not set conditions — it only mutates plan/task state and phase transitions.
 //
 // # Plan Types
 //
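Steps 4 and 5 of the lifecycle — the planner reacting to a terminal plan on the next reconcile — can be sketched as below. The types and the `NodeUpdateInProgress`/reason values here are hypothetical simplified stand-ins (the real planner works on `seiv1alpha1` types and would use `meta.SetStatusCondition` on `metav1.Condition` values); this only illustrates the ownership split.

```go
package main

import "fmt"

// Hypothetical simplified stand-ins for the node and condition types.
type Condition struct {
	Type, Status, Reason string
}

type TaskPlan struct{ Phase string }

type Node struct {
	Plan       *TaskPlan
	Conditions []Condition
}

// observeTerminalPlan sketches the planner's role: on the reconcile after a
// plan reaches a terminal phase, the planner (never the executor) updates
// conditions on the owning resource.
func observeTerminalPlan(n *Node) {
	if n.Plan == nil {
		return
	}
	switch n.Plan.Phase {
	case "Complete":
		setCondition(n, Condition{"NodeUpdateInProgress", "False", "PlanComplete"})
	case "Failed":
		setCondition(n, Condition{"NodeUpdateInProgress", "False", "PlanFailed"})
	}
}

// setCondition upserts a condition by type, mimicking meta.SetStatusCondition.
func setCondition(n *Node, c Condition) {
	for i := range n.Conditions {
		if n.Conditions[i].Type == c.Type {
			n.Conditions[i] = c
			return
		}
	}
	n.Conditions = append(n.Conditions, c)
}

func main() {
	n := &Node{
		Plan:       &TaskPlan{Phase: "Failed"},
		Conditions: []Condition{{"NodeUpdateInProgress", "True", "PlanCreated"}},
	}
	observeTerminalPlan(n)
	fmt.Println(n.Conditions[0].Status, n.Conditions[0].Reason) // prints: False PlanFailed
}
```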

internal/planner/executor.go

Lines changed: 2 additions & 23 deletions
@@ -6,7 +6,6 @@ import (
 	"fmt"
 	"time"
 
-	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -19,9 +18,6 @@ import (
 const (
 	TaskPollInterval = 5 * time.Second
 	maxRetryBackoff  = 30 * time.Second
-
-	// ConditionPlanFailed is set on the node when a plan fails terminally.
-	ConditionPlanFailed = "PlanFailed"
 )
 
 // ResultRequeueImmediate requests an immediate re-enqueue with a minimal
@@ -197,8 +193,8 @@ func advanceTask(
 }
 
 // failTask marks an individual task and the overall plan as Failed.
-// Sets FailedPhase on the node and a PlanFailed condition. All mutations
-// are in-memory.
+// Sets FailedPhase on the node. The planner handles condition updates
+// when it observes the failed plan on the next reconcile.
 func failTask(
 	ctx context.Context,
 	obj client.Object,
@@ -215,7 +211,6 @@ func failTask(
 	t.Error = errMsg
 	plan.Phase = seiv1alpha1.TaskPlanFailed
 	setTargetPhase(obj, plan.FailedPhase)
-	setPlanFailedCondition(obj, plan, t, errMsg)
 	for i := range plan.Tasks {
 		if plan.Tasks[i].ID == t.ID {
 			plan.FailedTaskIndex = &i
@@ -251,22 +246,6 @@ func setTargetPhase(obj client.Object, phase seiv1alpha1.SeiNodePhase) {
 	}
 }
 
-// setPlanFailedCondition sets a PlanFailed condition on the node with
-// details about which task failed and why.
-func setPlanFailedCondition(obj client.Object, plan *seiv1alpha1.TaskPlan, t *seiv1alpha1.PlannedTask, errMsg string) {
-	node, ok := obj.(*seiv1alpha1.SeiNode)
-	if !ok {
-		return
-	}
-	meta.SetStatusCondition(&node.Status.Conditions, metav1.Condition{
-		Type:               ConditionPlanFailed,
-		Status:             metav1.ConditionTrue,
-		Reason:             "TaskFailed",
-		Message:            fmt.Sprintf("plan %s failed at task %s: %s", plan.ID, t.Type, errMsg),
-		ObservedGeneration: node.Generation,
-	})
-}
-
 // currentPhase returns the SeiNode phase, or empty for non-SeiNode objects.
 func currentPhase(obj client.Object) seiv1alpha1.SeiNodePhase {
 	if node, ok := obj.(*seiv1alpha1.SeiNode); ok {
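After this change, the executor's failure path only records plan state. A minimal sketch of that narrowed responsibility, with hypothetical simplified types in place of `seiv1alpha1.TaskPlan` and `client.Object` (the real `failTask` also sets the target phase on the node object):

```go
package main

import "fmt"

// Hypothetical simplified stand-ins for the plan and task types.
type Task struct {
	ID, State, Error string
}

type TaskPlan struct {
	Tasks           []Task
	Phase           string
	FailedTaskIndex *int
}

// failTask marks the task and the plan Failed entirely in-memory. It sets no
// conditions: the reconciler flushes this mutation, and the planner reacts to
// the failed plan (including condition updates) on the next reconcile.
func failTask(plan *TaskPlan, id, errMsg string) {
	for i := range plan.Tasks {
		if plan.Tasks[i].ID == id {
			plan.Tasks[i].State = "Failed"
			plan.Tasks[i].Error = errMsg
			idx := i // copy so the stored pointer is stable
			plan.FailedTaskIndex = &idx
		}
	}
	plan.Phase = "Failed"
}

func main() {
	plan := &TaskPlan{Tasks: []Task{{ID: "apply-statefulset"}}}
	failTask(plan, "apply-statefulset", "boom")
	fmt.Println(plan.Phase, *plan.FailedTaskIndex, plan.Tasks[0].Error) // prints: Failed 0 boom
}
```

Keeping the executor condition-free means every condition a user sees traces back to one owner, the planner, which simplifies reasoning about stale or conflicting conditions.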
