Skip to content

Commit 59ebfbb

Browse files
bdchathamclaude
andauthored
fix: pre-NodeUpdate cleanup — metrics, deletion comment, drift-gated plans (#93)
Three cleanup items identified during the comprehensive architecture review: 1. Fix controllerName() in planner metrics — GVK is stripped by controller-runtime on fetched objects, so all metric labels were "unknown". Now uses a type switch on the known resource types. 2. Document the deletion path's separate Status().Patch() call — it's an intentional exception to the single-patch model, not an oversight. 3. Running-phase plans are now drift-gated — buildRunningPlan returns nil when spec.image == status.currentImage (no drift). This eliminates unnecessary plan creation/flush/execution every 30s for idle Running nodes, and establishes the extension point for NodeUpdate plans. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c771542 commit 59ebfbb

3 files changed

Lines changed: 24 additions & 12 deletions

File tree

internal/controller/node/controller.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,8 @@ func (r *SeiNodeReconciler) handleNodeDeletion(ctx context.Context, node *seiv1a
216216
return ctrl.Result{}, nil
217217
}
218218

219+
// Deletion path: separate patch is intentional — this runs before the
220+
// main reconcile flow and must set Terminating before cleaning up resources.
219221
patch := client.MergeFromWithOptions(node.DeepCopy(), client.MergeFromWithOptimisticLock{})
220222
node.Status.Phase = seiv1alpha1.PhaseTerminating
221223
if err := r.Status().Patch(ctx, node, patch); err != nil {

internal/planner/metrics.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
package planner
22

33
import (
4-
"strings"
5-
64
"github.com/prometheus/client_golang/prometheus"
75
"sigs.k8s.io/controller-runtime/pkg/client"
86
"sigs.k8s.io/controller-runtime/pkg/metrics"
7+
8+
seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1"
99
)
1010

1111
var (
@@ -42,13 +42,17 @@ func init() {
4242
)
4343
}
4444

45-
// controllerName derives a lowercase controller label from the object's GVK Kind.
45+
// controllerName returns the controller label for metrics. Uses a type switch
46+
// because controller-runtime strips GVK from objects fetched via client.Get.
4647
func controllerName(obj client.Object) string {
47-
kind := obj.GetObjectKind().GroupVersionKind().Kind
48-
if kind == "" {
48+
switch obj.(type) {
49+
case *seiv1alpha1.SeiNode:
50+
return "seinode"
51+
case *seiv1alpha1.SeiNodeDeployment:
52+
return "seinodedeployment"
53+
default:
4954
return "unknown"
5055
}
51-
return strings.ToLower(kind)
5256
}
5357

5458
// CleanupPlanMetrics removes the plan_active gauge for a deleted resource.

internal/planner/planner.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -443,14 +443,20 @@ func commonOverrides(node *seiv1alpha1.SeiNode) map[string]string {
443443
}
444444
}
445445

446-
// buildRunningPlan builds a convergence plan for a Running node.
447-
// It ensures the StatefulSet and Service match the current spec.
446+
// buildRunningPlan returns a plan for a Running node only when the controller
447+
// recognizes a scenario that requires action. Returns nil when the node is
448+
// in steady state (no drift detected).
448449
//
449-
// FailedPhase is deliberately empty: a convergence failure should not
450-
// transition the node out of Running. The executor still sets a PlanFailed
451-
// condition for observability, and the next reconcile will build a fresh
452-
// convergence plan to retry.
450+
// Currently detects image drift (spec.image != status.currentImage). This is
451+
// the extension point for future drift types (config changes, peer changes).
453452
func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) {
453+
if node.Spec.Image == node.Status.CurrentImage {
454+
return nil, nil
455+
}
456+
457+
// Image drift detected — build a convergence plan to update the StatefulSet.
458+
// FailedPhase is deliberately empty: a failure retries on the next reconcile
459+
// rather than transitioning the node out of Running.
454460
prog := []string{task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService}
455461

456462
planID := uuid.New().String()

0 commit comments

Comments
 (0)