Skip to content

Commit 358a216

Browse files
bdchatham and claude authored
feat: remove monitor task system (#89)
Remove the entire MonitorTask subsystem — no concrete use case remains. This simplifies the reconcile loop and removes the sidecar client dependency from the reconciler. Deleted: - monitor.go + monitor_test.go (ensureMonitorTasks, pollMonitorTasks) - plan_execution.go (buildSidecarClient, reconcileRuntimeTasks) - reconcileRunningTasks method from controller.go - BuildSidecarClientFn field from SeiNodeReconciler - MonitorTask type and MonitorTasks field from SeiNodeStatus CRD - SnapshotUploadMonitorTask, ResultExportMonitorTask from planner - TaskSnapshotUpload, TaskResultExport constants - Monitor task metrics (sidecarUnreachableTotal, monitorTaskCompletedTotal, monitorTaskStatus) and associated emit/cleanup functions - All monitor-related tests (~200 lines) Kept: - ResultExportConfig type and ReplayerSpec.ResultExport field (the config stays on the CRD; how we manage its lifecycle is TBD) - Platform config result-export fields - observeCurrentImage (still runs for Running nodes, will be moved into plans in a follow-up) The reconcile loop is now: ensureNodeFinalizer → reconcilePeers → ResolvePlan → ExecutePlan → emit metrics → observeCurrentImage (Running). Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 23bf638 commit 358a216

13 files changed

Lines changed: 9 additions & 1270 deletions

File tree

api/v1alpha1/seinode_types.go

Lines changed: 0 additions & 28 deletions
Original file line number · Diff line number · Diff line change
@@ -205,29 +205,6 @@ const (
205205
PhaseTerminating SeiNodePhase = "Terminating"
206206
)
207207

208-
// MonitorTask tracks a long-running sidecar task that the controller
209-
// actively polls for completion. Unlike fire-and-forget tasks,
210-
// completing a monitor task triggers a controller response (Event + Condition).
211-
// The map key in MonitorTasks serves as the task type identifier.
212-
type MonitorTask struct {
213-
// ID is the sidecar-assigned task UUID.
214-
ID string `json:"id"`
215-
216-
// Status tracks lifecycle: Pending → Complete or Failed.
217-
Status TaskStatus `json:"status"`
218-
219-
// SubmittedAt is the time the task was submitted to the sidecar.
220-
SubmittedAt metav1.Time `json:"submittedAt"`
221-
222-
// CompletedAt is the time the task reached a terminal state.
223-
// +optional
224-
CompletedAt *metav1.Time `json:"completedAt,omitempty"`
225-
226-
// Error is set when the task fails.
227-
// +optional
228-
Error string `json:"error,omitempty"`
229-
}
230-
231208
// SeiNodeStatus defines the observed state of a SeiNode.
232209
type SeiNodeStatus struct {
233210
// Phase is the high-level lifecycle state.
@@ -251,11 +228,6 @@ type SeiNodeStatus struct {
251228
// +optional
252229
Plan *TaskPlan `json:"plan,omitempty"`
253230

254-
// MonitorTasks tracks long-running sidecar tasks the controller polls
255-
// for completion. Keyed by task type for idempotent submission.
256-
// +optional
257-
MonitorTasks map[string]MonitorTask `json:"monitorTasks,omitempty"`
258-
259231
// ResolvedPeers is the current set of peer DNS hostnames discovered
260232
// from label-based peer sources. Reconciled continuously so that
261233
// future peer-update plans can detect drift.

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 27 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/sei.io_seinodes.yaml

Lines changed: 0 additions & 41 deletions
Original file line number · Diff line number · Diff line change
@@ -573,47 +573,6 @@ spec:
573573
ingress. Used by the planner to set p2p.external_address in CometBFT
574574
config so the node advertises a reachable address for gossip discovery.
575575
type: string
576-
monitorTasks:
577-
additionalProperties:
578-
description: |-
579-
MonitorTask tracks a long-running sidecar task that the controller
580-
actively polls for completion. Unlike fire-and-forget tasks,
581-
completing a monitor task triggers a controller response (Event + Condition).
582-
The map key in MonitorTasks serves as the task type identifier.
583-
properties:
584-
completedAt:
585-
description: CompletedAt is the time the task reached a terminal
586-
state.
587-
format: date-time
588-
type: string
589-
error:
590-
description: Error is set when the task fails.
591-
type: string
592-
id:
593-
description: ID is the sidecar-assigned task UUID.
594-
type: string
595-
status:
596-
description: 'Status tracks lifecycle: Pending → Complete or
597-
Failed.'
598-
enum:
599-
- Pending
600-
- Complete
601-
- Failed
602-
type: string
603-
submittedAt:
604-
description: SubmittedAt is the time the task was submitted
605-
to the sidecar.
606-
format: date-time
607-
type: string
608-
required:
609-
- id
610-
- status
611-
- submittedAt
612-
type: object
613-
description: |-
614-
MonitorTasks tracks long-running sidecar tasks the controller polls
615-
for completion. Keyed by task type for idempotent submission.
616-
type: object
617576
phase:
618577
description: Phase is the high-level lifecycle state.
619578
enum:

internal/controller/node/controller.go

Lines changed: 9 additions & 26 deletions
Original file line number · Diff line number · Diff line change
@@ -16,14 +16,12 @@ import (
1616
"sigs.k8s.io/controller-runtime/pkg/builder"
1717
"sigs.k8s.io/controller-runtime/pkg/client"
1818
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
19-
"sigs.k8s.io/controller-runtime/pkg/log"
2019
"sigs.k8s.io/controller-runtime/pkg/predicate"
2120

2221
seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1"
2322
"github.com/sei-protocol/sei-k8s-controller/internal/noderesource"
2423
"github.com/sei-protocol/sei-k8s-controller/internal/planner"
2524
"github.com/sei-protocol/sei-k8s-controller/internal/platform"
26-
"github.com/sei-protocol/sei-k8s-controller/internal/task"
2725
)
2826

2927
const (
@@ -39,11 +37,10 @@ type PlatformConfig = platform.Config
3937
// SeiNodeReconciler reconciles a SeiNode object.
4038
type SeiNodeReconciler struct {
4139
client.Client
42-
Scheme *runtime.Scheme
43-
Recorder record.EventRecorder
44-
Platform PlatformConfig
45-
PlanExecutor planner.PlanExecutor[*seiv1alpha1.SeiNode]
46-
BuildSidecarClientFn func(node *seiv1alpha1.SeiNode) task.SidecarClient
40+
Scheme *runtime.Scheme
41+
Recorder record.EventRecorder
42+
Platform PlatformConfig
43+
PlanExecutor planner.PlanExecutor[*seiv1alpha1.SeiNode]
4744
}
4845

4946
// +kubebuilder:rbac:groups=sei.io,resources=seinodes,verbs=get;list;watch;create;update;patch;delete
@@ -133,31 +130,17 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
133130
}
134131
}
135132

136-
// Running phase: after the convergence plan completes, handle
137-
// runtime tasks (image observation, monitor task polling).
133+
// Running phase: observe image convergence after plan completes.
138134
if node.Status.Phase == seiv1alpha1.PhaseRunning {
139-
return r.reconcileRunningTasks(ctx, node)
135+
if err := r.observeCurrentImage(ctx, node); err != nil {
136+
return ctrl.Result{}, fmt.Errorf("observing current image: %w", err)
137+
}
138+
return ctrl.Result{RequeueAfter: statusPollInterval}, nil
140139
}
141140

142141
return result, nil
143142
}
144143

145-
// reconcileRunningTasks handles Running-phase work that is outside the plan:
146-
// image observation and sidecar monitor task polling.
147-
func (r *SeiNodeReconciler) reconcileRunningTasks(ctx context.Context, node *seiv1alpha1.SeiNode) (ctrl.Result, error) {
148-
if err := r.observeCurrentImage(ctx, node); err != nil {
149-
return ctrl.Result{}, fmt.Errorf("observing current image: %w", err)
150-
}
151-
152-
sc := r.buildSidecarClient(node)
153-
if sc == nil {
154-
sidecarUnreachableTotal.WithLabelValues(node.Namespace, node.Name).Inc()
155-
log.FromContext(ctx).Info("sidecar not reachable, will retry")
156-
return ctrl.Result{RequeueAfter: statusPollInterval}, nil
157-
}
158-
return r.reconcileRuntimeTasks(ctx, node, sc)
159-
}
160-
161144
func (r *SeiNodeReconciler) observeCurrentImage(ctx context.Context, node *seiv1alpha1.SeiNode) error {
162145
sts := &appsv1.StatefulSet{}
163146
if err := r.Get(ctx, types.NamespacedName{Name: node.Name, Namespace: node.Namespace}, sts); err != nil {

internal/controller/node/metrics.go

Lines changed: 0 additions & 57 deletions
Original file line number · Diff line number · Diff line change
@@ -17,12 +17,6 @@ var allNodePhases = []string{
1717
string(seiv1alpha1.PhaseTerminating),
1818
}
1919

20-
var allTaskStatuses = []string{
21-
string(seiv1alpha1.TaskPending),
22-
string(seiv1alpha1.TaskComplete),
23-
string(seiv1alpha1.TaskFailed),
24-
}
25-
2620
var (
2721
nodePhaseGauge = prometheus.NewGaugeVec(
2822
prometheus.GaugeOpts{
@@ -56,30 +50,6 @@ var (
5650
},
5751
[]string{"namespace", "name"},
5852
)
59-
60-
sidecarUnreachableTotal = prometheus.NewCounterVec(
61-
prometheus.CounterOpts{
62-
Name: "sei_controller_sidecar_unreachable_total",
63-
Help: "Number of times the sidecar was unreachable",
64-
},
65-
[]string{"namespace", "node"},
66-
)
67-
68-
monitorTaskCompletedTotal = prometheus.NewCounterVec(
69-
prometheus.CounterOpts{
70-
Name: "sei_controller_monitor_task_completed_total",
71-
Help: "Monitor task terminal state transitions (DivergenceDetected, TaskFailed, TaskLost)",
72-
},
73-
[]string{"namespace", "node", "task_type", "reason"},
74-
)
75-
76-
monitorTaskStatus = prometheus.NewGaugeVec(
77-
prometheus.GaugeOpts{
78-
Name: "sei_controller_monitor_task_status",
79-
Help: "Current status of each monitor task (1=active, 0=inactive)",
80-
},
81-
[]string{"namespace", "node", "task_type", "status"},
82-
)
8353
)
8454

8555
func init() {
@@ -88,43 +58,16 @@ func init() {
8858
nodePhaseTransitions,
8959
nodeInitDuration,
9060
nodeLastInitDuration,
91-
sidecarUnreachableTotal,
92-
monitorTaskCompletedTotal,
93-
monitorTaskStatus,
9461
)
9562
}
9663

9764
func emitNodePhase(ns, name string, phase seiv1alpha1.SeiNodePhase) {
9865
observability.EmitPhaseGauge(nodePhaseGauge, ns, name, string(phase), allNodePhases)
9966
}
10067

101-
func emitMonitorTaskTerminal(ns, node, taskType, reason string) {
102-
monitorTaskCompletedTotal.WithLabelValues(ns, node, taskType, reason).Inc()
103-
}
104-
105-
func emitMonitorTaskStatus(ns, node, taskType, status string) {
106-
for _, s := range allTaskStatuses {
107-
val := float64(0)
108-
if s == status {
109-
val = 1
110-
}
111-
monitorTaskStatus.WithLabelValues(ns, node, taskType, s).Set(val)
112-
}
113-
}
114-
115-
func cleanupMonitorTaskMetrics(ns, name string, taskTypes []string) {
116-
for _, tt := range taskTypes {
117-
for _, s := range allTaskStatuses {
118-
monitorTaskStatus.DeleteLabelValues(ns, name, tt, s)
119-
}
120-
}
121-
}
122-
12368
func cleanupNodeMetrics(namespace, name string) {
12469
observability.DeletePhaseGauge(nodePhaseGauge, namespace, name, allNodePhases)
12570
nodeLastInitDuration.DeleteLabelValues(namespace, name)
126-
sidecarUnreachableTotal.DeleteLabelValues(namespace, name)
12771
observability.ReconcileErrorsTotal.DeleteLabelValues(seiNodeControllerName, namespace, name)
128-
cleanupMonitorTaskMetrics(namespace, name, []string{planner.TaskResultExport})
12972
planner.CleanupPlanMetrics("seinode", namespace, name)
13073
}

0 commit comments

Comments (0)