Skip to content

Commit cf7ab01

Browse files
committed
refactor: use --halt-height instead of await-condition for pre-init
The bootstrap seid now receives --halt-height so it stops itself after committing the target block and exits 0. This removes the need for the sidecar to kill seid externally and eliminates the race where the controller tries to poll a dead sidecar after the Job completes.

Changes:
- preInitWaitCommand passes --halt-height to seid and uses exec
- buildPreInitPlan no longer appends await-condition task
- Tests updated accordingly
1 parent f62c95d commit cf7ab01

3 files changed

Lines changed: 24 additions & 32 deletions

File tree

internal/controller/node/job.go

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -77,19 +77,13 @@ func preInitSidecarURL(node *seiv1alpha1.SeiNode) string {
7777
}
7878

7979
// preInitWaitCommand returns a shell command that waits for the sidecar
80-
// healthz to return 200 and then runs "seid start --home /sei".
81-
// Unlike sidecarWaitCommand, it does NOT use the node's custom entrypoint
82-
// because the pre-init Job runs the bootstrap image which may not support
83-
// custom flags (e.g. --skip-app-hash-validation).
80+
// healthz to return 200 and then exec's seid with --halt-height.
8481
//
85-
// seid is run as a child process (not exec'd) so the shell can exit 0
86-
// regardless of how seid terminates. The sidecar's await-condition task
87-
// sends SIGTERM directly to the seid process when the target height is
88-
// reached; the trailing "exit 0" ensures the shell reports success even
89-
// though seid exits with code 143 (SIGTERM). The trap handles the case
90-
// where Kubernetes sends SIGTERM to the shell (PID 1) during pod
91-
// termination.
92-
func preInitWaitCommand(port int32) (command []string, args []string) {
82+
// The bootstrap image's seid uses the Cosmos SDK halt-height mechanism:
83+
// after committing the block at haltHeight, seid sends itself SIGINT and
84+
// exits 0. This avoids the need for the sidecar to kill seid externally
85+
// and keeps the Job exit code clean without wrapper tricks.
86+
func preInitWaitCommand(port int32, haltHeight int64) (command []string, args []string) {
9387
script := fmt.Sprintf(
9488
`echo "waiting for sidecar to become ready..."; `+
9589
`while true; do `+
@@ -98,10 +92,9 @@ func preInitWaitCommand(port int32) (command []string, args []string) {
9892
`head -1 <&3 | grep -q "200" && break; `+
9993
`exec 3>&-; sleep 5; done; `+
10094
`exec 3>&-; `+
101-
`echo "sidecar ready, starting seid"; `+
102-
`trap 'exit 0' TERM; `+
103-
`seid start --home %s & wait $!; exit 0`,
104-
port, dataDir,
95+
`echo "sidecar ready, starting seid with halt-height %d"; `+
96+
`exec seid start --home %s --halt-height %d`,
97+
port, haltHeight, dataDir, haltHeight,
10598
)
10699
return []string{"/bin/bash", "-c"}, []string{script}
107100
}
@@ -145,7 +138,8 @@ func buildPreInitPodSpec(node *seiv1alpha1.SeiNode, snap *seiv1alpha1.SnapshotSo
145138
bootstrapImage = snap.BootstrapImage
146139
}
147140

148-
seidCmd, seidArgs := preInitWaitCommand(sidecarPort(node))
141+
haltHeight := snap.S3.TargetHeight
142+
seidCmd, seidArgs := preInitWaitCommand(sidecarPort(node), haltHeight)
149143
seidContainer := corev1.Container{
150144
Name: "seid",
151145
Image: bootstrapImage,

internal/controller/node/plan_execution_test.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1268,17 +1268,19 @@ func TestReconcileInitializing_PlanFailed_TransitionsToFailed(t *testing.T) {
12681268

12691269
// --- PreInitPlan builder tests ---
12701270

1271-
func TestBuildPreInitPlan_AppendsAwaitCondition(t *testing.T) {
1271+
func TestBuildPreInitPlan_DoesNotIncludeAwaitCondition(t *testing.T) {
12721272
node := bootstrapReplayerNode()
12731273
planner, _ := PlannerForNode(node, testSnapshotRegion)
12741274
plan := buildPreInitPlan(node, planner)
12751275

1276-
lastTask := plan.Tasks[len(plan.Tasks)-1]
1277-
if lastTask.Type != taskAwaitCondition {
1278-
t.Errorf("last task type = %q, want %q", lastTask.Type, taskAwaitCondition)
1276+
for _, task := range plan.Tasks {
1277+
if task.Type == taskAwaitCondition {
1278+
t.Errorf("pre-init plan should not contain await-condition (halt-height is used instead)")
1279+
}
12791280
}
1280-
if lastTask.Status != seiv1alpha1.PlannedTaskPending {
1281-
t.Errorf("last task status = %q, want Pending", lastTask.Status)
1281+
lastTask := plan.Tasks[len(plan.Tasks)-1]
1282+
if lastTask.Type != taskMarkReady {
1283+
t.Errorf("last task type = %q, want %q", lastTask.Type, taskMarkReady)
12821284
}
12831285
}
12841286

internal/controller/node/planner_bootstrap.go

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,15 @@ const taskAwaitCondition = sidecar.TaskTypeAwaitCondition
1212

1313
// buildPreInitPlan constructs the task plan for the PreInit Job. For nodes
1414
// that require bootstrap infrastructure (e.g. S3 snapshot with a bootstrap
15-
// image), it builds the standard bootstrap sequence and appends an
16-
// await-condition task. For all other nodes it returns an empty plan that
17-
// resolves trivially.
15+
// image), it builds the standard bootstrap sequence. The seid process
16+
// handles the halt-height natively (via --halt-height), so no
17+
// await-condition task is needed. For all other nodes it returns an empty
18+
// plan that resolves trivially.
1819
func buildPreInitPlan(node *seiv1alpha1.SeiNode, planner NodePlanner) *seiv1alpha1.TaskPlan {
1920
if !needsPreInit(node) {
2021
return &seiv1alpha1.TaskPlan{Phase: seiv1alpha1.TaskPlanActive, Tasks: []seiv1alpha1.PlannedTask{}}
2122
}
22-
plan := planner.BuildPlan(node)
23-
plan.Tasks = append(plan.Tasks, seiv1alpha1.PlannedTask{
24-
Type: taskAwaitCondition,
25-
Status: seiv1alpha1.PlannedTaskPending,
26-
})
27-
return plan
23+
return planner.BuildPlan(node)
2824
}
2925

3026
// buildPostBootstrapInitPlan constructs a reduced InitPlan for nodes that

0 commit comments

Comments
 (0)