Skip to content

Commit 6308a9d

Browse files
committed
fix: use standard entrypoint for pre-init Job seid container
The pre-init Job was using the node's custom entrypoint (e.g. --skip-app-hash-validation) against the bootstrap image, which may not support those flags, causing seid to crash after mark-ready. Use a hardcoded `seid start --home /sei` for the pre-init Job instead. Also, temporarily disable pre-init resource cleanup on failure so that pod logs are preserved for debugging.
1 parent d43d956 commit 6308a9d

2 files changed

Lines changed: 28 additions & 7 deletions

File tree

internal/controller/node/job.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,27 @@ func preInitSidecarURL(node *seiv1alpha1.SeiNode) string {
7171
preInitPodHostname, preInitJobName(node), node.Namespace, sidecarPort(node))
7272
}
7373

74+
// preInitWaitCommand returns a shell command that waits for the sidecar
75+
// healthz to return 200 and then runs a standard "seid start --home /sei".
76+
// Unlike sidecarWaitCommand, it does NOT use the node's custom entrypoint
77+
// because the pre-init Job runs the bootstrap image which may not support
78+
// custom flags (e.g. --skip-app-hash-validation).
79+
func preInitWaitCommand(port int32) (command []string, args []string) {
80+
script := fmt.Sprintf(
81+
`echo "waiting for sidecar to become ready..."; `+
82+
`while true; do `+
83+
`{ exec 3<>/dev/tcp/localhost/%d; } 2>/dev/null && `+
84+
`printf "GET /v0/healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && `+
85+
`head -1 <&3 | grep -q "200" && break; `+
86+
`exec 3>&-; sleep 5; done; `+
87+
`exec 3>&-; `+
88+
`echo "sidecar ready, starting seid"; `+
89+
`exec seid start --home %s`,
90+
port, dataDir,
91+
)
92+
return []string{"/bin/bash", "-c"}, []string{script}
93+
}
94+
7495
func buildPreInitPodSpec(node *seiv1alpha1.SeiNode, snap *seiv1alpha1.SnapshotSource, platform PlatformConfig) corev1.PodSpec {
7596
serviceName := preInitJobName(node)
7697

@@ -110,7 +131,7 @@ func buildPreInitPodSpec(node *seiv1alpha1.SeiNode, snap *seiv1alpha1.SnapshotSo
110131
bootstrapImage = snap.BootstrapImage
111132
}
112133

113-
seidCmd, seidArgs := sidecarWaitCommand(node)
134+
seidCmd, seidArgs := preInitWaitCommand(sidecarPort(node))
114135
seidContainer := corev1.Container{
115136
Name: "seid",
116137
Image: bootstrapImage,

internal/controller/node/pre_init.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ func (r *SeiNodeReconciler) reconcilePreInitializing(ctx context.Context, node *
3030
return r.setPhase(ctx, node, seiv1alpha1.PhaseInitializing)
3131
}
3232
if plan.Phase == seiv1alpha1.TaskPlanFailed {
33-
if err := r.cleanupPreInit(ctx, node); err != nil {
34-
log.FromContext(ctx).Error(err, "failed to clean up pre-init resources after plan failure")
35-
}
33+
// if err := r.cleanupPreInit(ctx, node); err != nil {
34+
// log.FromContext(ctx).Error(err, "failed to clean up pre-init resources after plan failure")
35+
// }
3636
return r.setPhase(ctx, node, seiv1alpha1.PhaseFailed)
3737
}
3838

@@ -85,9 +85,9 @@ func (r *SeiNodeReconciler) reconcilePreInitializing(ctx context.Context, node *
8585
return r.setPhase(ctx, node, seiv1alpha1.PhaseInitializing)
8686
}
8787
if plan.Phase == seiv1alpha1.TaskPlanFailed {
88-
if err := r.cleanupPreInit(ctx, node); err != nil {
89-
log.FromContext(ctx).Error(err, "failed to clean up pre-init resources after plan failure")
90-
}
88+
// if err := r.cleanupPreInit(ctx, node); err != nil {
89+
// log.FromContext(ctx).Error(err, "failed to clean up pre-init resources after plan failure")
90+
// }
9191
return r.setPhase(ctx, node, seiv1alpha1.PhaseFailed)
9292
}
9393
return result, nil

0 commit comments

Comments
 (0)