sei-protocol
diff --git a/‎sidecar/engine/engine.go‎
Lines changed: 40 additions & 14 deletions b/‎sidecar/engine/engine.go‎
Lines changed: 40 additions & 14 deletions
diff --git a/‎sidecar/engine/engine_test.go‎
Lines changed: 167 additions & 0 deletions b/‎sidecar/engine/engine_test.go‎
Lines changed: 167 additions & 0 deletions
diff --git a/‎sidecar/engine/sqlite_migrations.go‎
Lines changed: 22 additions & 0 deletions b/‎sidecar/engine/sqlite_migrations.go‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎sidecar/engine/sqlite_store.go‎
Lines changed: 10 additions & 4 deletions b/‎sidecar/engine/sqlite_store.go‎
Lines changed: 10 additions & 4 deletions
@@ -3,6 +3,7 @@ package engine
 import (
 	"context"
 	"fmt"
+	"sync"
 	"sync/atomic"
 	"time"
 
@@ -35,6 +36,7 @@ type Engine struct {
 	ctx      context.Context
 	ready    atomic.Bool
 	store    ResultStore
+	mu       sync.Mutex
 }
 
 // NewEngine creates a new Engine. The engine runs until ctx is cancelled.
@@ -52,6 +54,8 @@ func NewEngine(ctx context.Context, handlers map[TaskType]TaskHandler, store Res
 
 // rehydrateStaleTasks re-executes tasks that were left in "running"
 // state by a previous process that exited before completing them.
+// Run count is NOT incremented — rehydration is crash recovery of an
+// incomplete run, not a new run.
 func (e *Engine) rehydrateStaleTasks() {
 	stale, err := e.store.ListStaleTasks()
 	if err != nil {
@@ -66,19 +70,24 @@ func (e *Engine) rehydrateStaleTasks() {
 			tr.Status = TaskStatusFailed
 			tr.Error = "no handler registered for task type"
 			tr.CompletedAt = &t
-			_ = e.store.Save(&tr)
+			if err := e.store.Save(&tr); err != nil {
+				log.Error("failed to persist stale task failure", "id", tr.ID, "err", err)
+			}
 			continue
 		}
-		log.Info("rehydrating stale task", "type", tr.Type, "id", tr.ID)
-		e.runTask(tr.ID, TaskType(tr.Type), handler, tr.Params, tr.SubmittedAt)
+		log.Info("rehydrating stale task", "type", tr.Type, "id", tr.ID, "run", tr.Run)
+		e.runTask(tr.ID, TaskType(tr.Type), handler, tr.Params, tr.SubmittedAt, tr.Run)
 	}
 }
 
-// Submit starts a task in its own goroutine and returns its ID. When
-// task.ID is set, it becomes the canonical identifier (enabling
-// deterministic IDs from the controller). When empty, a random UUID is
-// generated. If a task with the same ID already exists, the existing ID
-// is returned without re-submitting.
+// Submit starts a task in its own goroutine and returns its ID.
+//
+// The engine follows a cloud-API model for task lifecycle:
+//   - If no task with this ID exists, create and execute it (run 1).
+//   - If the task is running or completed, return its ID (idempotent no-op).
+//   - If the task failed, re-execute it with an incremented run counter.
+//
+// The caller submits a stable key and the engine owns the execution lifecycle.
 func (e *Engine) Submit(task Task) (string, error) {
 	handler, ok := e.handlers[task.Type]
 	if !ok {
@@ -94,17 +103,25 @@ func (e *Engine) Submit(task Task) (string, error) {
 		id = uuid.New().String()
 	}
 
-	// Dedup check against the store.
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	run := 1
 	if existing, _ := e.store.Get(id); existing != nil {
-		return id, nil
+		switch existing.Status {
+		case TaskStatusRunning, TaskStatusCompleted:
+			return id, nil
+		case TaskStatusFailed:
+			run = existing.Run + 1
+		}
 	}
 
 	now := time.Now().UTC()
-
 	tr := &TaskResult{
 		ID:          id,
 		Type:        string(task.Type),
 		Status:      TaskStatusRunning,
+		Run:         run,
 		Params:      task.Params,
 		SubmittedAt: now,
 	}
@@ -113,21 +130,22 @@ func (e *Engine) Submit(task Task) (string, error) {
 		return "", fmt.Errorf("persist task: %w", err)
 	}
 
-	log.Info("task submitted", "type", task.Type, "id", id)
-	e.runTask(id, task.Type, handler, task.Params, now)
+	log.Info("task submitted", "type", task.Type, "id", id, "run", run)
+	e.runTask(id, task.Type, handler, task.Params, now, run)
 
 	return id, nil
 }
 
 // runTask spawns a goroutine to run the handler and persist the result.
-func (e *Engine) runTask(id string, taskType TaskType, handler TaskHandler, params map[string]any, submittedAt time.Time) {
+func (e *Engine) runTask(id string, taskType TaskType, handler TaskHandler, params map[string]any, submittedAt time.Time, run int) {
 	go func() {
 		err := e.execute(e.ctx, taskType, handler, params)
 
 		t := time.Now().UTC()
 		tr := &TaskResult{
 			ID:          id,
 			Type:        string(taskType),
+			Run:         run,
 			Params:      params,
 			SubmittedAt: submittedAt,
 			CompletedAt: &t,
@@ -160,10 +178,18 @@ func (e *Engine) execute(ctx context.Context, taskType TaskType, handler TaskHan
 }
 
 // Healthz returns true after the engine has been marked ready.
+// Use as a readiness check.
 func (e *Engine) Healthz() bool {
 	return e.ready.Load()
 }
 
+// Livez reports whether the engine's backing store answers a ping; nil means
+// it did. Use as a liveness check. NOTE(review): confirm the store's Ping can
+// actually surface the intended failures (e.g., SQLite WAL corruption, PVC read-only).
+func (e *Engine) Livez() error {
+	return e.store.Ping()
+}
+
 // Status returns the engine's current state.
 func (e *Engine) Status() StatusResponse {
 	status := "Initializing"
 
@@ -537,6 +537,173 @@ func TestLongRunningTaskDoesNotBlockOthers(t *testing.T) {
 	waitForResult(t, eng, id)
 }
 
+// --- Status-aware Submit tests ---
+
+func TestSubmitReExecutesFailedTask(t *testing.T) {
+	var calls atomic.Int32
+	eng := newTestEngine(t, map[TaskType]TaskHandler{
+		TaskConfigPatch: func(_ context.Context, _ map[string]any) error {
+			// Counted atomically: the handler runs on an engine goroutine.
+			if calls.Add(1) == 1 {
+				return errors.New("transient failure")
+			}
+			return nil
+		},
+	})
+
+	const taskID = "dddddddd-1111-2222-3333-444444444444"
+
+	// First submit: task fails.
+	id1, err := eng.Submit(Task{ID: taskID, Type: TaskConfigPatch})
+	if err != nil {
+		t.Fatalf("first submit: %v", err)
+	}
+	r1 := waitForResult(t, eng, id1)
+	if r1.Status != TaskStatusFailed {
+		t.Fatalf("expected failed, got %s", r1.Status)
+	}
+	if r1.Run != 1 {
+		t.Fatalf("expected run=1, got %d", r1.Run)
+	}
+
+	// Second submit with same ID: should re-execute.
+	id2, err := eng.Submit(Task{ID: taskID, Type: TaskConfigPatch})
+	if err != nil {
+		t.Fatalf("second submit: %v", err)
+	}
+	if id2 != id1 {
+		t.Fatalf("expected same ID, got %q and %q", id1, id2)
+	}
+
+	r2 := waitForResult(t, eng, id2)
+	if r2.Status != TaskStatusCompleted {
+		t.Fatalf("expected completed on retry, got %s", r2.Status)
+	}
+	if r2.Run != 2 {
+		t.Fatalf("expected run=2, got %d", r2.Run)
+	}
+	if n := calls.Load(); n != 2 {
+		t.Fatalf("expected handler called twice, got %d", n)
+	}
+}
+
+func TestSubmitReExecutesFailedTaskThatFailsAgain(t *testing.T) {
+	eng := newTestEngine(t, map[TaskType]TaskHandler{
+		TaskConfigPatch: func(_ context.Context, _ map[string]any) error {
+			return errors.New("persistent failure")
+		},
+	})
+
+	const taskID = "eeeeeeee-1111-2222-3333-444444444444"
+
+	// Submit, then re-submit after the failure: every run fails and is counted.
+	for attempt := 1; attempt <= 2; attempt++ {
+		if _, err := eng.Submit(Task{ID: taskID, Type: TaskConfigPatch}); err != nil {
+			t.Fatalf("submit %d: %v", attempt, err)
+		}
+		r := waitForResult(t, eng, taskID)
+		if r.Status != TaskStatusFailed {
+			t.Fatalf("run %d: expected failed, got %s", attempt, r.Status)
+		}
+		if r.Run != attempt {
+			t.Fatalf("expected run=%d, got %d", attempt, r.Run)
+		}
+	}
+}
+
+func TestSubmitDoesNotIncrementRunOnRehydration(t *testing.T) {
+	// Create a store with a stale running task (simulates pod crash).
+	store := newTestStore(t)
+	if err := store.Save(&TaskResult{
+		ID:          "ffffffff-1111-2222-3333-444444444444",
+		Type:        string(TaskConfigPatch),
+		Status:      TaskStatusRunning,
+		Run:         1,
+		SubmittedAt: time.Now().UTC(),
+	}); err != nil {
+		t.Fatalf("seed stale task: %v", err)
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	t.Cleanup(cancel)
+	eng := NewEngine(ctx, map[TaskType]TaskHandler{
+		TaskConfigPatch: func(_ context.Context, _ map[string]any) error { return nil },
+	}, store)
+	r := waitForResult(t, eng, "ffffffff-1111-2222-3333-444444444444")
+	if r.Run != 1 {
+		t.Fatalf("expected run=1 after rehydration (not incremented), got %d", r.Run)
+	}
+	if r.Status != TaskStatusCompleted {
+		t.Fatalf("expected completed after rehydration, got %s", r.Status)
+	}
+}
+
+func TestSubmitConcurrentSameFailedID(t *testing.T) {
+	blocked := make(chan struct{})
+	var callCount atomic.Int32
+
+	ctx, cancel := context.WithCancel(context.Background())
+	t.Cleanup(cancel)
+	store := newTestStore(t)
+	eng := NewEngine(ctx, map[TaskType]TaskHandler{
+		TaskConfigPatch: func(_ context.Context, _ map[string]any) error {
+			callCount.Add(1)
+			<-blocked
+			return nil
+		},
+	}, store)
+
+	const taskID = "11111111-2222-3333-4444-555555555555"
+
+	// Seed a failed task directly in the store.
+	now := time.Now().UTC()
+	if err := store.Save(&TaskResult{
+		ID:          taskID,
+		Type:        string(TaskConfigPatch),
+		Status:      TaskStatusFailed,
+		Run:         1,
+		Error:       "failed",
+		SubmittedAt: now,
+		CompletedAt: &now,
+	}); err != nil {
+		t.Fatalf("seed failed task: %v", err)
+	}
+
+	// Two concurrent submits of the same failed ID.
+	var wg sync.WaitGroup
+	wg.Add(2)
+	for i := 0; i < 2; i++ {
+		go func() {
+			defer wg.Done()
+			eng.Submit(Task{ID: taskID, Type: TaskConfigPatch})
+		}()
+	}
+	wg.Wait()
+
+	// Unblock the handler and wait for completion.
+	close(blocked)
+	waitForResult(t, eng, taskID)
+
+	// The mutex serializes Submit: the first sees "failed" and re-executes,
+	// the second sees "running" (from the first's Save) and no-ops.
+	if c := callCount.Load(); c != 1 {
+		t.Fatalf("expected exactly 1 re-execution, got %d", c)
+	}
+}
+
+func TestSubmitRunFieldOnFirstSubmit(t *testing.T) {
+	eng := newTestEngine(t, map[TaskType]TaskHandler{
+		TaskConfigPatch: func(_ context.Context, _ map[string]any) error { return nil },
+	})
+	id, err := eng.Submit(Task{Type: TaskConfigPatch})
+	if err != nil {
+		t.Fatalf("submit: %v", err)
+	}
+	if r := waitForResult(t, eng, id); r.Run != 1 {
+		t.Fatalf("expected run=1 on first submit, got %d", r.Run)
+	}
+}
+
 func TestTaskErrorProducesRichErrorString(t *testing.T) {
 	eng := newTestEngine(t, map[TaskType]TaskHandler{
 		TaskConfigPatch: func(_ context.Context, _ map[string]any) error {
 
@@ -68,5 +68,27 @@ func migrate(db *sql.DB) error {
 		}
 	}
 
+	if version < 3 {
+		tx, err := db.Begin()
+		if err != nil {
+			return err
+		}
+		defer tx.Rollback()
+
+		if _, err := tx.Exec(`
+			ALTER TABLE task_results ADD COLUMN run INTEGER NOT NULL DEFAULT 1;
+		`); err != nil {
+			return err
+		}
+
+		if _, err := tx.Exec("PRAGMA user_version = 3"); err != nil {
+			return err
+		}
+
+		if err := tx.Commit(); err != nil {
+			return err
+		}
+	}
+
 	return nil
 }
@@ -70,11 +70,12 @@ func (s *SQLiteStore) Save(r *TaskResult) error {
 
 	_, err = s.db.Exec(`
 		INSERT OR REPLACE INTO task_results
-			(id, type, status, params, error, submitted_at, completed_at)
-		VALUES (?, ?, ?, ?, ?, ?, ?)`,
+			(id, type, status, run, params, error, submitted_at, completed_at)
+		VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
 		r.ID,
 		r.Type,
 		string(r.Status),
+		r.Run,
 		string(params),
 		r.Error,
 		r.SubmittedAt.UTC().Format(time.RFC3339Nano),
@@ -112,14 +113,19 @@ func (s *SQLiteStore) Delete(id string) (bool, error) {
 	return n > 0, nil
 }
 
+// Ping runs "SELECT 1" against the database and returns any error from it.
+func (s *SQLiteStore) Ping() error {
+	return s.db.QueryRow("SELECT 1").Scan(new(int))
+}
+
 func (s *SQLiteStore) Close() error {
 	return s.db.Close()
 }
 
 // --- query helpers ---
 
 const selectColumns = `
-	SELECT id, type, status, params, error, submitted_at, completed_at
+	SELECT id, type, status, run, params, error, submitted_at, completed_at
 	FROM task_results`
 
 // queryMany executes a query and scans all rows into TaskResults.
@@ -156,7 +162,7 @@ func scanTaskResult(s rowScanner) (*TaskResult, error) {
 	)
 
 	if err := s.Scan(
-		&r.ID, &r.Type, &status, &paramsJSON,
+		&r.ID, &r.Type, &status, &r.Run, &paramsJSON,
 		&r.Error, &submittedAt, &completedAt,
 	); err != nil {
 		return nil, err