Skip to content

Commit 1ad3e68

Browse files
committed
feat(runtime): Implement Fault Tolerance Supervisor Trees
1 parent afce168 commit 1ad3e68

1 file changed

Lines changed: 126 additions & 38 deletions

File tree

src/runtime/supervisor.c

Lines changed: 126 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -9,61 +9,149 @@
99

1010
#include <stdio.h>
1111
#include <stdlib.h>
12+
#include <string.h>
13+
#include <stdbool.h>
14+
15+
#include "../../include/object.h"
16+
#include "../../include/value.h"
17+
18+
// ----------------------------------------------------------------------------
19+
// FAULT TOLERANCE: SUPERVISOR TREES (Erlang/OTP Style)
20+
// ----------------------------------------------------------------------------
1221

1322
// Restart policy a supervisor applies when one of its children fails.
// Names follow the Erlang/OTP supervisor strategies. The numeric values are
// spelled out because handle_failure() logs the strategy as an integer.
typedef enum {
    STRATEGY_ONE_FOR_ONE = 0,   // one_for_one: only the failed child
    STRATEGY_ALL_FOR_ONE = 1,   // all_for_one: every child of the supervisor
    STRATEGY_REST_FOR_ONE = 2   // rest_for_one: the failed child and those started after it
} RestartStrategy;
27+
28+
typedef struct ChildSpec {
29+
int id;
30+
ObjTask* task; // The monitored task
31+
// ObjSupervisor* sub_supervisor; // Recursive
32+
int max_retries;
33+
int current_retries;
34+
struct ChildSpec* next;
35+
} ChildSpec;
1836

19-
typedef struct {
37+
typedef struct Supervisor {
2038
int id;
21-
RecoveryStrategy strategy;
22-
int maxRetries;
23-
int currentRetries;
24-
// Checkpoint state would go here
25-
} ResilientContext;
39+
RestartStrategy strategy;
40+
ChildSpec* children; // Linked list of children
41+
struct Supervisor* parent; // Up-link for escalation
42+
} Supervisor;
2643

27-
// Global supervisor state
28-
static ResilientContext *activeContexts[100];
29-
static int contextCount = 0;
44+
// Global Root Supervisor
45+
static Supervisor rootSupervisor;
46+
static bool initialized = false;
3047

3148
void initSupervisor() {
49+
rootSupervisor.id = 0;
50+
rootSupervisor.strategy = STRATEGY_ONE_FOR_ONE;
51+
rootSupervisor.children = NULL;
52+
rootSupervisor.parent = NULL;
53+
initialized = true;
3254
printf("[Supervisor] Initialized Autonomic Self-Healing Subsystem.\n");
33-
contextCount = 0;
3455
}
3556

36-
void registerResilientBlock(int id, const char *strategyStr, int retryCount) {
37-
printf("[Supervisor] Registering Resilient Block ID: %d, Strategy: %s\n", id, strategyStr);
57+
// Walk `sup`'s child list from the head and return the first node whose id
// equals `task_id`, or NULL when no child carries that id.
static ChildSpec *find_child(Supervisor *sup, int task_id) {
    for (ChildSpec *node = sup->children; node != NULL; node = node->next) {
        if (node->id == task_id) {
            return node;
        }
    }
    return NULL;
}
65+
66+
// Put a task under supervision of the root supervisor.
//
//   taskId     - id later passed to notifyPanic() to identify the task
//   task       - the task object; the pointer is stored, not copied or freed
//   maxRetries - number of restarts allowed before the failure is escalated
//
// Initialises the supervisor subsystem on first use. Registering an id that is
// already supervised refreshes the existing entry (new task pointer, new retry
// budget, retry counter back to 0) instead of adding a second node, so repeated
// registration neither leaks nor leaves a stale duplicate in the list.
// Terminates the process if the bookkeeping node cannot be allocated.
void registerTask(int taskId, ObjTask *task, int maxRetries) {
    if (!initialized) {
        initSupervisor();
    }

    // Only the root supervisor exists for now, so every task attaches to it.
    ChildSpec *child = find_child(&rootSupervisor, taskId);
    if (child == NULL) {
        child = malloc(sizeof *child);
        if (child == NULL) {
            fprintf(stderr, "[Supervisor] Out of memory while registering Task %d.\n", taskId);
            exit(1);
        }
        child->id = taskId;
        // Prepend: the newest child is always at the head of the list.
        child->next = rootSupervisor.children;
        rootSupervisor.children = child;
    }

    child->task = task;
    child->max_retries = maxRetries;
    child->current_retries = 0;

    printf("[Supervisor] Monitoring Task %d (Retries: %d)\n", taskId, maxRetries);
}
4782

48-
activeContexts[contextCount++] = ctx;
83+
// Restart a specific child logic (Stub)
84+
static void restart_child(ChildSpec* child) {
85+
printf("[Supervisor] RESTARTING Child %d...\n", child->id);
86+
// In real VM:
87+
// 1. Reset Task IP/Stack
88+
// 2. Scheduler Enqueue(child->task)
89+
child->current_retries++;
90+
// Stub:
91+
// child->task->completed = false;
92+
// scheduler_enqueue(child->task);
4993
}
5094

51-
void notifyPanic(int code, const char *message) {
52-
printf("[Supervisor] ALERT: Panic caught! Code: %d, Message: %s\n", code, message);
53-
54-
// Find active context
55-
if (contextCount > 0) {
56-
ResilientContext *current = activeContexts[contextCount - 1];
57-
if (current->currentRetries < current->maxRetries) {
58-
printf("[Supervisor] Attempting Recovery: RESTART (%d/%d)\n", current->currentRetries + 1, current->maxRetries);
59-
current->currentRetries++;
60-
// Signal VM to Jump back to start of block (Checkpoint)
95+
// React to the failure of `failedChild`, which must be a node of `sup`'s
// child list.
//
// If the child has used up its retry budget the failure is escalated: at the
// root (no parent) this terminates the process; below the root it is reported
// on stderr, because the tree has no way yet to represent a supervisor as a
// child of its parent. Otherwise the supervisor's strategy decides which
// children are restarted. Every restart goes through restart_child(), which
// increments that child's retry counter.
static void handle_failure(Supervisor *sup, ChildSpec *failedChild) {
    printf("[Supervisor] Handling Failure for Child %d using Strategy %d\n",
           failedChild->id, (int)sup->strategy);

    if (failedChild->current_retries >= failedChild->max_retries) {
        printf("[Supervisor] Child %d exceeded max retries (%d). ESCALATING.\n",
               failedChild->id, failedChild->max_retries);
        if (sup->parent == NULL) {
            printf("[Supervisor] Root Supervisor Gave Up. SYSTEM CRASH.\n");
            exit(1);
        }
        // Do not drop the failure silently while propagation is unimplemented.
        fprintf(stderr,
                "[Supervisor] Escalation from supervisor %d to parent %d is not implemented; Child %d stays down.\n",
                sup->id, sup->parent->id, failedChild->id);
        return;
    }

    switch (sup->strategy) {
    case STRATEGY_ONE_FOR_ONE:
        restart_child(failedChild);
        break;

    case STRATEGY_ALL_FOR_ONE:
        for (ChildSpec *node = sup->children; node != NULL; node = node->next) {
            restart_child(node);
        }
        break;

    case STRATEGY_REST_FOR_ONE:
        // registerTask() prepends new children, so everything registered after
        // the failed child sits between the list head and the failed child.
        // Restart that prefix, ending with the failed child itself.
        for (ChildSpec *node = sup->children; node != NULL; node = node->next) {
            restart_child(node);
            if (node == failedChild) {
                break;
            }
        }
        break;

    default:
        fprintf(stderr, "[Supervisor] Unknown restart strategy %d; Child %d not restarted.\n",
                (int)sup->strategy, failedChild->id);
        break;
    }
}
132+
133+
// Hook called by VM exception handler
134+
void notifyPanic(int taskId, const char *message) {
135+
if (!initialized) {
136+
printf("Panic before supervisor init: %s\n", message);
137+
exit(1);
138+
}
139+
140+
printf("[Supervisor] ALERT: Task %d Panicked! Msg: %s\n", taskId, message);
141+
142+
// Find who owns this task
143+
// Simplified: Search root
144+
ChildSpec* match = find_child(&rootSupervisor, taskId);
145+
if (match) {
146+
handle_failure(&rootSupervisor, match);
65147
} else {
66-
printf("[Supervisor] No active resilient context. System Crash.\n");
67-
exit(1);
148+
printf("[Supervisor] Unsupervised Task %d crashed. Ignoring.\n", taskId);
68149
}
69150
}
151+
152+
// Compatibility shim for callers of the old resilient-block API.
//
//   id          - block id, reused as the supervised task id
//   strategyStr - accepted for source compatibility but ignored: the strategy
//                 belongs to the supervisor, not to the individual child
//   retryCount  - forwarded as the child's retry budget
//
// No task object exists for a resilient block, so the child is registered
// with a NULL task pointer.
void registerResilientBlock(int id, const char *strategyStr, int retryCount) {
    (void)strategyStr;  // intentionally unused
    registerTask(id, NULL, retryCount);
}

0 commit comments

Comments
 (0)