Skip to content

Commit e204f70

Browse files
committed
fix: ensure brain thread stops cleanly on shutdown
1 parent a2c41ed commit e204f70

5 files changed

Lines changed: 56 additions & 15 deletions

File tree

src/brain/runner/lifecycle.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,9 @@ def _stopped() -> bool:
373373
if state.mana_current == 0:
374374
brain_log.info("[LIFECYCLE] Warmup: mana reads 0, retrying CHARINFO...")
375375
for _retry in range(5):
376+
if _stopped():
377+
brain_log.info("[LIFECYCLE] Warmup mana retry interrupted by stop_event")
378+
return
376379
time.sleep(1.0)
377380
state = self._runner._reader.read_state(include_spawns=True)
378381
if state.mana_current > 0:

src/brain/runner/loop.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ def _run_setup(self) -> None:
271271
self._init_monitoring()
272272

273273
tick_rate = self._config["general"].get("tick_rate_hz", 10)
274-
self._clock = TickClock(tick_rate)
274+
self._clock = TickClock(tick_rate, stop_event=self._stop_event)
275275
self._next_snapshot = time.time() + 30.0
276276
self._next_tuning_eval = time.time() + 1800.0
277277

src/runtime/orchestrator.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -485,24 +485,34 @@ def stop_agent(self) -> dict:
485485
if not self.agent_running:
486486
return {"error": "Not running"}
487487
self.stop_event.set()
488-
orphaned = False
489488
if self.agent_thread:
489+
# First wait: cooperative shutdown via stop_event (should be fast
490+
# now that TickClock and warmup sleeps are interruptible).
490491
self.agent_thread.join(timeout=10)
491492
if self.agent_thread.is_alive():
492-
log.warning(
493-
"[LIFECYCLE] stop_agent: brain thread did not stop within 10s -- it will be orphaned"
493+
log.warning("[LIFECYCLE] stop_agent: brain thread did not stop within 10s -- waiting longer")
494+
# Second wait: allow cleanup callbacks (file I/O) to finish.
495+
self.agent_thread.join(timeout=20)
496+
if self.agent_thread.is_alive():
497+
# Thread is truly stuck (hung syscall, deadlock). Keep the
498+
# reference so start_agent() still refuses to create a
499+
# duplicate runner -- the daemon flag ensures it dies at
500+
# process exit.
501+
log.error(
502+
"[LIFECYCLE] stop_agent: brain thread stuck after 30s -- "
503+
"thread kept alive (daemon); start_agent will block until it exits"
494504
)
495-
orphaned = True
496-
else:
497-
self.agent_thread = None
505+
with self._agent_lock:
506+
self.agent_running = False
507+
defeats = self.agent_defeats
508+
self.add_log(f"Agent stopped -- {defeats} defeats (thread stuck, will block next start)")
509+
return {"running": False, "defeats": defeats, "stuck_thread": True}
510+
self.agent_thread = None
498511
with self._agent_lock:
499512
self.agent_running = False
500513
defeats = self.agent_defeats
501-
self.add_log(f"Agent stopped -- {defeats} defeats" + (" (thread orphaned)" if orphaned else ""))
502-
result: dict = {"running": False, "defeats": defeats}
503-
if orphaned:
504-
result["orphaned_thread"] = True
505-
return result
514+
self.add_log(f"Agent stopped -- {defeats} defeats")
515+
return {"running": False, "defeats": defeats}
506516

507517
def pause_agent(self) -> dict:
508518
with self._agent_lock:

src/util/clock.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,36 @@
11
"""Tick timing and dt tracking."""
22

3+
from __future__ import annotations
4+
5+
import threading
36
import time
47

58

69
class TickClock:
710
"""Regulates the main loop to a target tick rate and tracks delta time."""
811

9-
def __init__(self, tick_rate_hz: float = 10.0) -> None:
12+
def __init__(self, tick_rate_hz: float = 10.0, stop_event: threading.Event | None = None) -> None:
1013
self._interval = 1.0 / tick_rate_hz
1114
self._last_tick = time.perf_counter()
1215
self.dt: float = 0.0
1316
self.tick_count: int = 0
17+
self._stop_event = stop_event
1418

1519
def wait_for_next_tick(self) -> float:
16-
"""Sleep until the next tick is due. Returns dt since last tick."""
20+
"""Sleep until the next tick is due. Returns dt since last tick.
21+
22+
If a *stop_event* was provided at construction, the sleep is
23+
interruptible: ``stop_event.wait(timeout)`` is used instead of
24+
``time.sleep`` so the thread wakes immediately on shutdown.
25+
"""
1726
now = time.perf_counter()
1827
elapsed = now - self._last_tick
1928
sleep_time = self._interval - elapsed
2029
if sleep_time > 0:
21-
time.sleep(sleep_time)
30+
if self._stop_event is not None:
31+
self._stop_event.wait(sleep_time)
32+
else:
33+
time.sleep(sleep_time)
2234

2335
now = time.perf_counter()
2436
self.dt = now - self._last_tick

tests/test_clock.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
from __future__ import annotations
44

5+
import threading
6+
import time
7+
58
from util.clock import TickClock
69

710

@@ -39,3 +42,16 @@ def test_returns_dt(self) -> None:
3942
result = c.wait_for_next_tick()
4043
assert isinstance(result, float)
4144
assert result == c.dt
45+
46+
def test_stop_event_interrupts_sleep(self) -> None:
47+
"""A slow clock (1 Hz = 1s sleep) should return almost immediately when stop_event fires."""
48+
stop = threading.Event()
49+
c = TickClock(tick_rate_hz=1.0, stop_event=stop)
50+
c.wait_for_next_tick() # first tick: blocks for ~1s (stop not yet set), sets _last_tick
51+
52+
# Fire stop_event after 50ms -- wait_for_next_tick should unblock well before 1s
53+
threading.Timer(0.05, stop.set).start()
54+
t0 = time.perf_counter()
55+
c.wait_for_next_tick()
56+
elapsed = time.perf_counter() - t0
57+
assert elapsed < 0.3, f"expected fast wakeup, took {elapsed:.3f}s"

0 commit comments

Comments
 (0)