Skip to content
This repository was archived by the owner on Jan 23, 2026. It is now read-only.

Commit e6cf2e0

Browse files
reap zombie processes
Some drivers (e.g. Shell) can spawn multiple processes and leave them behind. We also kill processes when a timeout is exceeded, which may not necessarily kill all of them. Let's reap them when they exit so that we don't end up accumulating zombie processes, especially when running as PID 1. (cherry picked from commit 0e91d92)
1 parent b960370 commit e6cf2e0

1 file changed

Lines changed: 56 additions & 15 deletions

File tree

  • packages/jumpstarter-cli/jumpstarter_cli

packages/jumpstarter-cli/jumpstarter_cli/run.py

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,34 @@
1212
logger = logging.getLogger(__name__)
1313

1414

15+
def _handle_exporter_exceptions(excgroup):
    """Print each non-cancellation leaf exception of *excgroup* to stderr.

    The group is flattened with ``leaf_exceptions``; leaves that are instances
    of the current anyio cancellation class are skipped, every other leaf is
    echoed via ``click.echo(..., err=True)``.
    """
    from jumpstarter_cli_common.exceptions import leaf_exceptions

    for leaf in leaf_exceptions(excgroup):
        # Cancellation exceptions are not reported.
        if isinstance(leaf, anyio.get_cancelled_exc_class()):
            continue
        message = f"Exception while serving on the exporter: {type(leaf).__name__}: {leaf}"
        click.echo(message, err=True)
24+
25+
26+
def _reap_zombie_processes(capture_child=None):
    """Reap every child process that has already exited, without blocking.

    Repeatedly calls ``os.waitpid(-1, os.WNOHANG)`` until no exited child is
    left to collect. This function performs no PID check of its own; deciding
    when reaping is appropriate is left to the caller.

    Args:
        capture_child: Optional dict with a ``'pid'`` key. If a reaped pid
            equals ``capture_child['pid']``, its raw wait status is stored in
            ``capture_child['status']`` so the caller can still read it after
            the child has been collected here.

    Any unexpected error is logged as a warning and swallowed (best effort).
    """
    try:
        while True:
            try:
                pid, status = os.waitpid(-1, os.WNOHANG)
            except ChildProcessError:
                break  # This process has no children left to wait for
            if pid == 0:
                break  # Children exist, but none of them has exited yet
            if capture_child and pid == capture_child['pid']:
                capture_child['status'] = status
            # Lazy %-style arguments, consistent with the other logging calls in this file.
            logger.debug("PARENT: Reaped zombie process %d with status %d", pid, status)
    except Exception as e:
        logger.warning("PARENT: Error during zombie reaping: %s", e)
41+
42+
1543
def _handle_child(config):
1644
"""Handle child process with graceful shutdown."""
1745
async def serve_with_graceful_shutdown():
@@ -28,6 +56,7 @@ async def signal_handler():
2856
continue # Ignore duplicate signals
2957
received_signal = sig
3058
logger.info("CHILD: Received %d (%s)", received_signal, signal.Signals(received_signal).name)
59+
3160
if exporter:
3261
# Terminate exporter. SIGHUP waits until current lease is let go. Later SIGTERM still overrides
3362
if received_signal != signal.SIGHUP:
@@ -45,13 +74,7 @@ async def signal_handler():
4574
try:
4675
await exporter.serve()
4776
except* Exception as excgroup:
48-
from jumpstarter_cli_common.exceptions import leaf_exceptions
49-
for exc in leaf_exceptions(excgroup):
50-
if not isinstance(exc, anyio.get_cancelled_exc_class()):
51-
click.echo(
52-
f"Exception while serving on the exporter: {type(exc).__name__}: {exc}",
53-
err=True,
54-
)
77+
_handle_exporter_exceptions(excgroup)
5578

5679
# Cancel the signal handler after exporter completes
5780
signal_tg.cancel_scope.cancel()
@@ -62,21 +85,38 @@ async def signal_handler():
6285
sys.exit(anyio.run(serve_with_graceful_shutdown))
6386

6487

88+
def _wait_for_child(pid, child_info):
    """Block until child *pid* exits and return its raw wait status.

    If ``os.waitpid`` raises ``ChildProcessError`` (the child can no longer be
    waited for, e.g. because it was already reaped elsewhere), the value stored
    in ``child_info['status']`` is returned instead; that value may be ``None``.
    """
    try:
        return os.waitpid(pid, 0)[1]
    except ChildProcessError:
        return child_info['status']
95+
96+
6597
def _handle_parent(pid):
6698
"""Handle parent process waiting for child and signal forwarding."""
99+
child_info = {'pid': pid, 'status': None}
100+
67101
def parent_signal_handler(signum, _):
68-
logger.info("PARENT: Received %d (%s), forwarding to child PID %d", signum, signal.Signals(signum).name, pid)
69-
if pid and pid > 0:
70-
try:
71-
os.kill(pid, signum)
72-
except ProcessLookupError:
73-
pass
102+
if signum == signal.SIGCHLD and os.getpid() == 1:
103+
_reap_zombie_processes(capture_child=child_info) # capture our own direct child if reaped
104+
elif signum != signal.SIGCHLD:
105+
logger.info("PARENT: Got %d (%s), forwarding to child PG %d", signum, signal.Signals(signum).name, pid)
106+
if pid > 0:
107+
try:
108+
os.killpg(pid, signum)
109+
except (ProcessLookupError, OSError):
110+
pass
74111

75112
# Set up signal handlers after fork
76-
for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGHUP, signal.SIGQUIT):
113+
for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGHUP, signal.SIGQUIT, signal.SIGCHLD):
77114
signal.signal(sig, parent_signal_handler)
78115

79-
_, status = os.waitpid(pid, 0)
116+
status = _wait_for_child(pid, child_info)
117+
if status is None:
118+
return None
119+
80120
if os.WIFEXITED(status):
81121
# Interpret child exit code
82122
child_exit_code = os.WEXITSTATUS(status)
@@ -100,6 +140,7 @@ def _serve_with_exc_handling(config):
100140
if (exit_code := _handle_parent(pid)) is not None:
101141
return exit_code
102142
else:
143+
os.setsid() # Become group leader so all spawned subprocesses are reached by parent's signals
103144
_handle_child(config)
104145
sys.exit(1) # should never happen
105146

0 commit comments

Comments
 (0)