Skip to content
This repository was archived by the owner on Jan 23, 2026. It is now read-only.

Commit 56d039e

Browse files
authored
Merge pull request #609 from jumpstarter-dev/backport-608-to-release-0.7
[Backport release-0.7] reap zombie processes
2 parents b960370 + 15d0cbe commit 56d039e

3 files changed

Lines changed: 59 additions & 16 deletions

File tree

  • packages
    • jumpstarter-cli/jumpstarter_cli
    • jumpstarter-driver-shell/jumpstarter_driver_shell

packages/jumpstarter-cli/jumpstarter_cli/run.py

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,34 @@
1212
logger = logging.getLogger(__name__)
1313

1414

15+
def _handle_exporter_exceptions(excgroup):
    """Report the leaf exceptions of an exception group raised while serving.

    Each leaf that is not an instance of the active anyio cancellation
    exception class is written to stderr via ``click.echo`` as
    ``<ExceptionType>: <message>``; cancellation leaves are skipped.
    """
    from jumpstarter_cli_common.exceptions import leaf_exceptions

    for leaf in leaf_exceptions(excgroup):
        if isinstance(leaf, anyio.get_cancelled_exc_class()):
            # Cancellation leaves are not reported.
            continue
        message = f"Exception while serving on the exporter: {type(leaf).__name__}: {leaf}"
        click.echo(message, err=True)
24+
25+
26+
def _reap_zombie_processes(capture_child=None):
    """Reap zombie processes when running as PID 1.

    Repeatedly calls ``os.waitpid(-1, os.WNOHANG)`` so that every child that
    has already exited is collected without ever blocking.

    Args:
        capture_child: Optional dict with ``'pid'`` and ``'status'`` keys.
            When one of the reaped children has the pid stored under
            ``'pid'``, its raw wait status is written to ``'status'`` so the
            caller can still interpret it after the child has been reaped.

    Returns:
        None. Errors are logged rather than raised.
    """
    try:
        while True:
            pid, status = os.waitpid(-1, os.WNOHANG)
            if pid == 0:
                break  # Children exist, but none of them has exited yet
            if capture_child and pid == capture_child['pid']:
                capture_child['status'] = status
            # Lazy %-style arguments: the message is only formatted if emitted.
            logger.debug("PARENT: Reaped zombie process %s with status %s", pid, status)
    except ChildProcessError:
        pass  # No children left to wait for
    except Exception as e:
        # Deliberately best-effort: a reaping failure is logged, never raised.
        logger.warning("PARENT: Error during zombie reaping: %s", e)
41+
42+
1543
def _handle_child(config):
1644
"""Handle child process with graceful shutdown."""
1745
async def serve_with_graceful_shutdown():
@@ -28,6 +56,7 @@ async def signal_handler():
2856
continue # Ignore duplicate signals
2957
received_signal = sig
3058
logger.info("CHILD: Received %d (%s)", received_signal, signal.Signals(received_signal).name)
59+
3160
if exporter:
3261
# Terminate exporter. SIGHUP waits until current lease is let go. Later SIGTERM still overrides
3362
if received_signal != signal.SIGHUP:
@@ -45,13 +74,7 @@ async def signal_handler():
4574
try:
4675
await exporter.serve()
4776
except* Exception as excgroup:
48-
from jumpstarter_cli_common.exceptions import leaf_exceptions
49-
for exc in leaf_exceptions(excgroup):
50-
if not isinstance(exc, anyio.get_cancelled_exc_class()):
51-
click.echo(
52-
f"Exception while serving on the exporter: {type(exc).__name__}: {exc}",
53-
err=True,
54-
)
77+
_handle_exporter_exceptions(excgroup)
5578

5679
# Cancel the signal handler after exporter completes
5780
signal_tg.cancel_scope.cancel()
@@ -62,21 +85,38 @@ async def signal_handler():
6285
sys.exit(anyio.run(serve_with_graceful_shutdown))
6386

6487

88+
def _wait_for_child(pid, child_info):
    """Block until child ``pid`` exits and return its raw wait status.

    If ``os.waitpid`` raises ``ChildProcessError`` (for example because the
    child was already reaped elsewhere), the value stored under
    ``child_info['status']`` is returned instead; that value may be ``None``.
    """
    try:
        return os.waitpid(pid, 0)[1]
    except ChildProcessError:
        return child_info['status']
95+
96+
6597
def _handle_parent(pid):
6698
"""Handle parent process waiting for child and signal forwarding."""
99+
child_info = {'pid': pid, 'status': None}
100+
67101
def parent_signal_handler(signum, _):
68-
logger.info("PARENT: Received %d (%s), forwarding to child PID %d", signum, signal.Signals(signum).name, pid)
69-
if pid and pid > 0:
70-
try:
71-
os.kill(pid, signum)
72-
except ProcessLookupError:
73-
pass
102+
if signum == signal.SIGCHLD and os.getpid() == 1:
103+
_reap_zombie_processes(capture_child=child_info) # capture our own direct child if reaped
104+
elif signum != signal.SIGCHLD:
105+
logger.info("PARENT: Got %d (%s), forwarding to child PG %d", signum, signal.Signals(signum).name, pid)
106+
if pid > 0:
107+
try:
108+
os.killpg(pid, signum)
109+
except (ProcessLookupError, OSError):
110+
pass
74111

75112
# Set up signal handlers after fork
76-
for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGHUP, signal.SIGQUIT):
113+
for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGHUP, signal.SIGQUIT, signal.SIGCHLD):
77114
signal.signal(sig, parent_signal_handler)
78115

79-
_, status = os.waitpid(pid, 0)
116+
status = _wait_for_child(pid, child_info)
117+
if status is None:
118+
return None
119+
80120
if os.WIFEXITED(status):
81121
# Interpret child exit code
82122
child_exit_code = os.WEXITSTATUS(status)
@@ -100,6 +140,7 @@ def _serve_with_exc_handling(config):
100140
if (exit_code := _handle_parent(pid)) is not None:
101141
return exit_code
102142
else:
143+
os.setsid() # Become group leader so all spawned subprocesses are reached by parent's signals
103144
_handle_child(config)
104145
sys.exit(1) # should never happen
105146

packages/jumpstarter-driver-shell/jumpstarter_driver_shell/driver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ async def _run_inline_shell_script(
138138
cmd = self.shell + [script, method] + list(args)
139139

140140
# Start the process with pipes for streaming and new process group
141+
self.logger.debug( f"running {method} with cmd: {cmd} and env: {combined_env} " f"and args: {args}")
141142
process = await asyncio.create_subprocess_exec(
142143
*cmd,
143144
stdout=asyncio.subprocess.PIPE,
@@ -152,7 +153,6 @@ async def _run_inline_shell_script(
152153

153154
# Read output in real-time
154155
while process.returncode is None:
155-
self.logger.debug(f"running {method} with cmd: {cmd} and env: {combined_env} and args: {args}")
156156
if asyncio.get_event_loop().time() - start_time > self.timeout:
157157
# Send SIGTERM to entire process group for graceful termination
158158
try:

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)