# Module-level logger, named after this module via __name__.
1212logger = logging .getLogger (__name__ )
1313
1414
def _handle_exporter_exceptions(excgroup):
    """Report every non-cancellation leaf of an exporter exception group.

    Each leaf exception yielded by ``leaf_exceptions(excgroup)`` that is not
    an instance of the anyio cancellation class is printed with
    ``click.echo(..., err=True)``; cancellation exceptions are skipped.
    """
    from jumpstarter_cli_common.exceptions import leaf_exceptions

    for leaf in leaf_exceptions(excgroup):
        if isinstance(leaf, anyio.get_cancelled_exc_class()):
            continue  # cancellations are not reported
        # The message text, including its spacing, is kept exactly as before.
        report = f"Exception while serving on the exporter: {type(leaf).__name__} : {leaf} "
        click.echo(report, err=True)
24+
25+
26+ def _reap_zombie_processes (capture_child = None ):
27+ """Reap zombie processes when running as PID 1."""
28+ try :
29+ while True :
30+ try :
31+ pid , status = os .waitpid (- 1 , os .WNOHANG )
32+ if pid == 0 :
33+ break # No more children
34+ if capture_child and pid == capture_child ['pid' ]:
35+ capture_child ['status' ] = status
36+ logger .debug (f"PARENT: Reaped zombie process { pid } with status { status } " )
37+ except ChildProcessError :
38+ break # No more children
39+ except Exception as e :
40+ logger .warning (f"PARENT: Error during zombie reaping: { e } " )
41+
42+
1543def _handle_child (config ):
1644 """Handle child process with graceful shutdown."""
1745 async def serve_with_graceful_shutdown ():
@@ -28,6 +56,7 @@ async def signal_handler():
2856 continue # Ignore duplicate signals
2957 received_signal = sig
3058 logger .info ("CHILD: Received %d (%s)" , received_signal , signal .Signals (received_signal ).name )
59+
3160 if exporter :
3261 # Terminate exporter. SIGHUP waits until current lease is let go. Later SIGTERM still overrides
3362 if received_signal != signal .SIGHUP :
@@ -45,13 +74,7 @@ async def signal_handler():
4574 try :
4675 await exporter .serve ()
4776 except* Exception as excgroup :
48- from jumpstarter_cli_common .exceptions import leaf_exceptions
49- for exc in leaf_exceptions (excgroup ):
50- if not isinstance (exc , anyio .get_cancelled_exc_class ()):
51- click .echo (
52- f"Exception while serving on the exporter: { type (exc ).__name__ } : { exc } " ,
53- err = True ,
54- )
77+ _handle_exporter_exceptions (excgroup )
5578
5679 # Cancel the signal handler after exporter completes
5780 signal_tg .cancel_scope .cancel ()
@@ -62,21 +85,38 @@ async def signal_handler():
6285 sys .exit (anyio .run (serve_with_graceful_shutdown ))
6386
6487
88+ def _wait_for_child (pid , child_info ):
89+ """Wait for child process, get status from signal handler if reaped."""
90+ try :
91+ _ , status = os .waitpid (pid , 0 )
92+ except ChildProcessError :
93+ status = child_info ['status' ]
94+ return status
95+
96+
6597def _handle_parent (pid ):
6698 """Handle parent process waiting for child and signal forwarding."""
99+ child_info = {'pid' : pid , 'status' : None }
100+
67101 def parent_signal_handler (signum , _ ):
68- logger .info ("PARENT: Received %d (%s), forwarding to child PID %d" , signum , signal .Signals (signum ).name , pid )
69- if pid and pid > 0 :
70- try :
71- os .kill (pid , signum )
72- except ProcessLookupError :
73- pass
102+ if signum == signal .SIGCHLD and os .getpid () == 1 :
103+ _reap_zombie_processes (capture_child = child_info ) # capture our own direct child if reaped
104+ elif signum != signal .SIGCHLD :
105+ logger .info ("PARENT: Got %d (%s), forwarding to child PG %d" , signum , signal .Signals (signum ).name , pid )
106+ if pid > 0 :
107+ try :
108+ os .killpg (pid , signum )
109+ except (ProcessLookupError , OSError ):
110+ pass
74111
75112 # Set up signal handlers after fork
76- for sig in (signal .SIGINT , signal .SIGTERM , signal .SIGHUP , signal .SIGQUIT ):
113+ for sig in (signal .SIGINT , signal .SIGTERM , signal .SIGHUP , signal .SIGQUIT , signal . SIGCHLD ):
77114 signal .signal (sig , parent_signal_handler )
78115
79- _ , status = os .waitpid (pid , 0 )
116+ status = _wait_for_child (pid , child_info )
117+ if status is None :
118+ return None
119+
80120 if os .WIFEXITED (status ):
81121 # Interpret child exit code
82122 child_exit_code = os .WEXITSTATUS (status )
@@ -100,6 +140,7 @@ def _serve_with_exc_handling(config):
100140 if (exit_code := _handle_parent (pid )) is not None :
101141 return exit_code
102142 else :
143+ os .setsid () # Become group leader so all spawned subprocesses are reached by parent's signals
103144 _handle_child (config )
104145 sys .exit (1 ) # should never happen
105146
0 commit comments