Commit 4a8a452

Fix parallel-coordinator hang when summary pipe saturates
The coordinator drained the 'summary' queue only after joining all worker processes. With enough queued data (or a single large testsFailed dict), the summary-pipe buffer (~64 KiB on Linux) saturates and worker feeder threads block in pipe_write, both inside on_timeout's join_thread() and during Python's end-of-process queue finalization. This in turn hangs the coordinator's p.join() indefinitely.

Introduce a module-level helper, _join_workers_with_summary_drain, that joins workers while continuously draining 'summary' from a background thread, and use it in execute(). Also correct the stale comment in the on_timeout closure to describe the actual watcher-thread os._exit(1) flow.
1 parent 96a65de commit 4a8a452
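
For context, the failure mode is the classic multiprocessing pitfall of joining a process that still has undelivered queue data (documented in the Python manual under "Joining processes that use queues"). A minimal standalone sketch of the deadlock, written here for illustration and not part of this commit:

import multiprocessing as mp

def child(q):
    # ~1 MiB payload, far beyond the ~64 KiB pipe buffer: the child's
    # queue feeder thread blocks in pipe_write until someone reads.
    q.put('x' * (1 << 20))

if __name__ == '__main__':
    q = mp.Queue()
    p = mp.Process(target=child, args=(q,))
    p.start()
    p.join(timeout=5)      # the pre-fix pattern: join before draining
    print(p.is_alive())    # True: the join timed out on the stuck feeder
    data = q.get()         # draining the queue unblocks the feeder thread...
    p.join()               # ...and now the child exits normally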

2 files changed: 133 additions & 10 deletions

File tree:

RLTest/__main__.py
tests/unit/test_parallel_drain.py

RLTest/__main__.py

Lines changed: 47 additions & 10 deletions

@@ -357,6 +357,44 @@ def __enter__(self):
     def __exit__(self, type, value, traceback):
         self.runner.takeEnvDown()
 
+def _join_workers_with_summary_drain(processes, summary, timeout=None):
+    """Wait for all worker processes to exit while continuously draining the
+    ``summary`` queue, and return the list of collected summary entries.
+
+    A background thread drains ``summary`` so worker feeder threads never
+    block writing to a full summary-pipe buffer. Without this,
+    ``on_timeout``'s ``summary.join_thread()`` and Python's end-of-process
+    queue finalization can both block in ``pipe_write``, in turn hanging
+    ``p.join()`` here indefinitely.
+    """
+    stop = threading.Event()
+    collected = []
+
+    def _drain():
+        while not stop.is_set():
+            try:
+                collected.append(summary.get(timeout=0.1))
+            except Exception:
+                pass
+
+    drainer = threading.Thread(target=_drain)
+    drainer.start()
+    try:
+        deadline = None if timeout is None else time.time() + timeout
+        for p in processes:
+            remaining = None if deadline is None else max(0.0, deadline - time.time())
+            p.join(timeout=remaining)
+    finally:
+        stop.set()
+        drainer.join()
+    while True:
+        try:
+            collected.append(summary.get_nowait())
+        except Exception:
+            break
+    return collected
+
+
 class TestTimeLimit(object):
     """
     A test timeout watcher. The watcher opens thread that sleep for the

@@ -963,7 +1001,12 @@ def on_timeout():
         finally:
             results.put({'test_name': test.name, "output": output.getvalue()}, block=False)
             summary.put({'done': done, 'failures': self.testsFailed}, block=False)
-            # After we return the processes will be killed, so we must make sure the queues are drained properly.
+            # The watcher thread calls os._exit(1) immediately after this
+            # closure returns, bypassing Python finalization. Close the
+            # queues and join their feeder threads here so pending put()s
+            # are flushed to the pipes first. (The coordinator drains the
+            # summary queue concurrently, which prevents join_thread() from
+            # blocking on a full pipe.)
             results.close()
             summary.close()
             summary.join_thread()

@@ -1008,15 +1051,9 @@ def on_timeout():
             output = res['output']
             print('%s' % output, end="")
 
-        for p in processes:
-            p.join()
-
-        # join results
-        while True:
-            try:
-                res = summary.get(timeout=1)
-            except Exception as e:
-                break
+        # Join worker processes while concurrently draining `summary`,
+        # so their feeder threads do not block on a full pipe buffer.
+        for res in _join_workers_with_summary_drain(processes, summary):
             done += res['done']
             self.testsFailed.update(res['failures'])
 
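
A usage sketch of the new helper from a caller's perspective (hypothetical driver code, not part of this commit; it assumes RLTest is importable so the helper can be pulled from RLTest.__main__):

import multiprocessing as mp
from RLTest.__main__ import _join_workers_with_summary_drain

def worker(summary):
    # Each worker reports one entry; even large payloads are safe because
    # the coordinator drains the queue while the joins are in flight.
    summary.put({'done': 1, 'failures': {}})

if __name__ == '__main__':
    summary = mp.Queue()
    procs = [mp.Process(target=worker, args=(summary,)) for _ in range(4)]
    for p in procs:
        p.start()
    entries = _join_workers_with_summary_drain(procs, summary)
    assert sum(e['done'] for e in entries) == 4

Note the helper's two-phase drain: the background thread polls with get(timeout=0.1) while the joins run, and the final get_nowait() loop sweeps up anything that arrived between the last poll and drainer shutdown.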
10221059

tests/unit/test_parallel_drain.py

Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
+"""Regression test for a hang in the parallel test coordinator.
+
+Prior to the fix, the coordinator joined all worker processes before draining
+the ``summary`` queue. With enough data queued (or a single large
+``self.testsFailed`` dict), the summary pipe buffer (~64 KiB on Linux)
+saturates and worker feeder threads block in ``pipe_write`` during Python's
+end-of-process queue finalization (and similarly inside ``on_timeout``'s
+``summary.join_thread()``). That causes the coordinator's ``p.join()`` to
+hang indefinitely.
+
+The fix drains ``summary`` from a background thread while workers are being
+joined. This test reproduces the saturation scenario and asserts the helper
+completes within a bounded time with every worker cleanly exited.
+"""
+
+import multiprocessing as mp
+import sys
+import time
+from unittest import TestCase
+
+from RLTest.__main__ import _join_workers_with_summary_drain
+
+
+# ~32 KiB per message × 8 workers = 256 KiB total, comfortably exceeding the
+# typical 64 KiB pipe buffer on Linux, so at least some feeder threads will
+# block on ``pipe_write`` unless the parent is actively reading.
+_PAYLOAD_BYTES = 32 * 1024
+_NUM_WORKERS = 8
+_JOIN_TIMEOUT_SECS = 30.0
+
+
+def _worker_puts_large_summary(summary):
+    summary.put({
+        'done': 1,
+        'failures': {},
+        'payload': 'x' * _PAYLOAD_BYTES,
+    })
+    # Return normally; Python finalization will join the feeder thread,
+    # which is where a non-draining parent would cause the hang.
+
+
+class TestJoinWorkersWithSummaryDrain(TestCase):
+
+    def setUp(self):
+        if sys.platform == 'win32':
+            self.skipTest('fork start method is unavailable on Windows')
+        self._ctx = mp.get_context('fork')
+        self._procs = []
+        self._summary = None
+
+    def tearDown(self):
+        # Safety net: if the helper ever hangs despite the fix, make sure the
+        # pytest session can still exit cleanly.
+        for p in self._procs:
+            if p.is_alive():
+                p.kill()
+            p.join(timeout=5)
+
+    def test_large_summary_does_not_hang(self):
+        self._summary = self._ctx.Queue()
+        self._procs = [
+            self._ctx.Process(
+                target=_worker_puts_large_summary,
+                args=(self._summary,),
+            )
+            for _ in range(_NUM_WORKERS)
+        ]
+        for p in self._procs:
+            p.start()
+
+        start = time.time()
+        collected = _join_workers_with_summary_drain(
+            self._procs, self._summary, timeout=_JOIN_TIMEOUT_SECS,
+        )
+        elapsed = time.time() - start
+
+        for p in self._procs:
+            self.assertFalse(
+                p.is_alive(),
+                'worker still alive after drain-join; summary pipe likely saturated',
+            )
+            self.assertEqual(p.exitcode, 0)
+        self.assertEqual(len(collected), _NUM_WORKERS)
+        # The helper should return well under its own timeout; we only assert a
+        # loose upper bound to avoid flakiness on slow machines.
+        self.assertLess(elapsed, _JOIN_TIMEOUT_SECS)
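
Assuming the repository's usual pytest setup, the test can be run in isolation with something like python -m pytest tests/unit/test_parallel_drain.py. If the drain behavior ever regresses, the helper's 30-second timeout and the tearDown kill are what keep the suite itself from hanging.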
