1212import time
1313import shlex
1414import json
15- from multiprocessing import Process , Queue , set_start_method
15+ from multiprocessing import Process , Queue , SimpleQueue , set_start_method
1616
1717from RLTest .env import Env , TestAssertionFailure , Defaults
1818from RLTest .utils import Colors , fix_modules , fix_modulesArgs , is_github_actions
@@ -357,25 +357,29 @@ def __enter__(self):
357357 def __exit__ (self , type , value , traceback ):
358358 self .runner .takeEnvDown ()
359359
360+ # Sentinel used to signal the summary-drainer thread to stop. Must be
361+ # picklable (SimpleQueue pickles everything) and distinguishable from any
362+ # payload a worker might put, which is always a dict.
363+ _SUMMARY_DRAIN_STOP = ('__rltest_summary_stop__' ,)
364+
365+
360366def _join_workers_with_summary_drain (processes , summary , timeout = None ):
361367 """Wait for all worker processes to exit while continuously draining the
362368 ``summary`` queue, and return the list of collected summary entries.
363369
364- A background thread drains ``summary`` so worker feeder threads never
365- block writing to a full summary-pipe buffer. Without this,
366- ``on_timeout``'s ``summary.join_thread()`` and Python's end-of-process
367- queue finalization can both block in ``pipe_write``, in turn hanging
368- ``p.join()`` here indefinitely.
370+ A background thread drains ``summary`` so that worker ``put()`` calls
371+ never block on a full summary-pipe buffer. Without this, ``on_timeout``
372+ and the final ``summary.put()`` at worker exit can block in
373+ ``pipe_write``, in turn hanging ``p.join()`` here indefinitely.
369374 """
370- stop = threading .Event ()
371375 collected = []
372376
373377 def _drain ():
374- while not stop . is_set () :
375- try :
376- collected . append ( summary . get ( timeout = 0.1 ))
377- except Exception :
378- pass
378+ while True :
379+ item = summary . get ()
380+ if item == _SUMMARY_DRAIN_STOP :
381+ break
382+ collected . append ( item )
379383
380384 drainer = threading .Thread (target = _drain )
381385 drainer .start ()
@@ -385,13 +389,8 @@ def _drain():
385389 remaining = None if deadline is None else max (0.0 , deadline - time .time ())
386390 p .join (timeout = remaining )
387391 finally :
388- stop . set ( )
392+ summary . put ( _SUMMARY_DRAIN_STOP )
389393 drainer .join ()
390- while True :
391- try :
392- collected .append (summary .get_nowait ())
393- except Exception :
394- break
395394 return collected
396395
397396
@@ -999,17 +998,14 @@ def on_timeout():
999998 except Exception as e :
1000999 self .handleFailure (testFullName = test .name , testname = test .name , error_msg = Colors .Bred ('Exception on timeout function %s' % str (e )))
10011000 finally :
1001+ # `summary` is a SimpleQueue, so its put() writes straight to
1002+ # the pipe and needs no explicit flush. `results` is a Queue
1003+ # with a feeder thread; close() + join_thread() ensures the
1004+ # results.put() below is written to the pipe before the watcher thread
1005+ # calls os._exit(1), which bypasses Python finalization.
1006+ summary .put ({'done' : done , 'failures' : self .testsFailed })
10021007 results .put ({'test_name' : test .name , "output" : output .getvalue ()}, block = False )
1003- summary .put ({'done' : done , 'failures' : self .testsFailed }, block = False )
1004- # The watcher thread calls os._exit(1) immediately after this
1005- # closure returns, bypassing Python finalization. Close the
1006- # queues and join their feeder threads here so pending put()s
1007- # are flushed to the pipes first. (The coordinator drains the
1008- # summary queue concurrently, which prevents join_thread() from
1009- # blocking on a full pipe.)
10101008 results .close ()
1011- summary .close ()
1012- summary .join_thread ()
10131009 results .join_thread ()
10141010 done += self .run_single_test (test , on_timeout )
10151011
@@ -1018,10 +1014,10 @@ def on_timeout():
10181014 self .takeEnvDown (fullShutDown = True )
10191015
10201016 # serialized the results back
1021- summary .put ({'done' : done , 'failures' : self .testsFailed }, block = False )
1017+ summary .put ({'done' : done , 'failures' : self .testsFailed })
10221018
10231019 results = Queue ()
1024- summary = Queue ()
1020+ summary = SimpleQueue ()
10251021 # Open group for all tests at the start (parallel execution)
10261022 self ._openGitHubActionsTestsGroup ()
10271023 if self .parallelism == 1 :
0 commit comments