Switch BestOfN and Tournament from task+n to candidates list

congwang-mk · congwang-mk · commit eab13271875e · 2026-02-23T12:02:21.000-08:00
Signed-off-by: Cong Wang <cwang@multikernel.io>
diff --git a/README.md b/README.md
@@ -109,22 +109,27 @@ if outcome.committed:
 
 ### Best-of-N with scoring
 
-Run the same task N times (e.g. with different random seeds or temperatures)
-and commit the highest-scoring success.
+Run N candidates in parallel and commit the highest-scoring success.
 
 Use when quality matters more than speed: code generation where you want
 the cleanest output across multiple temperatures, translation with a BLEU
-scorer picking the best variant, or any task with a reliable quality metric
-where the same prompt can produce varying results.
+scorer picking the best variant, or any task with a reliable quality metric.
+Pairs naturally with the `n=` parameter in OpenAI's Chat Completions API
+to generate N variations in a single call, then test each in an isolated
+branch.
 
 ```python
 from branching import BestOfN
 
-def scored_task(path: Path, attempt: int) -> tuple[bool, float]:
-    result = run_agent(workdir=path, seed=attempt)
-    return result.passed, result.quality_score
+def make_candidate(code: str):
+    def candidate(path: Path) -> tuple[bool, float]:
+        (path / "solution.py").write_text(code)
+        passed = run_tests(path)
+        return passed, evaluate_quality(path) if passed else 0.0
+    return candidate
 
-outcome = BestOfN(scored_task, n=5)(ws)
+candidates = [make_candidate(c) for c in generate_solutions(n=5)]
+outcome = BestOfN(candidates)(ws)
 ```
 
 ### Reflexion (retry with feedback)
@@ -217,7 +222,7 @@ outcome = BeamSearch(
 
 ### Tournament (pairwise elimination)
 
-Generate N candidates in parallel, then narrow to one through pairwise
+Run N candidates in parallel, then narrow to one through pairwise
 elimination via a judge function. The convergent dual of Tree of Thoughts:
 starts wide, narrows to one.
 
@@ -229,14 +234,19 @@ any setting where relative ranking is easier than absolute scoring.
 ```python
 from branching import Tournament
 
-def generate_patch(path: Path, index: int) -> bool:
-    return run_agent(workdir=path, seed=index)
+def make_patch(code: str):
+    def candidate(path: Path) -> bool:
+        (path / "fix.patch").write_text(code)
+        return apply_and_test(path)
+    return candidate
+
+candidates = [make_patch(p) for p in generate_patches(n=8)]
 
 def judge(path_a: Path, path_b: Path) -> int:
     # 0 = a wins, 1 = b wins
     return llm_compare(path_a / "diff.patch", path_b / "diff.patch")
 
-outcome = Tournament(generate_patch, n=8, judge=judge)(ws)
+outcome = Tournament(candidates, judge=judge)(ws)
 ```
 
 ### Cascaded speculation (adaptive fan-out)
@@ -349,7 +359,7 @@ from branching import ResourceLimits, BestOfN
 
 limits = ResourceLimits(memory=512 * 1024 * 1024, cpu=0.5)  # 512 MB, 50% CPU
 
-outcome = BestOfN(scored_task, n=5, resource_limits=limits)(ws)
+outcome = BestOfN(candidates, resource_limits=limits)(ws)
 ```
 
 All patterns accept `resource_limits`: `Speculate`, `BestOfN`, `Reflexion`,
diff --git a/src/branching/agent/patterns.py b/src/branching/agent/patterns.py
@@ -19,31 +19,30 @@
 
 
 class BestOfN:
-    """Run N copies of a task in parallel, commit the highest-scoring one.
+    """Run N candidates in parallel, commit the highest-scoring one.
 
     All candidates run concurrently. Each holds its branch open after
     finishing. The main thread picks the winner based on score, then
     signals each thread to commit (winner) or abort (losers).
 
-    The task callable receives (path, attempt_index) and returns
+    Each candidate callable receives (path,) and returns
     (success: bool, score: float).
 
     Example:
-        outcome = BestOfN(scored_task, n=5)(ws)
-        # Commits the highest-scoring successful attempt
+        candidates = [lambda p: (run_tests(p), score(p)) for _ in range(5)]
+        outcome = BestOfN(candidates)(ws)
+        # Commits the highest-scoring successful candidate
     """
 
     def __init__(
         self,
-        task: Callable[[Path, int], tuple[bool, float]],
-        n: int = 3,
+        candidates: Sequence[Callable[[Path], tuple[bool, float]]],
         *,
         timeout: float | None = None,
         resource_limits: ResourceLimits | None = None,
         group_limits: ResourceLimits | None = None,
     ):
-        self._task = task
-        self._n = n
+        self._candidates = list(candidates)
         self._timeout = timeout
         self._resource_limits = resource_limits
         self._group_limits = group_limits
@@ -70,7 +69,7 @@ def __call__(self, workspace: Workspace) -> SpeculationOutcome:
                 kill_scope(root_cgroup)
 
     def _run(self, workspace: Workspace, root_cgroup: Optional[Path]) -> SpeculationOutcome:
-        n = self._n
+        n = len(self._candidates)
         results: list[Optional[SpeculationResult]] = [None] * n
         task_done = [threading.Event() for _ in range(n)]
         decision_ready = [threading.Event() for _ in range(n)]
@@ -96,7 +95,7 @@ def _on_scope(sp: Path, _i: int = index) -> None:
                             branch_scopes[_i] = sp
 
                         ret = run_in_process(
-                            self._task, (b.path, index),
+                            self._candidates[index], (b.path,),
                             workspace=b.path,
                             limits=self._resource_limits,
                             parent_cgroup=root_cgroup,
@@ -911,20 +910,19 @@ def _on_sub_scope(
 
 
 class Tournament:
-    """Pairwise elimination bracket: generate N candidates, compare
+    """Pairwise elimination bracket: run N candidates, compare
     pairwise via a judge function, commit the final winner.
 
     The convergent dual of TreeOfThoughts: starts wide, narrows to one.
 
     Example:
-        outcome = Tournament(task, n=4, judge=judge)(ws)
+        outcome = Tournament(candidates, judge=judge)(ws)
         # Commits the bracket winner
     """
 
     def __init__(
         self,
-        task: Callable[[Path, int], bool],
-        n: int = 4,
+        candidates: Sequence[Callable[[Path], bool]],
         *,
         judge: Callable[[Path, Path], int],
         timeout: float | None = None,
@@ -933,17 +931,16 @@ def __init__(
     ):
         """
         Args:
-            task: Callable(branch_path, candidate_index) → success.
-                  Produces output in the branch directory.
-            n: Number of candidates to generate.
+            candidates: Callables that take a Path (branch working dir)
+                  and return True on success. Each produces output in
+                  the branch directory for the judge to compare.
             judge: Callable(path_a, path_b) → 0 (a wins) or 1 (b wins).
                    Compares two candidates' branches during elimination.
             timeout: Overall timeout in seconds.
             resource_limits: Optional per-branch resource limits.
             group_limits: Optional resource limits for the root cgroup.
         """
-        self._task = task
-        self._n = n
+        self._candidates = list(candidates)
         self._judge = judge
         self._timeout = timeout
         self._resource_limits = resource_limits
@@ -992,7 +989,7 @@ def __call__(self, workspace: Workspace) -> SpeculationOutcome:
                 kill_scope(root_cgroup)
 
     def _run(self, workspace: Workspace, root_cgroup: Optional[Path]) -> SpeculationOutcome:
-        n = self._n
+        n = len(self._candidates)
         results: list[Optional[SpeculationResult]] = [None] * n
         branch_paths: list[Optional[Path]] = [None] * n
         task_done = [threading.Event() for _ in range(n)]
@@ -1020,7 +1017,7 @@ def _on_scope(sp: Path, _i: int = index) -> None:
                             branch_scopes[_i] = sp
 
                         success = run_in_process(
-                            self._task, (b.path, index),
+                            self._candidates[index], (b.path,),
                             workspace=b.path,
                             limits=self._resource_limits,
                             parent_cgroup=root_cgroup,
diff --git a/src/cli/best_of_n.py b/src/cli/best_of_n.py
@@ -11,14 +11,14 @@
 from . import _parse_group_limits, _parse_resource_limits, _print_error, _resolve_workspace
 
 
-def _make_task(cmd: list[str]):
-    """Wrap a command into a BestOfN task callable.
+def _make_candidate(cmd: list[str], index: int):
+    """Wrap a command into a BestOfN candidate callable.
 
-    Returns a callable(path, index) -> (success, score).
+    Returns a callable(path) -> (success, score).
     The child process can write a score float to fd 3.
     """
 
-    def task(workdir: Path, index: int) -> tuple[bool, float]:
+    def candidate(workdir: Path) -> tuple[bool, float]:
         # Create a pipe for the child to report its score.
         # Python 3.4+ creates pipe fds with CLOEXEC, so they are
         # automatically closed on exec — only fd 3 (dup2 clears
@@ -57,7 +57,7 @@ def _preexec():
 
         return (success, score)
 
-    return task
+    return candidate
 
 
 def cmd_best_of_n(args) -> int:
@@ -68,10 +68,10 @@ def cmd_best_of_n(args) -> int:
     ws_path = _resolve_workspace(args)
     ws = Workspace(ws_path)
 
-    task = _make_task(args.cmd)
+    candidates = [_make_candidate(args.cmd, i) for i in range(args.n)]
     limits = _parse_resource_limits(args)
     group_limits = _parse_group_limits(args)
-    best = BestOfN(task, n=args.n, timeout=args.timeout, resource_limits=limits, group_limits=group_limits)
+    best = BestOfN(candidates, timeout=args.timeout, resource_limits=limits, group_limits=group_limits)
     outcome = best(ws)
 
     results_summary = []
diff --git a/tests/test_resource_limits.py b/tests/test_resource_limits.py
@@ -415,7 +415,7 @@ def test_speculate_accepts_resource_limits(self):
     def test_best_of_n_accepts_resource_limits(self):
         from branching.agent.patterns import BestOfN
         rl = ResourceLimits(cpu=0.5)
-        bon = BestOfN(lambda p, i: (True, 1.0), n=2, resource_limits=rl)
+        bon = BestOfN([lambda p: (True, 1.0)] * 2, resource_limits=rl)
         assert bon._resource_limits is rl
 
     def test_reflexion_accepts_resource_limits(self):
@@ -444,7 +444,7 @@ def test_tournament_accepts_resource_limits(self):
         from branching.agent.patterns import Tournament
         rl = ResourceLimits(memory=4096)
         t = Tournament(
-            lambda p, i: True, n=2,
+            [lambda p: True] * 2,
             judge=lambda a, b: 0,
             resource_limits=rl,
         )
@@ -468,7 +468,7 @@ def test_speculate_accepts_group_limits(self):
     def test_best_of_n_accepts_group_limits(self):
         from branching.agent.patterns import BestOfN
         gl = ResourceLimits(cpu=2.0)
-        bon = BestOfN(lambda p, i: (True, 1.0), n=2, group_limits=gl)
+        bon = BestOfN([lambda p: (True, 1.0)] * 2, group_limits=gl)
         assert bon._group_limits is gl
 
     def test_reflexion_accepts_group_limits(self):
@@ -497,7 +497,7 @@ def test_tournament_accepts_group_limits(self):
         from branching.agent.patterns import Tournament
         gl = ResourceLimits(memory=4096, cpu=2.0)
         t = Tournament(
-            lambda p, i: True, n=2,
+            [lambda p: True] * 2,
             judge=lambda a, b: 0,
             group_limits=gl,
         )
@@ -923,15 +923,14 @@ def mock_rip(fn, args, *, workspace, limits,
             scope = workspace / ".scope"
             if scope_callback:
                 scope_callback(scope)
-            idx = args[1]
-            return (True, float(idx))
+            return fn(*args)
 
         ws = _mock_workspace()
         with patch("branching.agent.patterns.run_in_process", side_effect=mock_rip), \
              patch("branching.process._cgroup.kill_scope",
                    side_effect=lambda s: killed.append(s)):
             bon = BestOfN(
-                lambda p, i: (True, float(i)), n=2,
+                [lambda p, i=i: (True, float(i)) for i in range(2)],
                 resource_limits=ResourceLimits(memory=1024),
             )
             bon._run(ws, None)
@@ -949,17 +948,21 @@ def mock_rip(fn, args, *, workspace, limits,
             scope = workspace / ".scope"
             if scope_callback:
                 scope_callback(scope)
-            idx = args[1]
-            if idx == 1:
-                time.sleep(2)  # simulate stuck task
-            return (True, float(idx))
+            return fn(*args)
+
+        def fast(p):
+            return (True, 0.0)
+
+        def stuck(p):
+            time.sleep(2)
+            return (True, 1.0)
 
         ws = _mock_workspace()
         with patch("branching.agent.patterns.run_in_process", side_effect=mock_rip), \
              patch("branching.process._cgroup.kill_scope",
                    side_effect=lambda s: killed.append(s)):
             bon = BestOfN(
-                lambda p, i: (True, float(i)), n=2,
+                [fast, stuck],
                 timeout=0.05,
                 resource_limits=ResourceLimits(memory=1024),
             )
@@ -1008,14 +1011,14 @@ def mock_rip(fn, args, *, workspace, limits,
             scope = workspace / ".scope"
             if scope_callback:
                 scope_callback(scope)
-            return True
+            return fn(*args)
 
         ws = _mock_workspace()
         with patch("branching.agent.patterns.run_in_process", side_effect=mock_rip), \
              patch("branching.process._cgroup.kill_scope",
                    side_effect=lambda s: killed.append(s)):
             t = Tournament(
-                lambda p, i: True, n=2,
+                [lambda p: True] * 2,
                 judge=lambda a, b: 0,
                 resource_limits=ResourceLimits(memory=1024),
             )
@@ -1030,7 +1033,7 @@ def test_no_kill_without_resource_limits(self):
         with patch("branching.process._cgroup.kill_scope",
                    side_effect=lambda s: killed.append(s)):
             t = Tournament(
-                lambda p, i: True, n=2,
+                [lambda p: True] * 2,
                 judge=lambda a, b: 0,
             )
             t._run(ws, None)
diff --git a/tests/test_speculate.py b/tests/test_speculate.py