Skip to content

Commit f24f5a8

Browse files
committed
Add scores and evaluate params to BestOfN for logprobs integration
Signed-off-by: Cong Wang <cwang@multikernel.io>
1 parent eab1327 commit f24f5a8

3 files changed

Lines changed: 158 additions & 9 deletions

File tree

README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,10 @@ Pairs naturally with the ``n=`` parameter in OpenAI's Chat Completions API
118118
to generate N variations in a single call, then test each in an isolated
119119
branch.
120120

121+
Candidates can return ``bool`` or ``(bool, float)``. Scoring is flexible:
122+
pass pre-computed ``scores`` (e.g. from logprobs), provide an ``evaluate``
123+
callback for post-execution scoring, or let candidates score themselves.
124+
121125
```python
122126
from branching import BestOfN
123127

@@ -132,6 +136,32 @@ candidates = [make_candidate(c) for c in generate_solutions(n=5)]
132136
outcome = BestOfN(candidates)(ws)
133137
```
134138

139+
**Logprobs workflow** — score candidates externally using model confidence,
140+
then let BestOfN pick the highest-scoring one that passes:
141+
142+
```python
143+
from branching import BestOfN
144+
import openai
145+
146+
client = openai.OpenAI()
147+
resp = client.chat.completions.create(
148+
model="gpt-4o", n=5, logprobs=True, top_logprobs=1,
149+
messages=[{"role": "user", "content": prompt}],
150+
)
151+
152+
# Pre-computed confidence scores from logprobs
153+
logprob_scores = [
154+
sum(t.logprob for t in c.logprobs.content) / len(c.logprobs.content)
155+
for c in resp.choices
156+
]
157+
158+
# Candidates just apply code and test — return bare bool
159+
candidates = [make_test(c.message.content) for c in resp.choices]
160+
161+
# BestOfN picks the highest-logprob passing candidate
162+
outcome = BestOfN(candidates, scores=logprob_scores)(ws)
163+
```
164+
135165
### Reflexion (retry with feedback)
136166

137167
Run a task, and if it fails, generate a critique and feed it back into the

src/branching/agent/patterns.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,15 @@ class BestOfN:
2525
finishing. The main thread picks the winner based on score, then
2626
signals each thread to commit (winner) or abort (losers).
2727
28-
Each candidate callable receives (path,) and returns
29-
(success: bool, score: float).
28+
Each candidate callable receives (path,) and returns ``bool`` or
29+
``(success: bool, score: float)``. A bare ``bool`` defaults to
30+
score 1.0/0.0 unless overridden by *scores* or *evaluate*.
31+
32+
Score resolution per successful candidate (highest priority first):
33+
1. evaluate(path) — external scorer callback
34+
2. candidate (bool, f) — candidate's own score
35+
3. scores[i] — pre-computed score (e.g. logprobs)
36+
4. 1.0 — default
3037
3138
Example:
3239
candidates = [lambda p: (run_tests(p), score(p)) for _ in range(5)]
@@ -36,17 +43,32 @@ class BestOfN:
3643

3744
def __init__(
    self,
    candidates: Sequence[Callable[[Path], bool | tuple[bool, float]]],
    *,
    scores: Sequence[float] | None = None,
    evaluate: Callable[[Path], float] | None = None,
    timeout: float | None = None,
    resource_limits: ResourceLimits | None = None,
    group_limits: ResourceLimits | None = None,
):
    """Configure a best-of-N speculative run.

    Args:
        candidates: Callables receiving a branch path and returning either
            a bare ``bool`` or a ``(success, score)`` tuple.
        scores: Optional pre-computed per-candidate scores (e.g. mean
            logprobs); used for candidates that return a bare ``bool``.
        evaluate: Optional external scorer called with each successful
            candidate's path; takes priority over all other scores.
        timeout: Per-candidate timeout in seconds, or ``None`` for no limit.
        resource_limits: Per-branch resource limits.
        group_limits: Resource limits shared across the candidate group.

    Raises:
        ValueError: If *scores* is given but its length does not match the
            number of candidates.
    """
    self._candidates = list(candidates)
    self._scores = list(scores) if scores is not None else None
    # Fail fast on a length mismatch here, rather than with an opaque
    # IndexError inside a worker thread when the extra candidate is scored.
    if self._scores is not None and len(self._scores) != len(self._candidates):
        raise ValueError(
            f"scores has {len(self._scores)} entries but there are "
            f"{len(self._candidates)} candidates"
        )
    self._evaluate = evaluate
    self._timeout = timeout
    self._resource_limits = resource_limits
    self._group_limits = group_limits
4960

61+
def _score(self, ret, path, index):
    """Normalize a candidate's return value into ``(success, score)``.

    Resolution for a *successful* candidate, highest priority first:
    ``evaluate(path)`` > candidate's own ``(bool, float)`` > ``scores[index]``
    > default ``1.0``. A failed candidate scores ``0.0`` unless it reported
    its own score via a tuple — pre-computed *scores* never apply to
    failures, so a high external score cannot inflate a losing branch.
    """
    if isinstance(ret, (tuple, list)):
        # Candidate supplied its own score; the scores param is ignored.
        success, score = ret
        success = bool(success)
    else:
        success = bool(ret)
        if success and self._scores is not None:
            # Pre-computed scores (e.g. logprobs) apply only to passing
            # bool candidates; failures keep the 0.0 default.
            score = self._scores[index]
        else:
            score = 1.0 if success else 0.0
    if self._evaluate is not None and success:
        # External scorer has the highest priority; run on successes only.
        score = self._evaluate(path)
    return success, score
71+
5072
def __call__(self, workspace: Workspace) -> SpeculationOutcome:
5173
import os as _os
5274

@@ -101,10 +123,10 @@ def _on_scope(sp: Path, _i: int = index) -> None:
101123
parent_cgroup=root_cgroup,
102124
scope_callback=_on_scope if self._resource_limits else None,
103125
)
104-
success, score = ret
105-
result.success = bool(success)
126+
success, score = self._score(ret, b.path, index)
127+
result.success = success
106128
result.score = score
107-
result.return_value = (success, score)
129+
result.return_value = ret
108130
except Exception as e:
109131
result.exception = e
110132

tests/test_speculate.py

Lines changed: 100 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,20 @@ def test_picks_highest_score(self):
121121
assert outcome.winner.score == 0.9
122122
assert len(outcome.all_results) == 3
123123

124+
def test_bool_candidates(self):
    """A bare bool return maps to the default 1.0 (pass) / 0.0 (fail) score."""
    ws = _make_workspace()

    # First candidate fails, second passes; bind the flag via default arg.
    candidates = [lambda p, ok=ok: ok for ok in (False, True)]

    outcome = BestOfN(candidates)(ws)
    assert outcome.committed
    assert outcome.winner.branch_index == 1
    assert outcome.winner.score == 1.0
137+
124138
def test_skips_failures(self):
125139
ws = _make_workspace()
126140

@@ -136,7 +150,7 @@ def test_skips_failures(self):
136150
def test_all_fail(self):
137151
ws = _make_workspace()
138152

139-
candidates = [lambda p: (False, 0.0) for _ in range(3)]
153+
candidates = [lambda p: False for _ in range(3)]
140154

141155
outcome = BestOfN(candidates)(ws)
142156
assert not outcome.committed
@@ -177,16 +191,99 @@ def test_runs_in_parallel(self):
177191
ws = _make_workspace()
178192
start = time.monotonic()
179193

180-
def slow(path: Path) -> tuple[bool, float]:
194+
def slow(path: Path) -> bool:
181195
time.sleep(0.2)
182-
return True, 1.0
196+
return True
183197

184198
outcome = BestOfN([slow, slow, slow])(ws)
185199
elapsed = time.monotonic() - start
186200
assert outcome.committed
187201
# 3 tasks @ 0.2s each; parallel should be ~0.2s, sequential ~0.6s
188202
assert elapsed < 0.5
189203

204+
def test_scores_param(self):
    """External logprob-style scores rank bool-returning candidates."""
    ws = _make_workspace()

    def passing(p):
        return True

    confidences = [-2.5, -0.1, -1.3]

    outcome = BestOfN([passing] * 3, scores=confidences)(ws)
    assert outcome.committed
    # Index 1 carries the least-negative (i.e. highest) logprob.
    assert outcome.winner.branch_index == 1
    assert outcome.winner.score == -0.1
215+
216+
def test_scores_ignored_for_tuple_return(self):
    """A (bool, float) return takes precedence over the scores param."""
    ws = _make_workspace()

    def low(p):
        return True, 5.0

    def high(p):
        return True, 10.0

    outcome = BestOfN([low, high], scores=[99.0, 1.0])(ws)
    assert outcome.committed
    # The candidates' own 10.0 beats the external 99.0, which is ignored.
    assert outcome.winner.branch_index == 1
    assert outcome.winner.score == 10.0
229+
230+
def test_scores_skipped_for_failures(self):
    """A failing bool candidate cannot win on its pre-computed score."""
    ws = _make_workspace()

    outcome = BestOfN(
        [lambda p: False, lambda p: True],
        scores=[99.0, 0.5],
    )(ws)
    assert outcome.committed
    # The 99.0 belongs to the failed branch, so the 0.5 branch wins.
    assert outcome.winner.branch_index == 1
    assert outcome.winner.score == 0.5
239+
240+
def test_evaluate_callback(self):
    """evaluate(path) replaces the scores candidates report themselves."""
    ws = _make_workspace()

    candidates = [
        lambda p: (True, 10.0),  # candidate claims 10.0
        lambda p: (True, 1.0),   # candidate claims 1.0
    ]

    calls = []

    def evaluate(path):
        calls.append(path)
        return float(len(calls))  # 1.0 for the first call, 2.0 for the second

    outcome = BestOfN(candidates, evaluate=evaluate)(ws)
    assert outcome.committed
    assert len(calls) == 2
    # Thread scheduling decides which branch draws 2.0, but the winner's
    # score is always evaluate's maximum. Pinning it proves the override:
    # if a candidate's own score leaked through, the winner would score 10.0.
    assert outcome.winner.score == 2.0
257+
258+
def test_evaluate_not_called_on_failure(self):
    """The external scorer runs only for passing candidates."""
    ws = _make_workspace()

    seen = []

    def evaluate(path):
        seen.append(path)
        return 1.0

    outcome = BestOfN([lambda p: False, lambda p: True], evaluate=evaluate)(ws)
    assert outcome.committed
    # Only the single passing branch should have been scored.
    assert len(seen) == 1
271+
272+
def test_evaluate_overrides_scores_param(self):
    """evaluate outranks the scores parameter as well as tuple scores."""
    ws = _make_workspace()

    outcome = BestOfN(
        [lambda p: True, lambda p: True],
        scores=[100.0, 1.0],
        evaluate=lambda p: 42.0,
    )(ws)
    assert outcome.committed
    # Both branches score 42.0, so either may win — but the winning score
    # must be evaluate's value, never 100.0 from the scores param.
    assert outcome.winner.score == 42.0
286+
190287

191288
class TestReflexion:
192289
def test_succeeds_first_try(self):

0 commit comments

Comments
 (0)