From 71c25136c3ebb2b9c10e5f83a98d4bd5d1b540c9 Mon Sep 17 00:00:00 2001
From: Hao Zhu <prokilchu@gmail.com>
Date: Thu, 14 May 2026 00:35:34 +0000
Subject: [PATCH] runner/coop: coerce message timestamps to float before
 sorting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

execute_coop crashes mid-rollout with
``TypeError: '<' not supported between instances of 'int' and 'str'``
when one agent adapter reports numeric timestamps (mini_swe_agent uses
time.time() floats) and another reports ISO strings (OpenHands SDK).
The sort fires before agent{fid}_traj.json is written, so callers that
rely on the structured trajectory output get nothing — even though the
underlying rollout ran to completion.

Extract the timestamp key as a module-level
``_message_timestamp_key`` helper that best-effort coerces to float
(None / non-numeric → 0.0), then have execute_coop's sort call it.

Adds tests/runner/test_coop.py with regression coverage:
  - mixed int / float / ISO-string / None / missing timestamps all sort
    without raising
  - received messages stay filtered out before the sort

All 31 runner tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/cooperbench/runner/coop.py |  26 +++++++-
 tests/runner/test_coop.py      | 108 +++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 tests/runner/test_coop.py

diff --git a/src/cooperbench/runner/coop.py b/src/cooperbench/runner/coop.py
index d2b2414..304766f 100644
--- a/src/cooperbench/runner/coop.py
+++ b/src/cooperbench/runner/coop.py
@@ -17,6 +17,27 @@
 from cooperbench.utils import console, get_image_name
 
 
+def _message_timestamp_key(msg: dict) -> float:
+    """Return a sortable float for a conversation message.
+
+    Timestamps in ``_extract_conversation`` output come from multiple agent
+    adapters: mini_swe_agent emits floats (``time.time()``), the OpenHands
+    SDK adapter can emit ISO strings, and some events have no timestamp at
+    all. Coerce to float on a best-effort basis so the sort doesn't blow up
+    with ``TypeError: '<' not supported between instances of 'int' and 'str'``
+    mid-rollout — which used to crash :func:`execute_coop` before
+    ``agent{fid}_traj.json`` was written, leaving callers with no structured
+    output to evaluate.
+    """
+    ts = msg.get("timestamp")
+    if ts is None:
+        return 0.0
+    try:
+        return float(ts)
+    except (TypeError, ValueError):
+        return 0.0
+
+
 def execute_coop(
     repo_name: str,
     task_id: int,
@@ -151,9 +172,10 @@ def run_thread(agent_id: str, feature_id: int):
     # Extract conversation (inter-agent messages)
     conversation = _extract_conversation(results, agents)
 
-    # Sort by timestamp and dedupe (keep only sent messages, not received)
+    # Sort by timestamp and dedupe (keep only sent messages, not received).
+    # See ``_message_timestamp_key`` for why coercion is needed.
     sent_msgs = [m for m in conversation if not m.get("received")]
-    sent_msgs.sort(key=lambda x: x.get("timestamp") or 0)
+    sent_msgs.sort(key=_message_timestamp_key)
 
     # Save conversation
     with open(log_dir / "conversation.json", "w") as f:
diff --git a/tests/runner/test_coop.py b/tests/runner/test_coop.py
new file mode 100644
index 0000000..18d38d1
--- /dev/null
+++ b/tests/runner/test_coop.py
@@ -0,0 +1,108 @@
+"""Unit tests for cooperbench.runner.coop module."""
+
+from cooperbench.runner.coop import _extract_conversation, _message_timestamp_key
+
+
+def _msg(role: str, content: str, *, ts=None, **extra) -> dict:
+    """Build a result-message dict in the shape ``_extract_conversation`` reads."""
+    m = {"role": role, "content": content}
+    if ts is not None:
+        m["timestamp"] = ts
+    m.update(extra)
+    return m
+
+
+class TestExtractConversation:
+    """Tests for ``_extract_conversation`` — the function whose output feeds
+    the buggy ``sent_msgs.sort`` site downstream."""
+
+    def test_extracts_sent_messages_from_bash_format(self):
+        """mini_swe_agent style: assistant content contains
+        ``send_message agentX "body"``."""
+        results = {
+            "agent1": {
+                "feature_id": 1,
+                "messages": [
+                    _msg("assistant",
+                         'send_message agent2 "lets coordinate"',
+                         ts=1.0),
+                ],
+            },
+        }
+        conv = _extract_conversation(results, ["agent1"])
+        assert len(conv) == 1
+        assert conv[0]["from"] == "agent1"
+        assert conv[0]["to"] == "agent2"
+        assert conv[0]["message"] == "lets coordinate"
+        assert conv[0]["timestamp"] == 1.0
+        assert not conv[0].get("received")
+
+    def test_extracts_received_messages(self):
+        """``[Message from X]`` user-content delivery is tagged received."""
+        results = {
+            "agent2": {
+                "feature_id": 2,
+                "messages": [
+                    _msg("user",
+                         "[Message from agent1]: hi from agent1",
+                         ts=2.0),
+                ],
+            },
+        }
+        conv = _extract_conversation(results, ["agent2"])
+        assert any(m.get("received") and m["from"] == "agent1" for m in conv)
+
+
+class TestSortDoesNotCrashOnMixedTimestampTypes:
+    """The historical bug:
+    ``sent_msgs.sort(key=lambda x: x.get("timestamp") or 0)`` crashes with
+    ``TypeError: '<' not supported between instances of 'int' and 'str'``
+    when one adapter records floats and another records ISO strings. The
+    crash fires *before* ``agent{fid}_traj.json`` is written, so callers
+    get no structured rollout output.
+
+    This test exercises the post-extraction sort exactly as it appears in
+    ``execute_coop`` and asserts the call succeeds. If the production
+    sort ever re-introduces a non-coercing key, this test fails fast.
+    """
+
+    @staticmethod
+    def _sort_like_production(conversation):
+        """Reproduce the exact filter-then-sort pattern in ``execute_coop``."""
+        sent = [m for m in conversation if not m.get("received")]
+        sent.sort(key=_message_timestamp_key)
+        return sent
+
+    def test_mixed_int_float_string_timestamps_sort_cleanly(self):
+        conversation = [
+            {"from": "agent1", "to": "agent2", "message": "a", "timestamp": 1.5},
+            {"from": "agent2", "to": "agent1", "message": "b",
+             "timestamp": "2026-05-13T22:47:00Z"},
+            {"from": "agent1", "to": "agent2", "message": "c", "timestamp": 3},
+            {"from": "agent2", "to": "agent1", "message": "d", "timestamp": None},
+            {"from": "agent1", "to": "agent2", "message": "e"},  # no ts at all
+        ]
+        sorted_msgs = self._sort_like_production(conversation)
+        # All five sent rows preserved (none received-flagged).
+        assert len(sorted_msgs) == 5
+        # Numeric timestamps sort in order; unparseable strings + None +
+        # missing all coerce to 0.0 and end up at the front (stable order).
+        floats = [
+            float(m["timestamp"])
+            for m in sorted_msgs
+            if isinstance(m.get("timestamp"), (int, float))
+        ]
+        assert floats == sorted(floats)
+
+    def test_received_messages_excluded_before_sort(self):
+        """Received entries must be filtered out before the sort, not after —
+        otherwise sorting received-then-filtering would still bump into the
+        same crash on adversarial inputs."""
+        conversation = [
+            {"from": "agent1", "to": "agent2", "message": "out", "timestamp": 1.0},
+            {"from": "agent2", "to": "agent1", "message": "in",
+             "timestamp": "garbage", "received": True},
+        ]
+        sorted_msgs = self._sort_like_production(conversation)
+        assert len(sorted_msgs) == 1
+        assert sorted_msgs[0]["message"] == "out"