From b75507cfac443f2be06d18a0069847d8ba4e29e4 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 21:17:38 +0000 Subject: [PATCH 01/11] cli: auto-load ./.env so cooperbench run doesn't need 'source .env' mini_swe_agent_v2 already loads dotenv from a global config dir (~/.config/mini-swe-agent/.env), but cooperbench itself never loaded the project-local .env, so OPENAI_API_KEY etc. only made it through when the user manually exported them. Calling dotenv.load_dotenv() at the top of cli.py auto-loads ./.env from cwd before any env-var-dependent imports run, matching how projects with python-dotenv conventionally pick up local config. --- src/cooperbench/cli.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cooperbench/cli.py b/src/cooperbench/cli.py index 900f547..28836ec 100644 --- a/src/cooperbench/cli.py +++ b/src/cooperbench/cli.py @@ -11,9 +11,13 @@ import os import sys +import dotenv + +dotenv.load_dotenv() # load ./.env from cwd before anything reads env vars + os.environ["LITELLM_LOG"] = "ERROR" -import litellm +import litellm # noqa: E402 litellm.suppress_debug_info = True # Suppress "Give Feedback / Get Help" print messages on errors From 555207a1c32a8fb2895769d3c5ed6472a3ff1333 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 21:17:54 +0000 Subject: [PATCH 02/11] mini_swe_agent_v2: harden adapter, split mini.yaml into solo/coop, submit via patch.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adapter: - accept **kwargs so unknown caller-side args don't crash run() - wire up the agent_config CLI flag that was previously listed in the signature but never read; load YAML and deep-merge config: block over the defaults - sanitize content=None on tool-calling assistant turns before returning AgentResult.messages (CooperBench's downstream coop runner does '"send_message" in content' which TypeErrors on None) - drop the dead SEND_MESSAGE_TOOL import 
(only BASH_TOOL is registered; send_message is intercepted from inside the bash command string) - drop _get_patch() and the base_commit capture; the patch now comes straight from result['submission'] (no working-tree extraction fallback — if the agent didn't submit, there is no patch) Config: - delete config/mini.yaml (was a single file with {% if agent_id %} branches handling both solo and coop) and split into config/solo.yaml and config/coop.yaml; adapter picks based on is_coop = len(agents) > 1 - fix a leak in the solo branch where the CRITICAL REQUIREMENTS section still mentioned 'send_message to your colleague' even when the agent has no colleague - replace the bare 'echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT' submit step with the upstream mini-swe-agent SWE-bench three-step flow: curate via 'git diff -- file1 file2 > patch.txt', verify via cat, submit via 'echo COMPLETE... && cat patch.txt'. The submission field then carries the patch verbatim. --- .../agents/mini_swe_agent_v2/adapter.py | 53 +++-- .../config/{mini.yaml => coop.yaml} | 70 ++++--- .../agents/mini_swe_agent_v2/config/solo.yaml | 185 ++++++++++++++++++ 3 files changed, 263 insertions(+), 45 deletions(-) rename src/cooperbench/agents/mini_swe_agent_v2/config/{mini.yaml => coop.yaml} (82%) create mode 100644 src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml diff --git a/src/cooperbench/agents/mini_swe_agent_v2/adapter.py b/src/cooperbench/agents/mini_swe_agent_v2/adapter.py index 1c808f1..f385b6d 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/adapter.py +++ b/src/cooperbench/agents/mini_swe_agent_v2/adapter.py @@ -40,13 +40,29 @@ def run( config: dict | None = None, agent_config: str | None = None, log_dir: str | None = None, + **kwargs, ) -> AgentResult: """Run mini-swe-agent v2 on a task.""" - # Always load default config, then merge with any overrides - config_path = get_config_path("mini") + # Load coop config when multiple agents, otherwise solo config. 
+ is_coop = bool(agents) and len(agents) > 1 + config_name = "coop" if is_coop else "solo" + config_path = get_config_path(config_name) with open(config_path) as f: default_config = yaml.safe_load(f) + # If the caller passed an agent_config YAML path, deep-merge its + # `config:` block into the defaults. This is what CooperBench's + # ``--agent-config`` flag forwards to the adapter. + if agent_config: + try: + with open(agent_config) as f: + overrides = yaml.safe_load(f) or {} + default_config = recursive_merge(default_config, overrides.get("config", overrides)) + except FileNotFoundError: + logger.error(f"agent_config file not found: {agent_config}") + except Exception as e: + logger.error(f"Error loading agent_config {agent_config}: {e}") + # Deep-merge passed config overrides into default config so that partial # overrides (e.g. only agent.compaction_enabled) don't clobber sibling keys. if config is not None: @@ -77,10 +93,6 @@ def run( env = ModalEnvironment(**env_kwargs) - # Capture base commit for patch generation - base_commit_result = env.execute({"command": "git rev-parse HEAD"}) - base_commit = base_commit_result.get("output", "").strip() - # Setup messaging connector if enabled comm = None use_messaging = messaging_enabled and comm_url and agents and len(agents) > 1 @@ -122,6 +134,7 @@ def run( # Run agent error_msg = None + result = {} try: result = agent.run(task=task) status = result.get("exit_status", "Submitted") @@ -129,8 +142,12 @@ def run( status = "Error" error_msg = str(e) - # Extract patch (committed + uncommitted changes) - patch = self._get_patch(env, base_commit) + # The agent submits its patch via ``echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT + # && cat patch.txt`` (see coop.yaml/solo.yaml). Whatever follows the + # sentinel is captured into result["submission"] by the env. No + # working-tree extraction fallback — if the agent didn't submit a + # patch, there is no patch. 
+ patch = result.get("submission", "").strip() # Save full trajectory (includes segments when compaction occurred) if log_dir and agent._compaction_count > 0: @@ -144,20 +161,22 @@ def run( # Cleanup env.cleanup() + # Tool-calling assistant turns leave content=None (the body lives in + # tool_calls). CooperBench's downstream conversation extractor does + # ``"send_message" in content`` which raises TypeError on None — coerce + # to "" before returning. + sanitized_messages = [] + for msg in agent.messages: + if msg.get("content") is None: + msg = {**msg, "content": ""} + sanitized_messages.append(msg) + return AgentResult( status=status, patch=patch, cost=agent.cost, steps=agent.n_calls, - messages=agent.messages, + messages=sanitized_messages, sent_messages=agent.sent_messages, error=error_msg, ) - - def _get_patch(self, env, base_commit: str) -> str: - """Extract git diff from base commit to current working tree state.""" - try: - result = env.execute({"command": f"git diff {base_commit}"}) - return result.get("output", "").strip() - except Exception: - return "" diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/mini.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml similarity index 82% rename from src/cooperbench/agents/mini_swe_agent_v2/config/mini.yaml rename to src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml index 4f6931e..f7bbed5 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/mini.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml @@ -1,12 +1,7 @@ agent: system_template: | - {% if agent_id %} You are a software engineer working alongside a colleague on a shared codebase. You each have your own workspace and are implementing different features in parallel. You communicate naturally — like engineers on the same team — to make sure your combined work integrates cleanly. - {% else %} - You are a helpful assistant that can interact with a computer. 
- {% endif %} instance_template: | - {% if agent_id %} ## Your Task {{task}} @@ -32,8 +27,7 @@ agent: 3. Edit the source code to resolve the issue 4. Verify your fix works by running your script again 5. Test edge cases to ensure your fix is robust - 6. Submit your changes: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`. - Do not combine it with any other command. After this command, you cannot continue working on this task. + 6. Submit your changes — see the **Submission** section below for the exact procedure. But you are not working solo but rather as a team, so follow the workflow above for your individual tasks but you have complete freedom to communicate with your colleague in whatever way, whenever, and however often you see fit. Think about how experienced software engineers coordinate when working on the same codebase — and do that. @@ -86,25 +80,6 @@ agent: Do NOT run: `git merge` (without --abort), `git pull`, `git rebase`, or `git reset --hard` against your colleague's branch or `origin/main`. These will corrupt your patch. {% endif %} - {% else %} - ## Your Task - - {{task}} - - ## Recommended Workflow - - This workflow should be done step-by-step so that you can iterate on your changes and any possible problems. - - 1. Analyze the codebase by finding and reading relevant files - 2. Create a script to reproduce the issue - 3. Edit the source code to resolve the issue - 4. Verify your fix works by running your script again - 5. Test edge cases to ensure your fix is robust - 6. Submit your changes: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`. - Do not combine it with any other command. After this command, you cannot continue working on this task. - - {% endif %} - ## Command Execution Rules You are operating in an environment where @@ -125,8 +100,7 @@ agent: - Your response MUST include AT LEAST ONE bash tool call — this can be a coding command, a `send_message` to your colleague, or both - Directory or environment variable changes are not persistent. 
Every action is executed in a new subshell. - However, you can prefix any action with `MY_ENV_VAR=MY_VALUE cd /path/to/working/dir && ...` or write/load environment variables from files - - Submit your changes and finish your work by issuing the following command: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`. - Do not combine it with any other command. After this command, you cannot continue working on this task. + - To submit your work, follow the **Submission** section below (`echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt`). Example of a CORRECT response: @@ -185,9 +159,48 @@ agent: ```bash anything ``` + + ## Submission + + When your feature is complete, submit your patch as a git diff. Follow these + steps **IN ORDER**, with **SEPARATE** commands: + + **Step 1 — create the patch file.** Run, listing only the source files you modified: + + ```bash + git diff -- path/to/file1 path/to/file2 > patch.txt + ``` + + Do NOT commit your changes. The patch must contain ONLY the source files you + modified to implement your feature. Do NOT include: + + - reproduction or scratch test scripts you wrote + - helper scripts or tools you created + - installation/build/packaging/configuration files + - binaries or compiled files + + **Step 2 — verify the patch.** Inspect `patch.txt` to confirm it contains only + your intended changes and headers show `--- a/` and `+++ b/` paths: + + ```bash + cat patch.txt + ``` + + **Step 3 — submit.** Use this **EXACT** command: + + ```bash + echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt + ``` + + + - Creating/viewing the patch and submitting MUST be separate tool calls (not combined with `&&`). + - If `patch.txt` is missing or `cat` fails, the submission fails (non-zero exit) and you remain in the loop — fix and retry. + - After a successful submission you CANNOT continue working on this task. + step_limit: 100 cost_limit: 3. 
mode: confirm + compaction_token_trigger: 28000 # leave headroom on 32K-context models environment: env: PAGER: cat @@ -196,6 +209,7 @@ environment: PIP_PROGRESS_BAR: 'off' TQDM_DISABLE: '1' model: + cost_tracking: ignore_errors # vLLM-served models have no pricing data observation_template: | {%- if output.output | length < 10000 -%} { diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml new file mode 100644 index 0000000..73b933a --- /dev/null +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml @@ -0,0 +1,185 @@ +agent: + system_template: | + You are a helpful assistant that can interact with a computer. + instance_template: | + ## Your Task + + {{task}} + + ## Recommended Workflow + + This workflow should be done step-by-step so that you can iterate on your changes and any possible problems. + + 1. Analyze the codebase by finding and reading relevant files + 2. Create a script to reproduce the issue + 3. Edit the source code to resolve the issue + 4. Verify your fix works by running your script again + 5. Test edge cases to ensure your fix is robust + 6. Submit your changes — see the **Submission** section below for the exact procedure. + + ## Command Execution Rules + + You are operating in an environment where + + 1. You issue at least one command + 2. The system executes the command(s) in a subshell + 3. You see the result(s) + 4. You write your next command(s) + + Each response should include: + + 1. **Reasoning text** where you explain your analysis and plan + 2. At least one tool call with your command + + **CRITICAL REQUIREMENTS:** + + - Your response SHOULD include reasoning text explaining what you're doing + - Your response MUST include AT LEAST ONE bash tool call + - Directory or environment variable changes are not persistent. Every action is executed in a new subshell. 
+ - However, you can prefix any action with `MY_ENV_VAR=MY_VALUE cd /path/to/working/dir && ...` or write/load environment variables from files + - To submit your work, follow the **Submission** section below (`echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt`). + + Example of a CORRECT response: + + I need to understand the structure of the repository first. Let me check what files are in the current directory to get a better understanding of the codebase. + + [Makes bash tool call with {"command": "ls -la"} as arguments] + + + + {{system}} {{release}} {{version}} {{machine}} + + + ## Useful command examples + + ### Create a new file: + + ```bash + cat <<'EOF' > newfile.py + import numpy as np + hello = "world" + print(hello) + EOF + ``` + + ### Edit files with sed: + + {%- if system == "Darwin" -%} + + You are on MacOS. For all the below examples, you need to use `sed -i ''` instead of `sed -i`. + + {%- endif -%} + + ```bash + # Replace all occurrences + sed -i 's/old_string/new_string/g' filename.py + + # Replace only first occurrence + sed -i 's/old_string/new_string/' filename.py + + # Replace first occurrence on line 1 + sed -i '1s/old_string/new_string/' filename.py + + # Replace all occurrences in lines 1-10 + sed -i '1,10s/old_string/new_string/g' filename.py + ``` + + ### View file content: + + ```bash + # View specific lines with numbers + nl -ba filename.py | sed -n '10,20p' + ``` + + ### Any other command you want to run + + ```bash + anything + ``` + + ## Submission + + When your work is complete, submit your patch as a git diff. Follow these + steps **IN ORDER**, with **SEPARATE** commands: + + **Step 1 — create the patch file.** Run, listing only the source files you modified: + + ```bash + git diff -- path/to/file1 path/to/file2 > patch.txt + ``` + + Do NOT commit your changes. The patch must contain ONLY the source files you + modified. 
Do NOT include: + + - reproduction or scratch test scripts you wrote + - helper scripts or tools you created + - installation/build/packaging/configuration files + - binaries or compiled files + + **Step 2 — verify the patch.** Inspect `patch.txt` to confirm it contains only + your intended changes and headers show `--- a/` and `+++ b/` paths: + + ```bash + cat patch.txt + ``` + + **Step 3 — submit.** Use this **EXACT** command: + + ```bash + echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt + ``` + + + - Creating/viewing the patch and submitting MUST be separate tool calls (not combined with `&&`). + - If `patch.txt` is missing or `cat` fails, the submission fails (non-zero exit) and you remain in the loop — fix and retry. + - After a successful submission you CANNOT continue working on this task. + + step_limit: 100 + cost_limit: 3. + mode: confirm + compaction_token_trigger: 28000 # leave headroom on 32K-context models +environment: + env: + PAGER: cat + MANPAGER: cat + LESS: -R + PIP_PROGRESS_BAR: 'off' + TQDM_DISABLE: '1' +model: + cost_tracking: ignore_errors # vLLM-served models have no pricing data + observation_template: | + {%- if output.output | length < 10000 -%} + { + "returncode": {{ output.returncode }}, + "output": {{ output.output | tojson }} + {%- if output.exception_info %}, "exception_info": {{ output.exception_info | tojson }}{% endif %} + } + {%- else -%} + { + "returncode": {{ output.returncode }}, + "output_head": {{ output.output[:5000] | tojson }}, + "output_tail": {{ output.output[-5000:] | tojson }}, + "elided_chars": {{ output.output | length - 10000 }}, + "warning": "Output too long." + {%- if output.exception_info %}, "exception_info": {{ output.exception_info | tojson }}{% endif %} + } + {%- endif -%} + format_error_template: | + Tool call error: + + + {{error}} + + + Here is general guidance on how to submit correct toolcalls: + + Every response needs to use the 'bash' tool at least once to execute commands. 
+ + Call the bash tool with your command as the argument: + - Tool: bash + - Arguments: {"command": "your_command_here"} + + If you want to end the task, please issue the following command: `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT` + without any other command. + model_kwargs: + drop_params: true From a6ae7f1aa65ff5162be6cddfedcb98161e24652b Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 21:18:09 +0000 Subject: [PATCH 03/11] mini_swe_agent_v2: shared singleton git server, path-prefixed per-run repos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Old design spun up a fresh debian-slim container per coop pair, ran 'apt-get install git' inside it, slept 3 seconds, and returned. Two problems: - the 3-second sleep was far too short for the apt install (~30-60s on a cold image), so agents' initial 'git push' raced the daemon start and got 'Connection refused' - the per-run container lived on its own bridge network cooperbench-git-<run_id>, but DockerEnvironmentConfig had no 'network' field, so the kwarg the adapter passed got silently dropped by Pydantic and agent containers ended up on the default bridge — no route to the git server's IP Replaced with a Redis-style shared singleton: - one image cooperbench-git-server:local (built lazily on first use from a 4-line Dockerfile) - one container cooperbench-git running 'git daemon --base-path=/git --listen=0.0.0.0' as PID 1, with a docker volume for /git - one shared bridge network 'cooperbench' that all agent containers join - per-run isolation via path namespacing: each coop pair gets /git/<run_id>/repo.git, served at git://cooperbench-git:9418/<run_id>/repo.git DockerGitServer.create() now just ensures the singleton infra is up (idempotent, ~140ms after first call) and exec's a quick 'mkdir + git init --bare' inside the running daemon. cleanup() removes only the per-run path and leaves the singleton alive.
DockerEnvironmentConfig also gets a typed 'network' field so the --network flag actually reaches docker run. --- .../connectors/git_servers/docker.py | 282 +++++++++--------- .../mini_swe_agent_v2/environments/docker.py | 8 + 2 files changed, 154 insertions(+), 136 deletions(-) diff --git a/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/docker.py b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/docker.py index 8e32b2e..9d5c8ec 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/docker.py +++ b/src/cooperbench/agents/mini_swe_agent_v2/connectors/git_servers/docker.py @@ -1,148 +1,173 @@ -"""Docker-based git server for code collaboration.""" +"""Docker-based git server for code collaboration. + +Architecture (mirrors how Redis is used elsewhere in CooperBench): a single +long-lived ``cooperbench-git`` container runs ``git daemon`` and serves +multiple isolated per-run repositories under ``/git/<run_id>/repo.git``. All +agent containers join the same ``cooperbench`` bridge network so they can +resolve the server by container name. + +The infra (image, network, container) is auto-created on first use and reused +thereafter. No CLI setup step. +""" from __future__ import annotations +import io import logging import time import docker +# Singleton infra (one set per host, reused across all coop runs) +_IMAGE_TAG = "cooperbench-git-server:local" +_CONTAINER_NAME = "cooperbench-git" +_NETWORK_NAME = "cooperbench" +_VOLUME_NAME = "cooperbench-git-data" +_PORT = 9418 + +_DOCKERFILE = b"""FROM debian:bookworm-slim +RUN apt-get update -qq \\ + && apt-get install -y -qq git \\ + && rm -rf /var/lib/apt/lists/* +RUN mkdir /git +ENTRYPOINT ["git", "daemon", \\ + "--reuseaddr", \\ + "--export-all", \\ + "--enable=receive-pack", \\ + "--base-path=/git", \\ + "--listen=0.0.0.0", \\ + "/git"] +""" -class DockerGitServer: - """Shared git server container for code collaboration using Docker.
- Creates a Docker container running git-daemon that agents can push/pull to. +def _wait_for_port(container, port: int, timeout: int = 30) -> None: + """Block until the daemon binds the given port inside the container. + + Uses bash's built-in ``/dev/tcp`` (no extra packages required) so it works + against the stripped-down debian-slim base. """ + deadline = time.time() + timeout + while time.time() < deadline: + probe = container.exec_run(["bash", "-c", f"exec 3<>/dev/tcp/127.0.0.1/{port}"]) + if probe.exit_code == 0: + return + time.sleep(0.5) + raise RuntimeError(f"git daemon did not bind :{port} within {timeout}s") + + +def _ensure_shared_infra(client: docker.DockerClient, logger: logging.Logger): + """Idempotently bring up image, network, and shared git server container.""" + # Image + try: + client.images.get(_IMAGE_TAG) + except docker.errors.ImageNotFound: + logger.info(f"Building {_IMAGE_TAG} (one-time, ~30s)...") + client.images.build(fileobj=io.BytesIO(_DOCKERFILE), tag=_IMAGE_TAG, rm=True) + logger.info(f"Built {_IMAGE_TAG}") + + # Network + try: + client.networks.get(_NETWORK_NAME) + except docker.errors.NotFound: + logger.debug(f"Creating shared network {_NETWORK_NAME}") + try: + client.networks.create(_NETWORK_NAME, driver="bridge") + except docker.errors.APIError: + # race: another concurrent run created it; re-fetch + client.networks.get(_NETWORK_NAME) + + # Volume (so /git survives container restarts; also lets us inspect history) + try: + client.volumes.get(_VOLUME_NAME) + except docker.errors.NotFound: + client.volumes.create(name=_VOLUME_NAME) + + # Container + try: + container = client.containers.get(_CONTAINER_NAME) + except docker.errors.NotFound: + logger.info(f"Starting shared git server container {_CONTAINER_NAME}") + try: + container = client.containers.run( + _IMAGE_TAG, + name=_CONTAINER_NAME, + detach=True, + network=_NETWORK_NAME, + volumes={_VOLUME_NAME: {"bind": "/git", "mode": "rw"}}, + restart_policy={"Name": "unless-stopped"}, + 
) + except docker.errors.APIError: + # race: another concurrent run created it; re-fetch + container = client.containers.get(_CONTAINER_NAME) + + container.reload() + if container.status != "running": + container.start() + container.reload() - def __init__(self, container, hostname: str, port: int, network_name: str): - """Initialize with an existing container. + _wait_for_port(container, _PORT, timeout=30) + return container - Use DockerGitServer.create() to create a new server. - """ - self._container = container + +class DockerGitServer: + """Per-run handle on the shared git server. + + The shared container is a singleton; what's per-run is just a directory + under ``/git/<run_id>/repo.git`` that this class creates and tears down. + """ + + def __init__(self, *, run_id: str, hostname: str, port: int, network_name: str): + self._run_id = run_id self._hostname = hostname self._port = port self._network_name = network_name self._logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_server.docker") @classmethod - def create( - cls, - run_id: str, - timeout: int = 3600, - ) -> DockerGitServer: - """Create and start a git server container. + def create(cls, run_id: str, timeout: int = 3600) -> DockerGitServer: + """Ensure shared infra is up, then init a per-run bare repo on it.
Args: - run_id: Unique run identifier (for container naming) - timeout: Container timeout in seconds (not enforced, for compatibility) + run_id: Unique run identifier — becomes the path prefix under /git + timeout: Unused; kept for protocol compatibility with other backends Returns: - DockerGitServer instance ready to accept connections + DockerGitServer pointing at git://<hostname>:9418/<run_id>/repo.git """ + del timeout logger = logging.getLogger("cooperbench.agents.mini_swe_agent_v2.git_server.docker") - logger.debug(f"Creating docker git server for run {run_id}") - client = docker.from_env() - # Use a simple Debian-based image with git - image = "debian:bookworm-slim" - - # Pull image if not present - try: - client.images.get(image) - except docker.errors.ImageNotFound: - logger.debug(f"Pulling image {image}") - client.images.pull(image) - - # Create or get shared network for git server and agents - network_name = f"cooperbench-git-{run_id}" - try: - client.networks.get(network_name) - except docker.errors.NotFound: - client.networks.create(network_name, driver="bridge") - - # Container name based on run_id - container_name = f"cooperbench-git-{run_id}" - - # Remove existing container if it exists - try: - old_container = client.containers.get(container_name) - old_container.remove(force=True) - except docker.errors.NotFound: - pass - - # Create and start container with initialization script - # The script initializes the repo, then starts git daemon in foreground to keep container alive - init_script = """#!/bin/bash -set -e -apt-get update -qq -apt-get install -y -qq git > /dev/null 2>&1 -mkdir -p /git/repo.git -cd /git/repo.git -git init --bare -git config receive.denyCurrentBranch ignore -touch git-daemon-export-ok -exec git daemon --reuseaddr --export-all --enable=receive-pack --base-path=/git --listen=0.0.0.0 /git -""" + container = _ensure_shared_infra(client, logger) - container = client.containers.run( - image=image, - command=["bash", "-c", init_script], -
name=container_name, - detach=True, - network=network_name, - ports={"9418/tcp": None}, # Auto-assign port for host access - remove=False, + # Per-run repo init — fast (~50ms) inside the already-running container + init_cmd = ( + f"set -e && " + f"mkdir -p /git/{run_id}/repo.git && " + f"cd /git/{run_id}/repo.git && " + f"git init --bare && " + f"git config receive.denyCurrentBranch ignore && " + f"touch git-daemon-export-ok" + ) + result = container.exec_run(["bash", "-c", init_cmd]) + if result.exit_code != 0: + raise RuntimeError( + f"Failed to init repo /git/{run_id}/repo.git: {result.output.decode('utf-8', errors='replace')}" + ) + + logger.debug(f"Per-run repo ready at git://{_CONTAINER_NAME}:{_PORT}/{run_id}/repo.git") + + return cls( + run_id=run_id, + hostname=_CONTAINER_NAME, + port=_PORT, + network_name=_NETWORK_NAME, ) - - # Wait for container to start and git daemon to initialize - time.sleep(3) - - # Verify container is running - container.reload() - if container.status != "running": - logs = container.logs().decode("utf-8", errors="replace") - container.remove(force=True) - raise RuntimeError(f"Git server container failed to start. 
Logs: {logs}") - - # Reload container to get port mapping - container.reload() - - # Get the host port - port_bindings = container.attrs.get("NetworkSettings", {}).get("Ports", {}) - if "9418/tcp" not in port_bindings or not port_bindings["9418/tcp"]: - container.stop() - container.remove(force=True) - raise RuntimeError("Failed to get port mapping for git daemon") - - # Get container's IP on the network for inter-container communication - container.reload() - network_settings = container.attrs.get("NetworkSettings", {}) - networks = network_settings.get("Networks", {}) - if network_name in networks: - container_ip = networks[network_name].get("IPAddress") - if container_ip: - hostname = container_ip - else: - # Fallback to container name (DNS resolution on same network) - hostname = container_name - else: - # Fallback to container name - hostname = container_name - - logger.debug(f"Git server ready at git://{hostname}:9418 (network: {network_name})") - - return cls(container=container, hostname=hostname, port=9418, network_name=network_name) @property def url(self) -> str: - """Git URL for agents to use as remote. 
- - Returns: - Git URL for the repository (git://hostname:port/repo.git) - """ - return f"git://{self._hostname}:{self._port}/repo.git" + """Git URL for agents to use as remote.""" + return f"git://{self._hostname}:{self._port}/{self._run_id}/repo.git" @property def network_name(self) -> str: @@ -150,26 +175,11 @@ def network_name(self) -> str: return self._network_name def cleanup(self) -> None: - """Stop and remove the git server container and network.""" - if self._container: - try: - self._container.stop(timeout=5) - except Exception: - pass - try: - self._container.remove(force=True) - except Exception: - pass - self._container = None - - # Clean up network - if hasattr(self, "_network_name") and self._network_name: - try: - client = docker.from_env() - try: - network = client.networks.get(self._network_name) - network.remove() - except docker.errors.NotFound: - pass - except Exception: - pass + """Remove this run's repo dir. Leave the shared container/network/image alone.""" + try: + client = docker.from_env() + container = client.containers.get(_CONTAINER_NAME) + container.exec_run(["rm", "-rf", f"/git/{self._run_id}"]) + except Exception: + # Best-effort: we don't want cleanup failure to mask the real run result. + pass diff --git a/src/cooperbench/agents/mini_swe_agent_v2/environments/docker.py b/src/cooperbench/agents/mini_swe_agent_v2/environments/docker.py index 148c632..293ddff 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/environments/docker.py +++ b/src/cooperbench/agents/mini_swe_agent_v2/environments/docker.py @@ -31,6 +31,10 @@ class DockerEnvironmentConfig(BaseModel): """Additional arguments to pass to the docker/container executable. Default is ["--rm"], which removes the container after it exits. """ + network: str | None = None + """Docker network to attach the container to (passed as --network). 
+ Required for coop+git so agent containers can reach the git server's + bridge network.""" container_timeout: str = "2h" """Max duration to keep container running. Uses the same format as the sleep command.""" pull_timeout: int = 120 @@ -84,6 +88,10 @@ def _start_container(self): self.config.cwd, "--entrypoint", "/bin/bash", + ] + if self.config.network: + cmd += ["--network", self.config.network] + cmd += [ *self.config.run_args, self.config.image, "-c", From 788f03e047554fcd22aff62f14229c6bcbf7058d Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 21:18:19 +0000 Subject: [PATCH 04/11] mini_swe_agent_v2: serialize() no longer mutates _segments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DefaultAgent.run() calls self.save(self.config.output_path) in its finally clause every step, and save() calls serialize(). Once compaction has fired, serialize() was unconditionally calling _close_current_segment('solver') — which appends a snapshot of the current live messages as a new segment AND resets the buffer (which the next query() then repopulates). Net effect: each step after the first compaction added another post-compaction solver segment to _segments, each one near-superset of the previous. In a real run we observed segment counts like [86, 85, 8, 10, 12, 14, 15] where the last 5 should have been a single segment. Fix: serialize() builds a snapshot list locally without mutating self._segments. The current open buffer is appended as a transient 'solver' segment in the snapshot. Multiple calls to serialize() now produce identical output and leave state unchanged. 
--- src/cooperbench/agents/mini_swe_agent_v2/agents/default.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cooperbench/agents/mini_swe_agent_v2/agents/default.py b/src/cooperbench/agents/mini_swe_agent_v2/agents/default.py index 9151d16..102fbac 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/agents/default.py +++ b/src/cooperbench/agents/mini_swe_agent_v2/agents/default.py @@ -320,8 +320,11 @@ def serialize(self, *extra_dicts) -> dict: "trajectory_format": "mini-swe-agent-1.1", } if self._compaction_count > 0: - self._close_current_segment("solver") - agent_data["segments"] = self._segments + segments = list(self._segments) + current = self._current_segment_messages or self.messages + if current: + segments.append({"kind": "solver", "messages": list(current)}) + agent_data["segments"] = segments return recursive_merge(agent_data, self.model.serialize(), self.env.serialize(), *extra_dicts) def save(self, path: Path | None, *extra_dicts) -> dict: From ec5483cbcf012085fd41ad4b7cf5119cac2820b1 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 21:20:53 +0000 Subject: [PATCH 05/11] v0.0.12: bump version, add CHANGELOG entry --- CHANGELOG.md | 20 ++++++++++++++++++++ src/cooperbench/__about__.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2875ff3..5c65ca3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,26 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.0.12] - 2026-04-30 + +### Changed + +- **`mini_swe_agent_v2` patch is now the agent's `submission`** — the adapter no longer captures `base_commit` and runs `git diff ` at end-of-run. 
Instead the patch comes directly from `result['submission']`, which the env populates with everything the agent emits after `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`. Mirrors upstream mini-swe-agent's SWE-bench config. The `coop.yaml` / `solo.yaml` prompts now instruct the agent to curate via `git diff -- file1 file2 > patch.txt`, verify with `cat patch.txt`, and submit with `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt`. No working-tree-extraction fallback — if the agent didn't submit, there is no patch. +- **`config/mini.yaml` split into `config/solo.yaml` + `config/coop.yaml`** — the previous single file conditioned everything on `{% if agent_id %}` blocks. The adapter now selects which file to load via `is_coop = len(agents) > 1`. While splitting, fixed a leak in the solo branch where the `CRITICAL REQUIREMENTS` section still mentioned `send_message to your colleague` for an agent with no colleague. +- **Shared singleton git server for `--git` coop runs** — replaces the previous design that spun up a fresh `debian:bookworm-slim` container per run, ran `apt-get install git`, slept 3s, and returned (resulting in race conditions where agents' initial `git push` beat the daemon to startup). The new design auto-creates one image (`cooperbench-git-server:local`), one network (`cooperbench`), and one container (`cooperbench-git`) on first use; per-run isolation comes from path namespacing under `/git/<run_id>/repo.git`. Idempotent — first run pays a ~30s image-build cost, subsequent runs reuse the singleton in ~140ms. Mirrors the Redis-style "one daemon, many namespaces" pattern. + +### Fixed + +- **`mini_swe_agent_v2` adapter no longer crashes on `content=None`** — tool-calling assistant turns leave `content=None` (the body lives in `tool_calls`), and CooperBench's downstream `_extract_conversation` does `"send_message" in content`, which raises `TypeError` on None. The adapter now coerces to `""` before populating `AgentResult.messages`.
+- **`mini_swe_agent_v2` adapter wires up the `agent_config` flag** — previously listed in `MiniSweAgentV2Runner.run`'s signature but never read from. Now loads the YAML and deep-merges its `config:` block over the defaults. Forward-compatible: `**kwargs` accepted so unknown caller-side args don't crash `run()`. +- **`mini_swe_agent_v2` adapter drops the dead `SEND_MESSAGE_TOOL` import** — only `BASH_TOOL` is registered with the model; `send_message` is intercepted from inside the bash command string. The leftover import was confusing. +- **`DockerEnvironmentConfig.network`** — added a typed `network` field so the `--network <name>` flag reaches `docker run`. Previously the adapter passed `network=...` as a kwarg, but Pydantic silently dropped it (no such field), and agent containers ended up on the default bridge with no route to the per-run git server's IP. With the new shared-singleton git server design, agent containers must join the shared `cooperbench` network for DNS-by-name resolution to work. +- **`DefaultAgent.serialize()` no longer mutates `_segments`** — `run()` calls `save()` in its finally clause every step, and once compaction had fired, each `serialize()` call appended another snapshot of the current live messages as a fresh solver segment (and reset the buffer, which the next `query()` repopulated). Net effect: one compaction produced N+1 overlapping post-compaction solver segments instead of 1. Fix: `serialize()` builds the snapshot list locally without touching `self._segments`. + +### Added + +- **`cooperbench` CLI auto-loads `./.env`** — `cli.py` now calls `dotenv.load_dotenv()` at module load, so project-local `OPENAI_API_KEY` etc. is picked up without users having to `set -a && source .env` ahead of every invocation. Matches the convention used elsewhere in the codebase.
+ ## [0.0.11] - 2026-04-18 ### Added diff --git a/src/cooperbench/__about__.py b/src/cooperbench/__about__.py index 9ce1216..934c5be 100644 --- a/src/cooperbench/__about__.py +++ b/src/cooperbench/__about__.py @@ -1,3 +1,3 @@ """Version information for CooperBench.""" -__version__ = "0.0.11" +__version__ = "0.0.12" From b92995ef93753a505dad160e174749c695729d9a Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 22:29:20 +0000 Subject: [PATCH 06/11] eval: surface patch-apply failures, enrich per-feature schema When an agent submits a malformed patch (e.g. a 'new file mode' diff against an existing file), 'git apply' rejects it but the eval would silently commit an empty branch, then report the subsequent merge as 'clean' because there was nothing to disagree with. An agent self-sabotage (the canonical case is an agent running 'rm -rf .git' mid-run) would look like a passing eval. _setup_branches now emits explicit per-agent markers (PATCH_APPLIED / _SKIPPED / _FAILED) and returns an apply_status dict. test_merged threads it into the eval result and overrides merge.status to 'missing_input' when any patch failed. While in there, _run_tests now exposes exit_code in its result, and the per-feature dict gains feature_id / exit_code / tests_passed / tests_failed alongside the existing passed + test_output (which is still a 50KB blob). Consumers can now reason about results without grepping raw pytest output. 
--- src/cooperbench/eval/evaluate.py | 1 + src/cooperbench/eval/sandbox.py | 89 ++++++++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 11 deletions(-) diff --git a/src/cooperbench/eval/evaluate.py b/src/cooperbench/eval/evaluate.py index e466ac2..56804d9 100644 --- a/src/cooperbench/eval/evaluate.py +++ b/src/cooperbench/eval/evaluate.py @@ -358,6 +358,7 @@ def _evaluate_single( "task_id": task_id, "features": features, "setting": "coop", + "apply_status": result.get("apply_status"), "merge": result.get("merge", {}), "feature1": result.get("feature1", {}), "feature2": result.get("feature2", {}), diff --git a/src/cooperbench/eval/sandbox.py b/src/cooperbench/eval/sandbox.py index 2278c75..25fed2f 100644 --- a/src/cooperbench/eval/sandbox.py +++ b/src/cooperbench/eval/sandbox.py @@ -162,10 +162,22 @@ def test_merged( if not base_sha: return _merged_error_result("Failed to get base commit SHA") + apply_status = setup_result.get("apply_status", {"agent1": "unknown", "agent2": "unknown"}) + any_apply_failed = "failed" in apply_status.values() + # Step 2: Try naive merge naive_result = _merge_naive(sb, base_sha) - merge_status = "clean" if not naive_result["conflict"] else "conflicts" + # If any agent's patch failed to apply, the resulting "merge" is just + # a merge of base + the surviving agent's work into the other branch + # (which is also at base). Don't pretend that's a clean merge of the + # agents' joint output — surface the apply failure as the merge status. 
+ if any_apply_failed: + merge_status = "missing_input" + elif naive_result["conflict"]: + merge_status = "conflicts" + else: + merge_status = "clean" strategy_used = "naive" merged_diff = naive_result["diff"] @@ -200,17 +212,26 @@ def test_merged( test2_result = _run_tests(sb, "tests2.patch", "merged.patch", base_sha) return { + "apply_status": apply_status, "merge": { "status": merge_status, "strategy": strategy_used, "diff": merged_diff[:5000] if merged_diff else "", # Truncate for storage }, "feature1": { + "feature_id": feature1_id, "passed": test1_result["passed"], + "exit_code": test1_result.get("exit_code"), + "tests_passed": test1_result.get("tests_passed", 0), + "tests_failed": test1_result.get("tests_failed", 0), "test_output": test1_result["output"], }, "feature2": { + "feature_id": feature2_id, "passed": test2_result["passed"], + "exit_code": test2_result.get("exit_code"), + "tests_passed": test2_result.get("tests_passed", 0), + "tests_failed": test2_result.get("tests_failed", 0), "test_output": test2_result["output"], }, "both_passed": test1_result["passed"] and test2_result["passed"], @@ -357,7 +378,14 @@ def _write_patch(sb: Sandbox, filename: str, content: str) -> None: def _setup_branches(sb: Sandbox) -> dict: - """Set up git branches for merge testing.""" + """Set up git branches for merge testing. + + Returns ``apply_status`` per agent: ``"applied"`` / ``"skipped"`` (empty + patch) / ``"failed"`` (git apply rejected the patch). Callers must check + this — a "clean" merge between two branches where one branch's patch + silently failed to apply is not actually a clean merge of the agents' + work, just a clean merge of nothing into the other. 
+ """ commands = """ cd /workspace/repo git config user.email "eval@cooperbench.local" @@ -367,20 +395,31 @@ def _setup_branches(sb: Sandbox) -> dict: BASE_SHA=$(git rev-parse HEAD) echo "BASE_SHA=$BASE_SHA" +apply_patch() { + local n=$1 + if [ -s /patches/patch${n}.patch ]; then + if git apply /patches/patch${n}.patch 2>&1; then + echo "PATCH${n}_APPLIED" + elif git apply --3way /patches/patch${n}.patch 2>&1; then + echo "PATCH${n}_APPLIED" + else + echo "PATCH${n}_FAILED" + fi + else + echo "PATCH${n}_SKIPPED" + fi +} + # Create agent1 branch and apply patch1 git checkout -b agent1 2>&1 -if [ -s /patches/patch1.patch ]; then - git apply /patches/patch1.patch 2>&1 || git apply --3way /patches/patch1.patch 2>&1 || echo "PATCH1_FAILED" -fi +apply_patch 1 git add -A git commit -m "Agent 1 changes" --allow-empty 2>&1 # Create agent2 branch from base and apply patch2 git checkout $BASE_SHA 2>&1 git checkout -b agent2 2>&1 -if [ -s /patches/patch2.patch ]; then - git apply /patches/patch2.patch 2>&1 || git apply --3way /patches/patch2.patch 2>&1 || echo "PATCH2_FAILED" -fi +apply_patch 2 git add -A git commit -m "Agent 2 changes" --allow-empty 2>&1 @@ -399,7 +438,19 @@ def _setup_branches(sb: Sandbox) -> dict: base_sha = line.split("=")[1].strip() break - return {"output": output, "error": None, "base_sha": base_sha} + def _status(n: int) -> str: + if f"PATCH{n}_APPLIED" in output: + return "applied" + if f"PATCH{n}_SKIPPED" in output: + return "skipped" + return "failed" + + return { + "output": output, + "error": None, + "base_sha": base_sha, + "apply_status": {"agent1": _status(1), "agent2": _status(2)}, + } def _merge_naive(sb: Sandbox, base_sha: str) -> dict: @@ -499,6 +550,7 @@ def _run_tests(sb: Sandbox, tests_patch: str, feature_patch: str, base_sha: str) return { "passed": exit_code == 0 and parsed["passed"] > 0, "output": output, + "exit_code": exit_code, "tests_passed": parsed["passed"], "tests_failed": parsed["failed"], } @@ -631,9 +683,24 @@ def 
_error_result(error: str) -> dict: def _merged_error_result(error: str) -> dict: return { + "apply_status": {"agent1": "unknown", "agent2": "unknown"}, "merge": {"status": "error", "strategy": None, "diff": ""}, - "feature1": {"passed": False, "test_output": ""}, - "feature2": {"passed": False, "test_output": ""}, + "feature1": { + "feature_id": None, + "passed": False, + "exit_code": None, + "tests_passed": 0, + "tests_failed": 0, + "test_output": "", + }, + "feature2": { + "feature_id": None, + "passed": False, + "exit_code": None, + "tests_passed": 0, + "tests_failed": 0, + "test_output": "", + }, "both_passed": False, "error": error, } From b0cb03b5dcee07ae18254d4a572e571a97d46ea8 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 22:29:32 +0000 Subject: [PATCH 07/11] mini_swe_agent_v2: simplify Submission prompt + warn against .git destruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous Submission section was ~40 lines of three-step procedure plus a CRITICAL block. Small models (Qwen 9B, observed in coop+git runs) tended to follow the recipe but still hit footguns we hadn't forbidden — the canonical failure was an agent running 'rm -rf .git' mid-task, then 'git init' to 'fix' it, then producing a malformed 'new file mode' diff that the eval silently dropped. Trim the recipe to the bare flow: git diff -- path/to/file > patch.txt cat patch.txt echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt Then a tight CRITICAL block forbidding 'rm -rf .git', 'git init', 'git rm -rf .', and 'git reset --hard' inside /workspace/repo — which corrupt the local .git/ directory regardless of whether the team remote is enabled. In a follow-up empirical run, agent6 (the agent that previously did rm -rf .git) issued zero destructive git commands and produced a clean modify-existing patch that passed all 100 of feature 6's tests. 
CHANGELOG: extend the v0.0.12 entry to cover both this and the preceding eval-observability commit. --- CHANGELOG.md | 4 +++ .../agents/mini_swe_agent_v2/config/coop.yaml | 33 ++++--------------- .../agents/mini_swe_agent_v2/config/solo.yaml | 33 ++++--------------- 3 files changed, 18 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c65ca3..cdb13d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,9 +12,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **`mini_swe_agent_v2` patch is now the agent's `submission`** — the adapter no longer captures `base_commit` and runs `git diff ` at end-of-run. Instead the patch comes directly from `result['submission']`, which the env populates with everything the agent emits after `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT`. Mirrors upstream mini-swe-agent's SWE-bench config. The `coop.yaml` / `solo.yaml` prompts now instruct the agent to curate via `git diff -- file1 file2 > patch.txt`, verify with `cat patch.txt`, and submit with `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt`. No working-tree-extraction fallback — if the agent didn't submit, there is no patch. - **`config/mini.yaml` split into `config/solo.yaml` + `config/coop.yaml`** — the previous single file conditioned everything on `{% if agent_id %}` blocks. The adapter now selects which file to load via `is_coop = len(agents) > 1`. While splitting, fixed a leak in the solo branch where the `CRITICAL REQUIREMENTS` section still mentioned `send_message to your colleague` for an agent with no colleague. - **Shared singleton git server for `--git` coop runs** — replaces the previous design that spun up a fresh `debian:bookworm-slim` container per run, ran `apt-get install git`, slept 3s, and returned (resulting in race conditions where agents' initial `git push` beat the daemon to startup). 
The new design auto-creates one image (`cooperbench-git-server:local`), one network (`cooperbench`), and one container (`cooperbench-git`) on first use; per-run isolation comes from path namespacing under `/git/<run_id>/repo.git`. Idempotent — first run pays a ~30s image-build cost, subsequent runs reuse the singleton in ~140ms. Mirrors the Redis-style "one daemon, many namespaces" pattern. +- **Submission prompts simplified + `.git` footgun warnings** — the `## Submission` section in `coop.yaml` / `solo.yaml` is now ~5 lines (write a `git diff` to `patch.txt`, `cat` it, submit with `echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt`). Adds an explicit `<CRITICAL>` block forbidding `rm -rf .git`, `git init`, `git rm -rf .`, and `git reset --hard` inside `/workspace/repo` — these are easy footguns for small models, observed in the wild causing patches to come out as malformed `new file mode` diffs that fail to apply. ### Fixed +- **Eval surfaces `git apply` failures instead of silently masking them.** `_setup_branches` now emits explicit `PATCH<n>_APPLIED` / `_SKIPPED` / `_FAILED` markers per agent and returns an `apply_status` dict. `test_merged` writes that to the result and refuses to call merge `clean` when any input patch failed to apply — instead reporting `merge.status: "missing_input"`. Previously, an agent submitting a malformed patch (e.g. a `new file mode` diff against an existing file) would have its branch silently end up empty, and the subsequent merge against the other agent's branch would report `clean` despite the missing input — making the eval lie about partial success. +- **Per-feature eval result schema enriched.** `feature1` / `feature2` now carry `feature_id`, `exit_code`, `tests_passed`, and `tests_failed` (was just `passed: bool` + a 50KB `test_output` blob). Lets consumers reason about results without grepping raw pytest output.
+ - **`mini_swe_agent_v2` adapter no longer crashes on `content=None`** — tool-calling assistant turns leave `content=None` (the body lives in `tool_calls`), and CooperBench's downstream `_extract_conversation` does `"send_message" in content`, which raises `TypeError` on None. The adapter now coerces to `""` before populating `AgentResult.messages`. - **`mini_swe_agent_v2` adapter wires up the `agent_config` flag** — previously listed in `MiniSweAgentV2Runner.run`'s signature but never read from. Now loads the YAML and deep-merges its `config:` block over the defaults. Forward-compatible: `**kwargs` accepted so unknown caller-side args don't crash `run()`. - **`mini_swe_agent_v2` adapter drops the dead `SEND_MESSAGE_TOOL` import** — only `BASH_TOOL` is registered with the model; `send_message` is intercepted from inside the bash command string. The leftover import was confusing. diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml index f7bbed5..c7fcaa3 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml @@ -162,40 +162,21 @@ agent: ## Submission - When your feature is complete, submit your patch as a git diff. Follow these - steps **IN ORDER**, with **SEPARATE** commands: - - **Step 1 — create the patch file.** Run, listing only the source files you modified: + Edit files in place. Don't commit. When done: ```bash git diff -- path/to/file1 path/to/file2 > patch.txt - ``` - - Do NOT commit your changes. The patch must contain ONLY the source files you - modified to implement your feature. 
Do NOT include: - - - reproduction or scratch test scripts you wrote - - helper scripts or tools you created - - installation/build/packaging/configuration files - - binaries or compiled files - - **Step 2 — verify the patch.** Inspect `patch.txt` to confirm it contains only - your intended changes and headers show `--- a/` and `+++ b/` paths: - - ```bash cat patch.txt - ``` - - **Step 3 — submit.** Use this **EXACT** command: - - ```bash echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt ``` + The patch must contain only source files you modified (no reproduction + scripts, helper tools, build/config files, or binaries). + - - Creating/viewing the patch and submitting MUST be separate tool calls (not combined with `&&`). - - If `patch.txt` is missing or `cat` fails, the submission fails (non-zero exit) and you remain in the loop — fix and retry. - - After a successful submission you CANNOT continue working on this task. + Do NOT run `rm -rf .git`, `git init`, `git rm -rf .`, or `git reset --hard` + inside `/workspace/repo` — these corrupt `.git/` and your patch will be + unapplyable. step_limit: 100 cost_limit: 3. diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml index 73b933a..a7f61af 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml @@ -99,40 +99,21 @@ agent: ## Submission - When your work is complete, submit your patch as a git diff. Follow these - steps **IN ORDER**, with **SEPARATE** commands: - - **Step 1 — create the patch file.** Run, listing only the source files you modified: + Edit files in place. Don't commit. When done: ```bash git diff -- path/to/file1 path/to/file2 > patch.txt - ``` - - Do NOT commit your changes. The patch must contain ONLY the source files you - modified. 
Do NOT include: - - - reproduction or scratch test scripts you wrote - - helper scripts or tools you created - - installation/build/packaging/configuration files - - binaries or compiled files - - **Step 2 — verify the patch.** Inspect `patch.txt` to confirm it contains only - your intended changes and headers show `--- a/` and `+++ b/` paths: - - ```bash cat patch.txt - ``` - - **Step 3 — submit.** Use this **EXACT** command: - - ```bash echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt ``` + The patch must contain only source files you modified (no reproduction + scripts, helper tools, build/config files, or binaries). + - - Creating/viewing the patch and submitting MUST be separate tool calls (not combined with `&&`). - - If `patch.txt` is missing or `cat` fails, the submission fails (non-zero exit) and you remain in the loop — fix and retry. - - After a successful submission you CANNOT continue working on this task. + Do NOT run `rm -rf .git`, `git init`, `git rm -rf .`, or `git reset --hard` + inside `/workspace/repo` — these corrupt `.git/` and your patch will be + unapplyable. step_limit: 100 cost_limit: 3. From 6a04599dccd261b629cf4af19659342f2e118c30 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 22:33:32 +0000 Subject: [PATCH 08/11] mini_swe_agent_v2: restore bulleted patch-exclusion list in Submission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Earlier simplification collapsed the four file-category exclusions (reproduction scripts, helper tools, build/config files, binaries) into a single inline parenthetical, which small models tend to under-weight. Restore the bulleted list — bullets are easier to parse and harder to skim past — without re-adding the Step 1/2/3 scaffolding or the env-var notation that the simplification was meant to remove. 
--- src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml | 8 ++++++-- src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml index c7fcaa3..666503b 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml @@ -170,8 +170,12 @@ agent: echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt ``` - The patch must contain only source files you modified (no reproduction - scripts, helper tools, build/config files, or binaries). + The patch must contain only source files you modified. Exclude: + + - reproduction or scratch test scripts you wrote + - helper scripts or tools you created + - installation, build, packaging, or configuration files + - binaries or compiled files Do NOT run `rm -rf .git`, `git init`, `git rm -rf .`, or `git reset --hard` diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml index a7f61af..594937c 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml @@ -107,8 +107,12 @@ agent: echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt ``` - The patch must contain only source files you modified (no reproduction - scripts, helper tools, build/config files, or binaries). + The patch must contain only source files you modified. 
Exclude: + + - reproduction or scratch test scripts you wrote + - helper scripts or tools you created + - installation, build, packaging, or configuration files + - binaries or compiled files Do NOT run `rm -rf .git`, `git init`, `git rm -rf .`, or `git reset --hard` From a5c9ef4c75008df9367f6d60723f6507f1954d92 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 22:34:16 +0000 Subject: [PATCH 09/11] mini_swe_agent_v2: split Submission steps into separate code blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single fenced bash block invited models to chain the three commands with '&&' or run them as one heredoc. That breaks the design — the env's COMPLETE sentinel detection only fires when 'echo COMPLETE...' is the first line of bash output, and chaining makes the diff happen on the same line as cat, which the env then doesn't capture as a submission. Split into three separate fenced bash blocks (write / verify / submit), with explicit 'SEPARATE bash tool call' instruction. --- .../agents/mini_swe_agent_v2/config/coop.yaml | 15 ++++++++++++++- .../agents/mini_swe_agent_v2/config/solo.yaml | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml index 666503b..7b919cc 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml @@ -162,11 +162,24 @@ agent: ## Submission - Edit files in place. Don't commit. When done: + Edit files in place. Don't commit. 
When done, run each of these as a + SEPARATE bash tool call (do not chain them with `&&`): + + Write the patch: ```bash git diff -- path/to/file1 path/to/file2 > patch.txt + ``` + + Verify it: + + ```bash cat patch.txt + ``` + + Submit: + + ```bash echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt ``` diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml index 594937c..fc9fa99 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml @@ -99,11 +99,24 @@ agent: ## Submission - Edit files in place. Don't commit. When done: + Edit files in place. Don't commit. When done, run each of these as a + SEPARATE bash tool call (do not chain them with `&&`): + + Write the patch: ```bash git diff -- path/to/file1 path/to/file2 > patch.txt + ``` + + Verify it: + + ```bash cat patch.txt + ``` + + Submit: + + ```bash echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt ``` From 46dc09d961a51baf8435df16ae2ae39f6775c6d6 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 22:36:20 +0000 Subject: [PATCH 10/11] mini_swe_agent_v2: reframe Submission around patch.txt as the contract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous wording opened with 'Edit files in place. Don't commit.' — which contradicts the existing Shared Git Remote section (telling agents they have a branch to coordinate on) and over-prescribes the workflow. Reframe: patch.txt is the artifact we evaluate, the agent writes whatever unified diff they want to submit to that file, however fits the workflow they used. The 'git diff -- file > patch.txt' recipe stays as 'one common way', not the only way. Agents are free to commit, fetch, merge, or do whatever else — the contract is only what ends up in patch.txt. 
--- .../agents/mini_swe_agent_v2/config/coop.yaml | 12 +++++++----- .../agents/mini_swe_agent_v2/config/solo.yaml | 13 ++++++++----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml index 7b919cc..9f34fc5 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/coop.yaml @@ -162,16 +162,17 @@ agent: ## Submission - Edit files in place. Don't commit. When done, run each of these as a - SEPARATE bash tool call (do not chain them with `&&`): + `patch.txt` is the artifact we evaluate — write whatever unified diff + you want to submit to that file, however it makes sense given how you + worked: - Write the patch: + Write the patch (one common way — `git diff` of your in-place edits): ```bash git diff -- path/to/file1 path/to/file2 > patch.txt ``` - Verify it: + Verify it contains what you intend: ```bash cat patch.txt @@ -183,7 +184,8 @@ agent: echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt ``` - The patch must contain only source files you modified. Exclude: + The patch must be a unified diff and contain only source files you + intentionally modified. Exclude: - reproduction or scratch test scripts you wrote - helper scripts or tools you created diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml index fc9fa99..b5658db 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml @@ -99,16 +99,18 @@ agent: ## Submission - Edit files in place. Don't commit. When done, run each of these as a - SEPARATE bash tool call (do not chain them with `&&`): + `patch.txt` is the artifact we evaluate — write whatever unified diff + you want to submit to that file, however it makes sense given how you + worked. 
Run each of these as a SEPARATE bash tool call (do not chain + them with `&&`): - Write the patch: + Write the patch (one common way — `git diff` of your in-place edits): ```bash git diff -- path/to/file1 path/to/file2 > patch.txt ``` - Verify it: + Verify it contains what you intend: ```bash cat patch.txt @@ -120,7 +122,8 @@ agent: echo COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT && cat patch.txt ``` - The patch must contain only source files you modified. Exclude: + The patch must be a unified diff and contain only source files you + intentionally modified. Exclude: - reproduction or scratch test scripts you wrote - helper scripts or tools you created From 6bf32cee38b74ed1138eb579545e19e66c010db6 Mon Sep 17 00:00:00 2001 From: Arpandeep Khatua Date: Thu, 30 Apr 2026 22:37:24 +0000 Subject: [PATCH 11/11] mini_swe_agent_v2: drop the 'SEPARATE bash tool call' line from solo.yaml too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the trim already applied to coop.yaml — the sentence was unnecessary scaffolding now that the three steps are in their own fenced blocks. --- src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml index b5658db..53fcada 100644 --- a/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml +++ b/src/cooperbench/agents/mini_swe_agent_v2/config/solo.yaml @@ -101,8 +101,7 @@ agent: `patch.txt` is the artifact we evaluate — write whatever unified diff you want to submit to that file, however it makes sense given how you - worked. Run each of these as a SEPARATE bash tool call (do not chain - them with `&&`): + worked: Write the patch (one common way — `git diff` of your in-place edits):