PrimeIntellect-ai · eligotts · Jun 18, 2026 · Jun 20, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py
@@ -8,6 +8,7 @@
 concurrent rollouts tokenize in parallel instead of blocking the event loop.
 """
 
+import asyncio
 import json
 import threading
 from collections.abc import Mapping
@@ -56,6 +57,7 @@
     UserMessage,
 )
 from verifiers.utils.client_utils import setup_openai_client
+from verifiers.utils.multimodal import prepare_images_inplace
 
 # Module-level bridge counters. Incremented by every RendererClient instance
 # that tries to stitch a multi-turn prompt; callers (e.g. prime-rl's
@@ -472,6 +474,7 @@ def _get_renderer_or_pool(
     async def to_native_prompt(
         self, messages: Messages
     ) -> tuple[list[RendererMessage], dict]:
+        await asyncio.to_thread(prepare_images_inplace, messages)
         return (
             _attach_tool_call_names([_to_renderer_message(m) for m in messages]),
             {},

diff --git a/verifiers/types.py b/verifiers/types.py
@@ -213,7 +213,7 @@ class ResponseTokens(CustomBaseModel):
     completion_logprobs: list[float]
     routed_experts: RoutedExpertsPayload | None = None
     # Renderer-emitted multimodal sidecar (renderers.base.MultiModalData)
-    # carrying processed pixel_values / placeholder ranges per modality.
+    # carrying raw image descriptors / placeholder ranges per modality.
     # Populated by the renderer client when the rollout went through a
     # multimodal-aware renderer; ``None`` otherwise. Stored as ``Any`` to
     # avoid a hard import dependency on ``renderers`` at this layer.
@@ -260,7 +260,7 @@ class TrajectoryStepTokens(TypedDict):
     is_truncated: bool
     routed_experts: RoutedExpertsPayload | None
     # Renderer-emitted multimodal sidecar (renderers.base.MultiModalData)
-    # carrying processed pixel_values / placeholder ranges per modality.
+    # carrying raw image descriptors / placeholder ranges per modality.
     # ``NotRequired`` because text-only rollouts (and non-renderer client
     # types) never populate it.
     multi_modal_data: NotRequired[Any]

diff --git a/verifiers/utils/multimodal.py b/verifiers/utils/multimodal.py
@@ -0,0 +1,90 @@
+"""Multimodal ingress helpers for renderer-backed training."""
+
+from __future__ import annotations
+
+from importlib import import_module
+from pathlib import Path
+from typing import Any
+
+
+def _offload_image_url(url: object, image_dir: Path | None) -> str | None:
+    """Pass ``url`` to ``renderers.mm_store.offload_image_to_run_assets``.
+
+    The helper is looked up through ``import_module`` on every call.
+
+    Returns:
+        Whatever the renderers helper returns for ``url`` and ``image_dir``.
+
+    Raises:
+        RuntimeError: If ``renderers.mm_store`` cannot be imported or does not
+            expose ``offload_image_to_run_assets``.
+    """
+    try:
+        mm_store = import_module("renderers.mm_store")
+        offload = mm_store.offload_image_to_run_assets
+    except (
+        ImportError,
+        AttributeError,
+    ) as exc:  # pragma: no cover - dependency-version guard
+        raise RuntimeError(
+            "Multimodal training requires a renderers version with raw image "
+            "asset offload support."
+        ) from exc
+
+    return offload(url, image_dir=image_dir)
+
+
+def _image_source_url(source: Any) -> object:
+    """Return the ``url`` of an image source, or ``None`` when it has none.
+
+    Dict sources are read by key; any other object is read by attribute.
+    """
+    if not isinstance(source, dict):
+        return getattr(source, "url", None)
+    return source.get("url")
+
+
+def _set_image_source_url(source: Any, url: str) -> None:
+    """Store ``url`` on an image source: by key for dicts, by attribute otherwise."""
+    if isinstance(source, dict):
+        source["url"] = url
+        return
+    source.url = url
+
+
+def _require_file_image_url(source: Any) -> None:
+    """Check that the source's URL is a string starting with ``file://``.
+
+    Raises:
+        RuntimeError: If the URL is missing, not a string, or uses any other
+            prefix.
+    """
+    url = _image_source_url(source)
+    if isinstance(url, str) and url.startswith("file://"):
+        return
+    raise RuntimeError(
+        "multimodal training requires image_url entries to be offloaded "
+        "to file:// run image assets"
+    )
+
+
+def _prepare_image_source(source: Any, *, image_dir: Path | None) -> None:
+    """Offload one image source, then verify it carries a ``file://`` URL.
+
+    The source's current URL is passed to the offload helper. A non-``None``
+    result is written back onto the source in place; a ``None`` result leaves
+    the source untouched. Either way the final URL is validated.
+
+    Raises:
+        RuntimeError: If the offload helper is unavailable or the final URL is
+            not a ``file://`` URL.
+    """
+    current_url = _image_source_url(source)
+    offloaded_url = _offload_image_url(current_url, image_dir)
+    if offloaded_url is not None:
+        _set_image_source_url(source, offloaded_url)
+    _require_file_image_url(source)
+
+
+def prepare_images_inplace(value: Any, *, image_dir: Path | None = None) -> None:
+    """Offload image URLs reachable from ``value`` to run image assets.
+
+    Handles OpenAI wire dicts/lists and the pydantic v0/v1 message/content-part
+    models used by trajectories and traces.
+
+    Args:
+        value: A dict, list, tuple, or object to walk. Image sources found
+            along the way are rewritten in place.
+        image_dir: Forwarded unchanged to the offload helper.
+
+    Raises:
+        RuntimeError: If an ``image_url`` part does not end up with a
+            ``file://`` URL, or the offload helper is unavailable.
+    """
+    if isinstance(value, dict):
+        # Each image part is checked exactly once per visit.
+        if value.get("type") == "image_url":
+            source = value.get("image_url")
+            if source is not None:
+                _prepare_image_source(source, image_dir=image_dir)
+            else:
+                # No nested source: the part itself must carry a file:// url.
+                _require_file_image_url(value)
+        # Keep walking so nested values (e.g. a content list) are visited too.
+        for child in value.values():
+            prepare_images_inplace(child, image_dir=image_dir)
+        return
+
+    if isinstance(value, (list, tuple)):
+        for child in value:
+            prepare_images_inplace(child, image_dir=image_dir)
+        return
+
+    # Attribute-based path for non-container objects.
+    if getattr(value, "type", None) == "image_url":
+        source = getattr(value, "image_url", None)
+        if source is not None:
+            _prepare_image_source(source, image_dir=image_dir)
+        return
+
+    content = getattr(value, "content", None)
+    if isinstance(content, (list, tuple)):
+        prepare_images_inplace(content, image_dir=image_dir)
diff --git a/verifiers/v1/ARCHITECTURE.md b/verifiers/v1/ARCHITECTURE.md
@@ -70,12 +70,16 @@ end to end: each surviving context window is just another root→leaf path.
 
 `Trace.to_record()` (`trace.py`) is the JSON record dump (`model_dump(mode="json")`) for
 `results.jsonl` / W&B tables, minus the per-node training tensors (`MessageNode.multi_modal_data`,
-`routed_experts`, via `_NODE_DUMP_EXCLUDE`): those hold raw numpy bytes that can't round-trip JSON
-(the dump raises `UnicodeDecodeError` on real expert ids) and bloat every line. Computed views
+`routed_experts`, via `_NODE_DUMP_EXCLUDE`): routed-expert tensors hold raw numpy bytes that can't
+round-trip JSON (the dump raises `UnicodeDecodeError` on real expert ids), and multimodal
+descriptors are trainer sidecars rather than rollout records. Computed views
 (`reward`, `branches`, `num_turns`, per-span `duration`) are pydantic properties, so they're never
 serialized and recompute on load; `state` is excluded. The tensors still reach the trainer over the
 env-server *wire*, which uses msgpack `model_dump(mode="python")` and carries them as raw `bin` bytes
-(not base64) via the field serializers on `MessageNode` (`graph.py`); only the JSON record strips them.
+(not base64) via the field serializers on `MessageNode` (`graph.py`); only the JSON record strips
+them. Multimodal training uses raw run-image assets: the train client rewrites base64 image parts to
+`file://` refs before tracing, and `MessageNode.multi_modal_data` carries lightweight renderer
+descriptors (hashes, placeholder ranges, image metadata/refs) rather than image processor outputs.
 
 ### Branching: message-level vs renderer-level, and the token invariant
 
@@ -111,9 +115,10 @@ The renderer client avoids the break entirely when it can: instead of re-renderi
 each turn, the train client (`clients/train.py`) calls `renderer.bridge_to_next_turn(...)`, which
 keeps the prior `prompt_ids + completion_ids` **verbatim** and only renders the new tail. Verbatim
 prior ⇒ the stored prefix matches token-for-token ⇒ no fork, one linear branch, invariant intact.
-The token-identity check in `commit` is the backstop for when the bridge can't apply (the renderer
-returns `None`, multimodal, the eval relay): the break still surfaces as honest branches rather than
-silent corruption.
+For multimodal renderers, the train client also passes the reusable prefix's `multi_modal_data` so
+prior image placeholders and descriptors remain aligned. The token-identity check in `commit` is the
+backstop for when the bridge can't apply (the renderer returns `None`, the eval relay): the break
+still surfaces as honest branches rather than silent corruption.
 
 ## Model access — interception, dialects, clients
 

diff --git a/verifiers/v1/cli/dashboard/eval.py b/verifiers/v1/cli/dashboard/eval.py
@@ -190,10 +190,12 @@ def _breakdown(done: list[Trace]) -> Table | None:
             names.extend(n for n in getattr(trace, source) if n not in names)
         if not names:
             continue
-        segments = [
-            f"{name} {format_mean(done, lambda t, n=name, s=source: getattr(t, s).get(n, 0.0))}"
-            for name in names
-        ]
+        segments = []
+        for name in names:
+            value = format_mean(
+                done, lambda t, n=name, s=source: getattr(t, s).get(n, 0.0)
+            )
+            segments.append(f"{name} {value}")
         grid.add_row(label, "  ·  ".join(segments))
 
     # Resource use over every completed rollout (errored ones still spent tokens/time): tokens and

diff --git a/verifiers/v1/clients/client.py b/verifiers/v1/clients/client.py
@@ -33,6 +33,19 @@ class RelayReply:
 
 
 class Client(ABC):
+    async def prepare_request_body(self, dialect: Dialect, body: dict) -> dict:
+        """Normalize a provider request before the interception server parses/traces it.
+
+        Relay clients keep the request verbatim. Training clients may rewrite heavy
+        in-process payloads (for example base64 images) into stable run-asset refs so the
+        trace, renderer, and trainer all see the same cheap message content.
+
+        This base implementation ignores ``dialect`` and returns ``body`` unchanged;
+        subclasses override it to rewrite the request.
+        """
+        return body
+
+    async def prepare_messages(self, dialect: Dialect, messages: list) -> list:
+        """Normalize typed simulator messages before adding them to the wire body/trace.
+
+        This base implementation ignores ``dialect`` and returns ``messages`` unchanged.
+        """
+        return messages
+
     @abstractmethod
     async def get_response(
         self,

diff --git a/verifiers/v1/clients/train.py b/verifiers/v1/clients/train.py
@@ -8,6 +8,7 @@
 needs a running vLLM engine.
 """
 
+import asyncio
 import json
 from collections.abc import Mapping
 from typing import Any
@@ -16,6 +17,7 @@
 from renderers import RenderedTokens
 from renderers import OverlongPromptError as RendererOverlongPromptError
 from renderers import RendererConfig
+from renderers.base import is_multimodal
 
 from verifiers.v1.clients.client import SESSION_ID_HEADER, Client
 from verifiers.v1.dialects import FINISH_REASONS, ChatDialect, Dialect, parse_tools
@@ -32,6 +34,7 @@
     TurnTokens,
     Usage,
 )
+from verifiers.utils.multimodal import prepare_images_inplace
 
 
 def tool_to_wire(tool: Tool) -> dict:
@@ -167,16 +170,6 @@ def _is_valid_incremental_tail(messages: list[dict[str, Any]]) -> bool:
     return all(role == "tool" for role in roles)
 
 
-def _has_multimodal_content(messages) -> bool:
-    for message in messages:
-        content = getattr(message, "content", None)
-        if not isinstance(content, list):
-            continue
-        if any(getattr(part, "type", None) == "image_url" for part in content):
-            return True
-    return False
-
-
 class TrainClient(Client):
     """Renders prompts to token ids and calls a vLLM `/inference/v1/generate` engine."""
 
@@ -213,6 +206,16 @@ def _renderer_pool(
             )
         return self._pool
 
+    async def prepare_request_body(self, dialect: Dialect, body: dict) -> dict:
+        """Offload images in a chat request body and return that same dict.
+
+        Only ``ChatDialect`` requests are touched; any other dialect gets the
+        body back untouched. The in-place rewrite runs through
+        ``asyncio.to_thread`` so it does not block the event loop.
+        """
+        if not isinstance(dialect, ChatDialect):
+            return body
+        await asyncio.to_thread(prepare_images_inplace, body)
+        return body
+
+    async def prepare_messages(self, dialect: Dialect, messages: list) -> list:
+        """Offload images in chat messages and return that same list.
+
+        Only ``ChatDialect`` messages are touched; any other dialect gets the
+        list back untouched. The in-place rewrite runs through
+        ``asyncio.to_thread`` so it does not block the event loop.
+        """
+        if not isinstance(dialect, ChatDialect):
+            return messages
+        await asyncio.to_thread(prepare_images_inplace, messages)
+        return messages
+
     async def get_response(
         self,
         dialect: Dialect,
@@ -263,23 +266,24 @@ async def get_response(
         )
         bridged_turn: PendingTurn | None = None
 
-        # Only build the (O(context)) previous-turn token ids once the cheap guards pass — a
-        # multimodal prompt or a tail that isn't a clean `[tool*, user?]` extension can't bridge.
-        can_bridge = (
-            turn is not None
-            and not _has_multimodal_content(prompt)
-            and _is_valid_incremental_tail(wire_messages)
-        )
+        # Only build the (O(context)) previous-turn token ids once the cheap guards pass: a
+        # tail that isn't a clean `[tool*, user?]` extension can't bridge.
+        can_bridge = turn is not None and _is_valid_incremental_tail(wire_messages)
         previous_ids = turn.previous_token_ids() if can_bridge else None
         if previous_ids is not None:
             previous_prompt_ids, previous_completion_ids = previous_ids
 
             def bridge():
+                kwargs: dict[str, Any] = {"tools": wire_tools}
+                if is_multimodal(renderer):
+                    kwargs["previous_multi_modal_data"] = (
+                        turn.previous_multi_modal_data()
+                    )
                 return renderer.bridge_to_next_turn(
                     previous_prompt_ids,
                     previous_completion_ids,
                     wire_messages,
-                    tools=wire_tools,
+                    **kwargs,
                 )
 
             bridged = await _maybe_offload(renderer, bridge)