Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions verifiers/clients/renderer_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
concurrent rollouts tokenize in parallel instead of blocking the event loop.
"""

import asyncio
import json
import threading
from collections.abc import Mapping
Expand Down Expand Up @@ -56,6 +57,7 @@
UserMessage,
)
from verifiers.utils.client_utils import setup_openai_client
from verifiers.utils.multimodal import prepare_images_inplace

# Module-level bridge counters. Incremented by every RendererClient instance
# that tries to stitch a multi-turn prompt; callers (e.g. prime-rl's
Expand Down Expand Up @@ -472,6 +474,7 @@ def _get_renderer_or_pool(
async def to_native_prompt(
self, messages: Messages
) -> tuple[list[RendererMessage], dict]:
await asyncio.to_thread(prepare_images_inplace, messages)
return (
_attach_tool_call_names([_to_renderer_message(m) for m in messages]),
{},
Expand Down
4 changes: 2 additions & 2 deletions verifiers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ class ResponseTokens(CustomBaseModel):
completion_logprobs: list[float]
routed_experts: RoutedExpertsPayload | None = None
# Renderer-emitted multimodal sidecar (renderers.base.MultiModalData)
# carrying processed pixel_values / placeholder ranges per modality.
# carrying raw image descriptors / placeholder ranges per modality.
# Populated by the renderer client when the rollout went through a
# multimodal-aware renderer; ``None`` otherwise. Stored as ``Any`` to
# avoid a hard import dependency on ``renderers`` at this layer.
Expand Down Expand Up @@ -260,7 +260,7 @@ class TrajectoryStepTokens(TypedDict):
is_truncated: bool
routed_experts: RoutedExpertsPayload | None
# Renderer-emitted multimodal sidecar (renderers.base.MultiModalData)
# carrying processed pixel_values / placeholder ranges per modality.
# carrying raw image descriptors / placeholder ranges per modality.
# ``NotRequired`` because text-only rollouts (and non-renderer client
# types) never populate it.
multi_modal_data: NotRequired[Any]
Expand Down
90 changes: 90 additions & 0 deletions verifiers/utils/multimodal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Multimodal ingress helpers for renderer-backed training."""

from __future__ import annotations

from importlib import import_module
from pathlib import Path
from typing import Any


def _offload_image_url(url: object, image_dir: Path | None) -> str | None:
try:
offload_image_to_run_assets = getattr(
import_module("renderers.mm_store"),
"offload_image_to_run_assets",
)
except (
ImportError,
AttributeError,
) as exc: # pragma: no cover - dependency-version guard
raise RuntimeError(
"Multimodal training requires a renderers version with raw image "
"asset offload support."
) from exc

return offload_image_to_run_assets(url, image_dir=image_dir)


def _image_source_url(source: Any) -> object:
    """Read the ``url`` of an image source.

    Dict sources are read by key, anything else by attribute; a missing
    key/attribute yields ``None`` in both cases.
    """
    return (
        source.get("url")
        if isinstance(source, dict)
        else getattr(source, "url", None)
    )


def _set_image_source_url(source: Any, url: str) -> None:
    """Store ``url`` on an image source in place.

    Dict sources get their ``"url"`` key assigned; any other source gets its
    ``url`` attribute assigned.
    """
    if not isinstance(source, dict):
        source.url = url
        return
    source["url"] = url


def _require_file_image_url(source: Any) -> None:
    """Validate that ``source`` points at a ``file://`` URL.

    Raises:
        RuntimeError: If the source's URL is not a string or does not start
            with ``file://``.
    """
    candidate = _image_source_url(source)
    if isinstance(candidate, str) and candidate.startswith("file://"):
        return
    raise RuntimeError(
        "multimodal training requires image_url entries to be offloaded "
        "to file:// run image assets"
    )


def _prepare_image_source(source: Any, *, image_dir: Path | None) -> None:
    """Offload one image source and verify it ends up as a ``file://`` URL.

    The source's current URL is passed to the offload helper; a non-``None``
    result is written back onto the source. The final URL is then validated,
    so a source the offload left untouched must already be a ``file://`` URL.
    """
    current_url = _image_source_url(source)
    offloaded_url = _offload_image_url(current_url, image_dir)
    if offloaded_url is not None:
        _set_image_source_url(source, offloaded_url)
    _require_file_image_url(source)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inline image URLs always rejected

High Severity

The v1 train path always requires every image_url to become a file:// run asset after preparation, even when offload leaves a data:image/...;base64,... URL unchanged. That conflicts with the intended inline multimodal storage mode where base64 image URLs stay in the message payload, so inline training rollouts fail at request preparation instead of validating in place.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 0b1d73f. Configure here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed inline mode so this is irrelevant



def prepare_images_inplace(value: Any, *, image_dir: Path | None = None) -> None:
"""Offload image URLs reachable from ``value`` to run image assets.

Handles OpenAI wire dicts/lists and the pydantic v0/v1 message/content-part
models used by trajectories and traces.
"""
if isinstance(value, dict):
if value.get("type") == "image_url":
source = value.get("image_url")
if source is not None:
_prepare_image_source(source, image_dir=image_dir)
Comment on lines +64 to +67

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Medium utils/multimodal.py:64

prepare_images_inplace skips validation when an image_url part has a missing or None image_url field: lines 65-67 only call _prepare_image_source when source is not None, so the malformed part passes through unchecked. Downstream, ChatDialect.parse_request normalizes it to ImageUrlSource(url=""), forwarding a request with an empty image URL instead of rejecting it. Consider calling _require_file_image_url(value) (or otherwise validating) when source is None so malformed parts are rejected.

Suggested change
if value.get("type") == "image_url":
source = value.get("image_url")
if source is not None:
_prepare_image_source(source, image_dir=image_dir)
if value.get("type") == "image_url":
source = value.get("image_url")
if source is not None:
_prepare_image_source(source, image_dir=image_dir)
else:
_require_file_image_url(value)
🚀 Reply "fix it for me" or copy this AI Prompt for your agent:
In file @verifiers/utils/multimodal.py around lines 64-67:

`prepare_images_inplace` skips validation when an `image_url` part has a missing or `None` `image_url` field: lines 65-67 only call `_prepare_image_source` when `source is not None`, so the malformed part passes through unchecked. Downstream, `ChatDialect.parse_request` normalizes it to `ImageUrlSource(url="")`, forwarding a request with an empty image URL instead of rejecting it. Consider calling `_require_file_image_url(value)` (or otherwise validating) when `source` is `None` so malformed parts are rejected.

for child in value.values():
prepare_images_inplace(child, image_dir=image_dir)
return

if isinstance(value, list):
for child in value:
prepare_images_inplace(child, image_dir=image_dir)
return

if isinstance(value, tuple):
for child in value:
prepare_images_inplace(child, image_dir=image_dir)
return

if getattr(value, "type", None) == "image_url":
source = getattr(value, "image_url", None)
if source is not None:
_prepare_image_source(source, image_dir=image_dir)
return

content = getattr(value, "content", None)
if isinstance(content, (list, tuple)):
prepare_images_inplace(content, image_dir=image_dir)
17 changes: 11 additions & 6 deletions verifiers/v1/ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,16 @@ end to end: each surviving context window is just another root→leaf path.

`Trace.to_record()` (`trace.py`) is the JSON record dump (`model_dump(mode="json")`) for
`results.jsonl` / W&B tables, minus the per-node training tensors (`MessageNode.multi_modal_data`,
`routed_experts`, via `_NODE_DUMP_EXCLUDE`): those hold raw numpy bytes that can't round-trip JSON
(the dump raises `UnicodeDecodeError` on real expert ids) and bloat every line. Computed views
`routed_experts`, via `_NODE_DUMP_EXCLUDE`): routed-expert tensors hold raw numpy bytes that can't
round-trip JSON (the dump raises `UnicodeDecodeError` on real expert ids), and multimodal
descriptors are trainer sidecars rather than rollout records. Computed views
(`reward`, `branches`, `num_turns`, per-span `duration`) are pydantic properties, so they're never
serialized and recompute on load; `state` is excluded. The tensors still reach the trainer over the
env-server *wire*, which uses msgpack `model_dump(mode="python")` and carries them as raw `bin` bytes
(not base64) via the field serializers on `MessageNode` (`graph.py`); only the JSON record strips them.
(not base64) via the field serializers on `MessageNode` (`graph.py`); only the JSON record strips
them. Multimodal training uses raw run-image assets: the train client rewrites base64 image parts to
`file://` refs before tracing, and `MessageNode.multi_modal_data` carries lightweight renderer
descriptors (hashes, placeholder ranges, image metadata/refs) rather than image processor outputs.

### Branching: message-level vs renderer-level, and the token invariant

Expand Down Expand Up @@ -111,9 +115,10 @@ The renderer client avoids the break entirely when it can: instead of re-renderi
each turn, the train client (`clients/train.py`) calls `renderer.bridge_to_next_turn(...)`, which
keeps the prior `prompt_ids + completion_ids` **verbatim** and only renders the new tail. Verbatim
prior ⇒ the stored prefix matches token-for-token ⇒ no fork, one linear branch, invariant intact.
The token-identity check in `commit` is the backstop for when the bridge can't apply (the renderer
returns `None`, multimodal, the eval relay): the break still surfaces as honest branches rather than
silent corruption.
For multimodal renderers, the train client also passes the reusable prefix's `multi_modal_data` so
prior image placeholders and descriptors remain aligned. The token-identity check in `commit` is the
backstop for when the bridge can't apply (the renderer returns `None`, the eval relay): the break
still surfaces as honest branches rather than silent corruption.

## Model access — interception, dialects, clients

Expand Down
10 changes: 6 additions & 4 deletions verifiers/v1/cli/dashboard/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,12 @@ def _breakdown(done: list[Trace]) -> Table | None:
names.extend(n for n in getattr(trace, source) if n not in names)
if not names:
continue
segments = [
f"{name} {format_mean(done, lambda t, n=name, s=source: getattr(t, s).get(n, 0.0))}"
for name in names
]
segments = []
for name in names:
value = format_mean(
done, lambda t, n=name, s=source: getattr(t, s).get(n, 0.0)
)
segments.append(f"{name} {value}")
grid.add_row(label, " · ".join(segments))

# Resource use over every completed rollout (errored ones still spent tokens/time): tokens and
Expand Down
13 changes: 13 additions & 0 deletions verifiers/v1/clients/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,19 @@ class RelayReply:


class Client(ABC):
async def prepare_request_body(self, dialect: Dialect, body: dict) -> dict:
    """Normalize a provider request before the interception server parses/traces it.

    Relay clients keep the request verbatim. Training clients may rewrite heavy
    in-process payloads (for example base64 images) into stable run-asset refs so the
    trace, renderer, and trainer all see the same cheap message content.

    Args:
        dialect: The dialect of the incoming request; unused by this default
            implementation.
        body: The provider request body.

    Returns:
        The request body to parse and trace. This default implementation returns
        ``body`` itself, unchanged.
    """
    return body

async def prepare_messages(self, dialect: Dialect, messages: list) -> list:
    """Normalize typed simulator messages before adding them to the wire body/trace.

    Args:
        dialect: The dialect the messages belong to; unused by this default
            implementation.
        messages: The typed messages to normalize.

    Returns:
        The messages to use. This default implementation returns ``messages``
        itself, unchanged.
    """
    return messages

@abstractmethod
async def get_response(
self,
Expand Down
40 changes: 22 additions & 18 deletions verifiers/v1/clients/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
needs a running vLLM engine.
"""

import asyncio
import json
from collections.abc import Mapping
from typing import Any
Expand All @@ -16,6 +17,7 @@
from renderers import RenderedTokens
from renderers import OverlongPromptError as RendererOverlongPromptError
from renderers import RendererConfig
from renderers.base import is_multimodal

from verifiers.v1.clients.client import SESSION_ID_HEADER, Client
from verifiers.v1.dialects import FINISH_REASONS, ChatDialect, Dialect, parse_tools
Expand All @@ -32,6 +34,7 @@
TurnTokens,
Usage,
)
from verifiers.utils.multimodal import prepare_images_inplace


def tool_to_wire(tool: Tool) -> dict:
Expand Down Expand Up @@ -167,16 +170,6 @@ def _is_valid_incremental_tail(messages: list[dict[str, Any]]) -> bool:
return all(role == "tool" for role in roles)


def _has_multimodal_content(messages) -> bool:
for message in messages:
content = getattr(message, "content", None)
if not isinstance(content, list):
continue
if any(getattr(part, "type", None) == "image_url" for part in content):
return True
return False


class TrainClient(Client):
"""Renders prompts to token ids and calls a vLLM `/inference/v1/generate` engine."""

Expand Down Expand Up @@ -213,6 +206,16 @@ def _renderer_pool(
)
return self._pool

async def prepare_request_body(self, dialect: Dialect, body: dict) -> dict:
    """Offload images in a chat-dialect request body before it is parsed.

    For ``ChatDialect`` requests, ``prepare_images_inplace`` is run over the
    body via ``asyncio.to_thread``; requests in any other dialect are returned
    untouched. The same ``body`` object is returned in both cases.
    """
    if not isinstance(dialect, ChatDialect):
        return body
    await asyncio.to_thread(prepare_images_inplace, body)
    return body
Comment thread
macroscopeapp[bot] marked this conversation as resolved.

async def prepare_messages(self, dialect: Dialect, messages: list) -> list:
    """Offload images in chat-dialect messages before they are traced.

    For ``ChatDialect`` messages, ``prepare_images_inplace`` is run over the
    list via ``asyncio.to_thread``; messages in any other dialect are returned
    untouched. The same ``messages`` list is returned in both cases.
    """
    if not isinstance(dialect, ChatDialect):
        return messages
    await asyncio.to_thread(prepare_images_inplace, messages)
    return messages

async def get_response(
self,
dialect: Dialect,
Expand Down Expand Up @@ -263,23 +266,24 @@ async def get_response(
)
bridged_turn: PendingTurn | None = None

# Only build the (O(context)) previous-turn token ids once the cheap guards pass — a
# multimodal prompt or a tail that isn't a clean `[tool*, user?]` extension can't bridge.
can_bridge = (
turn is not None
and not _has_multimodal_content(prompt)
and _is_valid_incremental_tail(wire_messages)
)
# Only build the (O(context)) previous-turn token ids once the cheap guards pass: a
# tail that isn't a clean `[tool*, user?]` extension can't bridge.
can_bridge = turn is not None and _is_valid_incremental_tail(wire_messages)
previous_ids = turn.previous_token_ids() if can_bridge else None
if previous_ids is not None:
previous_prompt_ids, previous_completion_ids = previous_ids

def bridge():
kwargs: dict[str, Any] = {"tools": wire_tools}
if is_multimodal(renderer):
kwargs["previous_multi_modal_data"] = (
turn.previous_multi_modal_data()
)
return renderer.bridge_to_next_turn(
previous_prompt_ids,
previous_completion_ids,
wire_messages,
tools=wire_tools,
**kwargs,
)

bridged = await _maybe_offload(renderer, bridge)
Expand Down
Loading
Loading